In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import re
import torch
from torch.utils.data import Dataset, random_split, DataLoader
import numpy as np
import os

The problem asks us to identify the spatial distribution of porosities given a certain density. It can be modeled as learning the conditional distribution $p(x \mid c)$, where $c$ is the density of the microstructure and $x$ is the 27,000-element ($30^3$) vector of porosities.

In [None]:
# Root directory of the porosity project. Set the POROSITY_ROOT environment
# variable to run the notebook on another machine; when it is unset the
# original cluster location is used.
path = os.environ.get('POROSITY_ROOT', '/gpfs/data/ssa/users/d602145/Workspace/scratch/Porosity/ETH/')
# Later cells build the data folder from os.getcwd() and import the local
# `Lib` package, so the working directory is moved to the project root.
os.chdir(path)

We start with a standard exploratory data analysis before deciding which strategy to apply to the problem.


First, we build a class to perform standard operations on the samples. Each sample is treated as a 3D distribution that we can plot in 3D. This is useful for understanding how the spatial distribution of porosities evolves. We want to check whether clusters appear as the porosity density increases. That would imply that the porosities are not drawn from a purely random distribution, but that there is a hidden criterion behind them.

In [None]:
from Lib.Data import PorosityDistribution



We then build two functions to extract the data. The first one creates a dictionary of samples, where each sample is a microstructure. The second one creates a dataframe with all the porosities of all the samples. These two will be used for different statistics.

In [None]:
from Lib.Data import extract_microstructures,extract_porosities_points

In [None]:
# Folder holding the sample files, resolved relative to the current working directory.
sample_path = f"{os.getcwd()}/Job_Assignment_Data/Job_Assignment_Data/"

In [None]:
# Per-sample microstructure objects; the cells below look them up by sample number.
extracted_distributions = extract_microstructures(sample_path)
# Table of porosity points for all samples, plus the table of sample densities.
# NOTE(review): the exact schemas are defined in Lib.Data — confirm there.
extracted_porosities, density_set = extract_porosities_points(sample_path)

Let's check the microstructures first

In [None]:
extracted_distributions[0].plot_porosity_distribution()

In [None]:
extracted_distributions[0].plot_porosity_histogram()

In [None]:
extracted_distributions[7].plot_porosity_distribution()

In [None]:
extracted_distributions[7].plot_porosity_histogram()

In [None]:

extracted_distributions[15].plot_porosity_distribution()

In [None]:
extracted_distributions[15].plot_porosity_histogram()

In [None]:
por = extracted_distributions[0].plot_porosity_pairgrid()

As we can observe, the density is proportional to the number of porosities within the structure. Clusters appear as the radius of the porosities gets bigger. The porosities are not uniformly distributed within a cluster.

In [None]:
extracted_distributions[0].as_dataframe()

In [None]:
extracted_distributions[10].as_dataframe()

In [None]:
extracted_distributions[15].as_dataframe()

As specified in the PDF, the grid is equally spaced, and the porosities are given on the same constant 3D grid for all samples.

Let's check the macro trend of the density against the mean porosity within the samples.

In [None]:
import plotly.express as px

# One (sample number, mean porosity, mean density) record per sample; each
# sample's dataframe is built exactly once.
per_sample_stats = [
    (sample_id, frame['porosity'].mean(), frame['density'].mean())
    for sample_id, frame in (
        (key, distribution.as_dataframe())
        for key, distribution in extracted_distributions.items()
    )
]
sample_numbers = [record[0] for record in per_sample_stats]  # shown on hover
porosity_means = [record[1] for record in per_sample_stats]
density_means = [record[2] for record in per_sample_stats]

# Scatter of mean density (x) against mean porosity (y), one marker per sample.
fig = px.scatter(
    x=density_means,
    y=porosity_means,
    hover_data={'Sample Number': sample_numbers},
    title="Scatter Plot of Density Mean vs. Porosity Mean",
)
fig.update_xaxes(title_text="Density Mean")
fig.update_yaxes(title_text="Porosity Mean")
fig.show()

We can see a clear trend (it looks like a quadratic or exponential curve with appropriate scaling). It is interesting that there is more dispersion at higher density values. I would expect that several structures can be associated with a single density value, i.e. a one-to-many problem.


In [None]:
extracted_porosities

It could be interesting to assess whether several samples share the same density value and to check whether their microstructures differ.

In [None]:
density_set.value_counts().value_counts()

In [None]:
density_set.value_counts()

In [None]:
density_set[density_set['density']==0.192]

In [None]:
extracted_distributions[225].plot_porosity_distribution()

In [None]:
extracted_distributions[225].plot_porosity_histogram()

In [None]:
extracted_distributions[208].plot_porosity_distribution()

In [None]:
extracted_distributions[208].plot_porosity_histogram()

In [None]:
density_set[density_set['density']==0.647]

In [None]:
extracted_distributions[197].plot_porosity_histogram()
extracted_distributions[277].plot_porosity_histogram()
extracted_distributions[295].plot_porosity_histogram()

In [None]:
extracted_distributions[24].plot_porosity_histogram()
extracted_distributions[229].plot_porosity_histogram()
extracted_distributions[115].plot_porosity_histogram()

In [None]:
extracted_porosities.shape

In [None]:
# Re-run both extractions with keep_density_doubles=False.
# NOTE(review): presumably this drops samples that share a density value — confirm in Lib.Data.
filt_extracted_distributions = extract_microstructures(sample_path,keep_density_doubles=False)
# `density_set` is rebound here and replaces the table returned by the earlier, unfiltered call.
filt_extracted_porosities, density_set = extract_porosities_points(sample_path,keep_density_doubles=False)

In [None]:
filt_extracted_porosities.shape

In [None]:
# Order the filtered porosity table by density. Rebinding the name (instead of
# sorting in place) keeps the cell safe to re-run and avoids mutating an object
# that earlier cells already displayed.
filt_extracted_porosities = filt_extracted_porosities.sort_values(by='density')
# Number of rows per density value. `value_counts` has to be *called*: without
# the parentheses the cell only displays the bound method, not the counts.
filt_extracted_porosities['density'].value_counts()

In [None]:
# Keep the density table ordered by ascending density for the cells that follow.
density_set = density_set.sort_values(by='density')

In [None]:
density_set

In [None]:
px.histogram(density_set,nbins=50)

In [None]:
filt_extracted_porosities.head()

In [None]:
filt_extracted_porosities.groupby(by='density').count()

In [None]:
px.scatter(filt_extracted_porosities.groupby(by='density').count())

In [None]:
filt_extracted_porosities

In [None]:
from Lib.Tools import conditioned_random_sampling

In [None]:
# Resample the filtered porosity table with the project helper.
# NOTE(review): presumably n_samples=3000 is the number of rows drawn per density
# value and the draw is random (no seed is passed here) — confirm in Lib.Tools.
balanced_porosities = conditioned_random_sampling(filt_extracted_porosities,n_samples=3000)

In [None]:
balanced_porosities.shape

In [None]:
px.scatter(balanced_porosities.groupby(by='density').count())

In [None]:
balanced_porosities['density'].unique()

In [None]:
# Compare the first three columns of the balanced and the filtered tables
# for a single density value (balanced first, then filtered).
target_density = 0.975
for table in (balanced_porosities, filt_extracted_porosities):
    selection = table[table['density'] == target_density].iloc[:, :3]
    fig = px.histogram(selection, facet_col='variable', histnorm='probability', nbins=100)
    fig.show()

In [None]:
# Compare the first three columns of the balanced and the filtered tables
# for a single density value (balanced first, then filtered).
target_density = 0.536
for table in (balanced_porosities, filt_extracted_porosities):
    selection = table[table['density'] == target_density].iloc[:, :3]
    fig = px.histogram(selection, facet_col='variable', histnorm='probability', nbins=100)
    fig.show()

In [None]:
# Compare the first three columns of the balanced and the filtered tables
# for a single density value (balanced first, then filtered).
target_density = 0.231
for table in (balanced_porosities, filt_extracted_porosities):
    selection = table[table['density'] == target_density].iloc[:, :3]
    fig = px.histogram(selection, facet_col='variable', histnorm='probability', nbins=100)
    fig.show()

In [None]:
balanced_porosities.head()

In [None]:
# Pre-allocate the design matrices: one row per sample, 27000 porosity values
# per row, and a single density value per sample.
N_SAMPLES, N_GRID_POINTS = 512, 27000
porosities = np.zeros(shape=(N_SAMPLES, N_GRID_POINTS))
density = np.zeros(shape=(N_SAMPLES, 1))

In [None]:
# Fill both arrays row by row; samples are looked up by the integer keys 0..n-1.
for row in range(len(porosities)):
    sample = extracted_distributions[row]
    porosities[row, :] = sample.as_dataframe()['porosity'].values
    density[row, 0] = sample.density

In [None]:
porosities

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep. It is reused further down to name
# the columns of the reduced dataframe, so it is the single source of truth.
pca_comp = 500

# Pass `pca_comp` instead of repeating the literal, so that changing the
# constant cannot desynchronise the PCA from the code that consumes it.
pca = PCA(n_components=pca_comp)

# Fit on the (samples x grid points) porosity matrix.
pca.fit(porosities)

In [None]:
pca.explained_variance_ratio_.cumsum()

In [None]:
px.scatter(pca.explained_variance_ratio_)

In [None]:
# Project every sample onto the principal components fitted above.
reduced = pca.transform(porosities)

In [None]:
# One integer-named column per principal component.
df_reduced = pd.DataFrame(data=reduced, columns=list(range(pca_comp)))

In [None]:
# Append the per-sample density (single column of the (n, 1) array) as the last column.
df_reduced['density'] = density[:, 0]

In [None]:
df_reduced.head()

In [None]:
# Pairwise correlation between all PCA components and the density column.
correlation = df_reduced.corr()
sns.heatmap(correlation)

In [None]:

# Pair plot restricted to the first 200 rows and every 50th column to keep it readable.
pair_subset = df_reduced.iloc[:200, ::50]
g = sns.PairGrid(pair_subset, diag_sharey=False)
g.map_upper(sns.scatterplot, s=5)   # scatter above the diagonal
g.map_lower(sns.kdeplot)            # density contours below the diagonal
g.map_diag(sns.histplot, bins=30)   # marginal histograms on the diagonal

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of the reduced table (PCA components and density).
st_sc = StandardScaler()
reduced_values = df_reduced.to_numpy()
sc_df = st_sc.fit_transform(reduced_values)



In [None]:
# Wrap the scaled array back into a dataframe with the original column labels.
sc_df = pd.DataFrame(data=sc_df, columns=df_reduced.columns)

In [None]:
# Same pair plot as before, now on the standardised columns.
scaled_subset = sc_df.iloc[:200, ::50]
g = sns.PairGrid(scaled_subset, diag_sharey=False)
g.map_upper(sns.scatterplot, s=5)   # scatter above the diagonal
g.map_lower(sns.kdeplot)            # density contours below the diagonal
g.map_diag(sns.histplot, bins=30)   # marginal histograms on the diagonal

In [None]:
train_split = 0.8
# Row index separating the training rows (first 80 %) from the test rows.
split_idx = int(train_split * df_reduced.shape[0])
train_rows = df_reduced.iloc[:split_idx]
test_rows = df_reduced.iloc[split_idx:]

# Inputs (X) are the last column, i.e. the density, as an (n, 1) array;
# targets (y) are all the remaining columns, i.e. the PCA components.
X_train = train_rows.iloc[:, -1].values.reshape(-1, 1)
y_train = train_rows.iloc[:, :-1].values
X_test = test_rows.iloc[:, -1].values.reshape(-1, 1)
y_test = test_rows.iloc[:, :-1].values

In [None]:
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Constant-scaled RBF kernel; the length scale is optimised within the given bounds.
kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
# The optimiser restarts draw random initial hyper-parameters; a fixed
# random_state makes the fitted kernel reproducible across notebook runs.
gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9, random_state=0)
# Density (X) -> PCA components (y): a multi-output regression.
gaussian_process.fit(X_train, y_train)
# Display the kernel with its optimised hyper-parameters.
gaussian_process.kernel_

In [None]:
# Predictive mean and standard deviation evaluated on the *training* inputs
# (X_test is not used here), so this inspects the fit rather than generalisation.
mean_prediction, std_prediction = gaussian_process.predict(X_train, return_std=True)


In [None]:
mean_prediction

In [None]:
import numpy as np

In [None]:
# Buffer with one row per prediction and three columns, filled in the next cell.
results = np.ones(shape=(len(mean_prediction), 3))

In [None]:
# Index of the PCA component (target column) inspected in this table.
component = 10
results[:,0] = mean_prediction[:,component]
# Some scikit-learn versions return a per-target std of shape (n, n_targets),
# others a single (n,) vector for multi-output fits; support both layouts.
std_values = np.asarray(std_prediction)
results[:,1] = std_values[:,component] if std_values.ndim == 2 else std_values
# X_train has shape (n, 1); flatten it first, because NumPy cannot broadcast
# an (n, 1) array into the 1-D column slice results[:, 2].
results[:,2] = X_train.ravel()