In [4]:
import sys
sys.path.append('..')
from cleaned_code import *

# Data Analysis

In [5]:
import pandas as pd
dft_df = pd.read_csv("DFTperovskites.csv")
dft_df.head()
print(dft_df.dtypes)

Chemical formula               object
A                              object
B                              object
In literature                    bool
Valence A                      object
Valence B                      object
Radius A [ang]                float64
Radius B [ang]                float64
Lowest distortion              object
Formation energy [eV/atom]     object
Stability [eV/atom]            object
Magnetic moment [mu_B]         object
Volume per atom [A^3/atom]     object
Band gap [eV]                  object
a [ang]                        object
b [ang]                        object
c [ang]                        object
alpha [deg]                    object
beta [deg]                     object
gamma [deg]                    object
Vacancy energy [eV/O atom]     object
dtype: object


In [6]:
list_to_convert_to_numeric = ["Radius A [ang]","Radius B [ang]",
        "Formation energy [eV/atom]","Stability [eV/atom]","Magnetic moment [mu_B]"
        ,"Volume per atom [A^3/atom]","Band gap [eV]","a [ang]","b [ang]","c [ang]","alpha [deg]"
        ,"beta [deg]","gamma [deg]","Vacancy energy [eV/O atom]"]

dft_df[list_to_convert_to_numeric] = dft_df[list_to_convert_to_numeric].apply(pd.to_numeric, errors='coerce')
print(dft_df.dtypes)

Chemical formula               object
A                              object
B                              object
In literature                    bool
Valence A                      object
Valence B                      object
Radius A [ang]                float64
Radius B [ang]                float64
Lowest distortion              object
Formation energy [eV/atom]    float64
Stability [eV/atom]           float64
Magnetic moment [mu_B]        float64
Volume per atom [A^3/atom]    float64
Band gap [eV]                 float64
a [ang]                       float64
b [ang]                       float64
c [ang]                       float64
alpha [deg]                   float64
beta [deg]                    float64
gamma [deg]                   float64
Vacancy energy [eV/O atom]    float64
dtype: object


Change the numeric values to floats so we can apply calculations

In [7]:
parameter_columns = ["a [ang]","b [ang]","c [ang]","alpha [deg]","beta [deg]","gamma [deg]"]
na_rows = dft_df[parameter_columns].isna().any(axis=1)
print(dft_df[na_rows].shape)
dft_df.dropna(inplace=True)

(53, 21)


Removed 53 chemicals as they has NaN values in their unit cell parameters

In [8]:
import plotly.graph_objs as go

precentages = dft_df["Lowest distortion"].value_counts()/len(dft_df["Lowest distortion"])*100
# Create a bar plot
data = [go.Bar(x=dft_df["Lowest distortion"].value_counts().index,
               y=dft_df["Lowest distortion"].value_counts().values,
               text= [f'{x:.2f}%' for x in precentages],
               textposition='auto')]

# Create the layout
layout = go.Layout(xaxis=dict(title="Crystal Structure", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

The amount of structures within the data is uneven, and does not include all 14 bravais lattices, there is clear bias within the data set we need to coinsider when analysing the model

In [9]:
# Create a list to store the box plot traces
box_plots = []

# Iterate over the columns in dft_df
for column in parameter_columns[:3]:
    # Create a box plot trace for each column
    box_plot = go.Box(y=dft_df[column], name=column)
    # Append the box plot trace to the list
    box_plots.append(box_plot)
    
layout = go.Layout(xaxis=dict(title="Values in Literature", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=box_plots, layout=layout)

# Show the figure
fig.show()


Since these are perovskite materials, we would expect the largest range change to be within the distance on the c plane

In [10]:
# Create a bar plot
data = [go.Bar(x=dft_df["In literature"].value_counts().index,
               y=dft_df["In literature"].value_counts().values,
               text=dft_df["In literature"].value_counts().values,
               textposition='auto')]

# Create the layout
layout = go.Layout(xaxis=dict(title="Values in Literature", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

False is the amount that has been calculated using DFT calculations where as true is measured

In literature data is small but could be interesting to explore?

In [11]:
in_lit = dft_df[dft_df["In literature"] == True]

precentages = in_lit["Lowest distortion"].value_counts()/len(in_lit["Lowest distortion"])*100
# Create a bar plot
data = [go.Bar(x=in_lit["Lowest distortion"].value_counts().index,
               y=in_lit["Lowest distortion"].value_counts().values,
               text= [f'{x:.2f}%' for x in precentages],
               textposition='auto')]


# Create the layout
layout = go.Layout(xaxis=dict(title="Crystal Structure", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

Well thats a fairly bad skew, so I won't be seperating literature to non-literature

# Machine Learning

### Featurisation

- Create the coordinates for each material
- Featurise each coordinate using PHFs
- Return each value in the 18 long featurised list into it's own column

In [12]:
coordinates = []
for index, row in dft_df.iterrows():
    structure = BravaisLattice(row["a [ang]"], row["b [ang]"], row["c [ang]"],
                               row["alpha [deg]"], row["beta [deg]"], row["gamma [deg]"])
    coordinates.append(structure.get_coords(2))

dft_df["Coordinates"] = coordinates

In [13]:
features = PresistentHomologyFeatures(coords=coordinates)
topol_feat_mat, topol_feat_list = features.featurising_coords()

In [14]:
for i, feature in enumerate(topol_feat_mat.T):
    dft_df[f"Feature {i}"] = np.squeeze(feature)
    
dft_df.head()


Unnamed: 0,Chemical formula,A,B,In literature,Valence A,Valence B,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,Ac2O3,Ac,Ac,False,element not in BV,element not in BV,1.12,1.12,cubic,-2.732,...,0.0,6.22413,2.178908,0.0,2.945959,0.78535,0.0,0.0,0.0,0.0
1,AcAgO3,Ac,Ag,False,element not in BV,element not in BV,1.12,0.95,orthorhombic,-1.957,...,0.0,8.298746,2.30754,0.0,6.837703,1.299609,0.0,9.702421,5.520216,0.0
2,AcAlO3,Ac,Al,False,element not in BV,element not in BV,1.12,0.54,cubic,-3.532,...,0.0,5.103654,1.786659,0.0,2.187412,0.583132,0.0,0.0,0.0,0.0
3,AcAsO3,Ac,As,False,element not in BV,element not in BV,1.12,0.52,orthorhombic,-2.398,...,0.0,8.279229,2.307365,0.0,6.855119,1.294148,0.0,10.268007,5.699575,0.0
4,AcAuO3,Ac,Au,False,element not in BV,element not in BV,1.12,0.93,orthorhombic,-2.006,...,0.0,8.493155,2.472979,0.0,6.083788,1.449416,0.0,8.726824,5.336804,0.0


Features made and they all seem different for each structure which is good! 

Feature 15 and 16 only seem to appear for orthorhombic ones, it would be a good idea to explore this a bit more possibly in a bigger data set

## Random Forest Classification

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

features = dft_df[dft_df.columns[22:]]
dft_df["Lowest distortion"] = dft_df["Lowest distortion"].astype('category')
target = dft_df["Lowest distortion"].cat.codes

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a Random Forest classifier
model = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=5, min_samples_leaf=2, min_samples_split=2)

# Train the classifier on the training data
model.fit(x_train, y_train)

# Predict on the test data
predictions = model.predict(x_test)

# Perform cross-validation
cv_scores = cross_val_score(model, features, target, cv=5)

# Print the cross-validation scores
print("Cross-validation Scores:", cv_scores)

# Calculate mean cross-validation score
mean_cv_score = np.mean(cv_scores)
print("Mean Cross-validation Score:", mean_cv_score)

print(classification_report(y_test, predictions))

conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:", conf_matrix)

Cross-validation Scores: [1.         1.         0.99873897 0.99873897 1.        ]
Mean Cross-validation Score: 0.9994955863808321
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       485
           1       1.00      1.00      1.00       242
           2       1.00      1.00      1.00        53
           3       1.00      1.00      1.00        14

    accuracy                           1.00       794
   macro avg       1.00      1.00      1.00       794
weighted avg       1.00      1.00      1.00       794

Confusion Matrix: [[485   0   0   0]
 [  0 242   0   0]
 [  0   0  53   0]
 [  0   0   0  14]]


This is the biased dataset, it seems really overfitted

In [16]:
import random

# Define the number of samples to pick
num_samples = 70

# Filter the dataset based on the crystal structures
cubic_samples = dft_df[dft_df['Lowest distortion'] == 'cubic'].sample(num_samples, random_state=42)
orthorhombic_samples = dft_df[dft_df['Lowest distortion'] == 'orthorhombic'].sample(num_samples, random_state=42)
rhombohedral_samples = dft_df[dft_df['Lowest distortion'] == 'rhombohedral'].sample(num_samples, random_state=42)
tetragonal_samples = dft_df[dft_df['Lowest distortion'] == 'tetragonal'].sample(num_samples, random_state=42)

# Concatenate the samples from different crystal structures
random_samples = pd.concat([cubic_samples, orthorhombic_samples, rhombohedral_samples, tetragonal_samples])

# Reset the index of the concatenated samples
random_samples.reset_index(drop=True, inplace=True)

random_samples.head()

Unnamed: 0,Chemical formula,A,B,In literature,Valence A,Valence B,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,SmYO3,Sm,Y,False,not balanced,not balanced,1.24,0.9,cubic,-3.003,...,0.0,5.655294,1.979773,0.0,2.551476,0.680186,0.0,0.0,0.0,0.0
1,CdNdO3,Cd,Nd,False,not balanced,not balanced,1.31,0.98,cubic,-1.495,...,0.0,5.874891,2.056649,0.0,2.701521,0.720186,0.0,0.0,0.0,0.0
2,TmAsO3,Tm,As,False,3,3,1.05,0.58,cubic,-1.825,...,0.0,5.261076,1.841768,0.0,2.289394,0.610319,0.0,0.0,0.0,0.0
3,DyTcO3,Dy,Tc,False,element not in BV,element not in BV,1.08,0.6,cubic,-2.083,...,0.0,5.198901,1.820002,0.0,2.248931,0.599532,0.0,0.0,0.0,0.0
4,PmEuO3,Pm,Eu,False,element not in BV,element not in BV,1.14,1.06,cubic,-2.289,...,0.0,5.643387,1.975606,0.0,2.543422,0.67804,0.0,0.0,0.0,0.0


In [17]:
features = random_samples[random_samples.columns[22:]]
random_samples["Lowest distortion"] = random_samples["Lowest distortion"].astype('category')
target = random_samples["Lowest distortion"].cat.codes

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a Random Forest classifier
model = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=5, min_samples_leaf=2, min_samples_split=2)

# Train the classifier on the training data
model.fit(x_train, y_train)

# Predict on the test data
predictions = model.predict(x_test)

# Perform cross-validation
cv_scores = cross_val_score(model, features, target, cv=25)

# Calculate mean cross-validation score
mean_cv_score = np.mean(cv_scores)
print("Mean Cross-validation Score:", mean_cv_score)

std_score = np.std(cv_scores)
print("Standard Deviation of Cross-validation Score:", std_score)

print(classification_report(y_test, predictions))

conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:", conf_matrix)

Mean Cross-validation Score: 0.9927272727272728
Standard Deviation of Cross-validation Score: 0.02466301812045553
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13
           3       1.00      1.00      1.00        15

    accuracy                           1.00        56
   macro avg       1.00      1.00      1.00        56
weighted avg       1.00      1.00      1.00        56

Confusion Matrix: [[15  0  0  0]
 [ 0 13  0  0]
 [ 0  0 13  0]
 [ 0  0  0 15]]


Not overfitted anymore! Key changes were the hyperparameters and the bias

This is a small data set so I think it would be great to see if we can make a dummy dataset and test it out, and also include all the different types of structures. We can also see how these features differ and what features are important