with physical features. without, assymmetric. 2 unit cells wide, feature importance, without PHFs included

Regressor and Classifier

In [1]:
import sys
sys.path.append('..')
from cleaned_code import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, r2_score


def RFclassifier(features, target, folds = 5):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
    
    # Create a Random Forest classifier
    model = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=2, min_samples_split=2)

    # Train the classifier on the training data
    model.fit(x_train, y_train)

    # Predict on the test data
    predictions = model.predict(x_test)
    
    accuracy = accuracy_score(y_test, predictions)
    
    conf_matrix = confusion_matrix(y_test, predictions)
    
    class_report = classification_report(y_test, predictions)
    
    cv_scores = cross_val_score(model, features, target, cv=folds)
    mean_cv_score = np.mean(cv_scores)
    standard_deviation_cv_scores = np.std(cv_scores)
    standard_error = standard_deviation_cv_scores / np.sqrt(folds)
    
    return model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error

def RFregressor(features, target, 
                param_grid = {
                    'n_estimators': [50, 100, 150],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['auto', 'sqrt']}):
    
    # Split data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

    # Define the RandomForestRegressor model
    model = RandomForestRegressor()

    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

    # Fit the grid search to the data
    grid_search.fit(x_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

    # Use the best model for prediction
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Print evaluation metrics
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R2) Score:", r2)
    
    return model, best_params, mae, mse, rmse, r2

# Data Work

Getting the data here, and cleaning it. the _2 comes from the fact that the CID also is a dataset, but we're only looking at the MP dataset as its open source and a bit smaller so easier to manage.

In [3]:
import pandas as pd
df_MP = pd.read_csv("3DSC_MP.csv", skiprows=1)
df_MP['cif'] = df_MP['cif'].str.replace('data/final/MP/', '')
df_MP.columns = df_MP.columns.str.replace('_2', '')
df_MP.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [4]:
df_MP_asymcell = df_MP.copy()

asymcell_features = np.load("featurised_datasets\PHF_AsymCell.npy")

for i, feature in enumerate(asymcell_features.T):
    df_MP_asymcell[f"Feature {i}"] = np.squeeze(feature)
    
df_MP_asymcell

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,Ag0.02Ge2Pd1.98Sr1,2,0.008000,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.640000,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,Ag0.15Sn0.85Te1,3,0.150000,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.150000,Other,True,...,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.040000,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.620000,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,Ag0.1In0.9Te1,3,0.100000,1.0,True,Ag0.1In0.9Te1,In1Te1,1.200000,Other,True,...,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.200000,Other,True,...,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,Y1Zn1,1,0.000000,1.0,True,Y1Zn1,Y1Zn1,0.000000,Other,True,...,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,Yb1,1,0.000000,3.0,False,Yb3,Yb3,0.000000,Heavy_fermion,True,...,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,Zn1,1,0.000000,2.0,False,Zn2,Zn2,0.850800,Other,True,...,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,Zn2Zr1,1,0.000000,2.0,False,Zn4Zr2,Zn4Zr2,0.296667,Other,True,...,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


In [5]:
NA_values_columns = []
for column in df_MP.columns:
    if df_MP[column].dtype != 'object' and df_MP[column].dtype != 'bool':
        print(f"Column '{column}' has type: {df_MP[column].dtype}")
        if df_MP[column].isna().any():
            NA_values_columns.append(column)


Column 'formula_similarity' has type: int64
Column 'totreldiff' has type: float64
Column 'formula_frac' has type: float64
Column 'tc' has type: float64
Column 'num_elements_sc' has type: int64
Column 'lata' has type: float64
Column 'latb' has type: float64
Column 'latc' has type: float64
Column 'band_gap' has type: float64
Column 'density' has type: float64
Column 'e_above_hull' has type: float64
Column 'efermi' has type: float64
Column 'encut' has type: float64
Column 'energy' has type: float64
Column 'energy_per_atom' has type: float64
Column 'final_energy' has type: float64
Column 'final_energy_per_atom' has type: float64
Column 'formation_energy_per_atom' has type: float64
Column 'nsites' has type: int64
Column 'ntask_ids' has type: int64
Column 'total_magnetization' has type: float64
Column 'cell_volume' has type: float64
Column 'exchange_symmetry' has type: int64
Column 'num_unique_magnetic_sites' has type: int64
Column 'total_magnetization_normalized_vol' has type: float64
Colum

In [6]:
print(NA_values_columns)

['Reason for exclusion']


So there are no materials in here that need to be excluded

In [7]:
for column in df_MP.columns:
    if df_MP[column].dtype == 'bool':
        print(df_MP[column].value_counts(), "\n")

False    3372
True     2401
Name: correct_formula_frac, dtype: int64 

True     5762
False      11
Name: sc_class_unique_sc, dtype: int64 

True     5013
False     760
Name: has_bandstructure, dtype: int64 

True    5773
Name: is_ordered, dtype: int64 

False    4667
True     1106
Name: is_magnetic, dtype: int64 

True     3437
False    2336
Name: synth_doped, dtype: int64 

True    5773
Name: no_crystal_temp_given, dtype: int64 



In [8]:
for column in df_MP.columns:
    if df_MP[column].dtype == 'object':
        print(f"Column {column}")

Column formula_sc
Column formula
Column orig_formula_cif
Column sc_class
Column norm_formula_sc
Column chemical_composition_sc
Column origin_sc
Column old_formula_sc
Column database_id
Column original_formula
Column chemical_composition
Column norm_formula
Column spacegroup
Column crystal_system
Column cif
Column original_cif
Column material_id
Column band_structure
Column created_at
Column doi
Column doi_bibtex
Column dos
Column exp
Column has
Column icsd_ids
Column last_updated
Column magnetic_type
Column original_task_id
Column oxide_type
Column pretty_formula
Column pseudo_potential
Column reduced_cell_formula
Column run_type
Column task_id
Column task_ids
Column unit_cell_formula
Column ordering
Column magmoms
Column origin
Column cif_before_synthetic_doping
Column graph
Column point_group


In [9]:
df_MP["magnetic_type"].value_counts()

NM     4667
FM      977
FiM     102
AFM      27
Name: magnetic_type, dtype: int64

Interesting, but does not show me that I need to remove anything from this dataset

In [10]:
target = df_MP_asymcell["tc"]
PHFs = df_MP_asymcell.iloc[:, -18:]
PHFs

Unnamed: 0,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.778744,0.000000,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.721158,0.000000,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,0.688938,1.448728,-1.0,11.0,5.0,0.0,1.718738,0.368862,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.552541,0.000000,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,0.303455,-1.000000,-1.0,2.0,0.0,0.0,2.740350,0.000000,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.506546,0.000000,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,0.605795,-1.000000,-1.0,5.0,0.0,0.0,1.605770,0.000000,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


In [11]:
physical_features = ["lata", "latb", "latc", "cell_volume",
                     "band_gap", "density","weight",
                     "e_above_hull", "efermi", "encut", "energy", "energy_per_atom", "final_energy", "final_energy_per_atom", "formation_energy_per_atom",
                     "nsites", "total_magnetization", "exchange_symmetry", "num_unique_magnetic_sites", "total_magnetization_normalized_vol", "total_magnetization_normalized_formula_units",
                     "num_magnetic_sites", "true_total_magnetization"]

In [20]:
def corr_matrix(dataframe, features):
    features.append("tc")    
    # Compute the correlation matrix
    correlation_matrix = dataframe[features].corr()

    # Get the correlation values
    correlation_values = correlation_matrix.values.round(2)

    # Create the heatmap trace
    heatmap = go.Heatmap(
        z=correlation_values,
        x=correlation_matrix.columns,
        y=correlation_matrix.index,
        colorscale='Viridis', 
        colorbar=dict(title='Pearson Coefficient', titleside='right', tickvals = [-1,-0.5,0,0.5,1], ticktext = [-1,-0.5,0,0.5,1]),
        text=correlation_values,
        texttemplate="%{text}",
        textfont={"size":8}
    )
    

    # Create the layout
    layout = go.Layout(
        xaxis=dict(
            title="",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='inside',
            tickwidth=2,
            ticklen=5
        ),
        yaxis=dict(
            title="",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='inside',
            tickwidth=2,
            ticklen=5
        ),
        width=1200,
        height=1000,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=16, color='black'),
        margin=dict(l=5, r=5, b=5, t=10)
    )

    # Create the figure
    fig = go.Figure(data=[heatmap], layout=layout)

    # Show the figure
    fig.show()
    
corr_matrix(df_MP_asymcell, physical_features)

In [21]:
corr_matrix(df_MP_asymcell, PHFs.columns.tolist())

Not sure if these Correlation matrixs are showing anything significant in particular, I think a feature importance will be better.

In [22]:
regressor_df = pd.DataFrame(columns=["mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"])

for i in range(10):
    model, best_params, mae, mse, rmse, r2 = RFregressor(PHFs, target)
    regressor_df = regressor_df.append({"mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                        "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                        "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], "max_features": best_params["max_features"]}, ignore_index=True)

Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Absolute Error (MAE): 5.131562466303511
Mean Squared Error (MSE): 102.00212993870615
Root Mean Squared Error (RMSE): 10.099610385490429
R-squared (R2) Score: 0.724588242348051
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Absolute Error (MAE): 4.961795510175985
Mean Squared Error (MSE): 86.62748381669007
Root Mean Squared Error (RMSE): 9.30738866797181
R-squared (R2) Score: 0.766510693971636
Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Absolute Error (MAE): 4.60901765570279
Mean Squared Error (MSE): 82.87818469358822
Root Mean Squared Error (RMSE): 9.103745640866084
R-squared (R2) Score: 0.7555429858130959
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf'

In [28]:
regressor_df

Unnamed: 0,mae,mse,rmse,r2,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features
0,5.131562,102.00213,10.09961,0.724588,100,20.0,2,1,sqrt
1,4.961796,86.627484,9.307389,0.766511,100,,2,1,sqrt
2,4.609018,82.878185,9.103746,0.755543,100,20.0,2,1,sqrt
3,4.999798,101.270844,10.063342,0.725802,50,,2,1,sqrt
4,4.904123,86.367589,9.293416,0.757168,100,20.0,2,1,sqrt
5,5.06354,95.64231,9.779689,0.735769,150,20.0,2,1,sqrt
6,4.323417,70.682404,8.407283,0.757046,100,20.0,2,1,sqrt
7,4.776687,78.718361,8.872337,0.785277,150,20.0,2,1,sqrt
8,5.114494,106.281152,10.309275,0.698254,100,,2,1,sqrt
9,4.856146,92.353719,9.610084,0.729323,100,20.0,2,1,sqrt


In [29]:
regressor_df.describe()

Unnamed: 0,mae,mse,rmse,r2
count,10.0,10.0,10.0,10.0
mean,4.874058,90.282418,9.484617,0.743528
std,0.25115,11.296363,0.600424,0.025318
min,4.323417,70.682404,8.407283,0.698254
25%,4.796552,83.750536,9.151163,0.726682
50%,4.932959,89.490602,9.458736,0.745656
75%,5.047604,99.863711,9.992428,0.757137
max,5.131562,106.281152,10.309275,0.785277


In [27]:
regressor_df["n_estimators"].mean()

105.0

### Hyperparameters should be:
- n_estimators = 100
- max_depth = 20
- min_samples_split = 2
- min_samples_leaf = 1
- max_features = sqrt

In [39]:
model.fit(PHFs, target)
# Get the feature importances
importances = model.feature_importances_

# Get the names of the features
feature_names = PHFs.columns.tolist()

import plotly.graph_objects as go

# Sort the feature importances in descending order
indices = np.argsort(importances)[::-1]

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=[feature_names[i] for i in indices],
    y=importances[indices],
    marker_color='rgb(49, 104, 142)'
))

# Set the layout
fig.update_layout(
    xaxis=dict(
        title="Features",
        showline=True,
        linewidth=2,
        linecolor='black',
        ticks='outside',
        tickson = "boundaries",
        tickwidth=2,
        ticklen=5
    ),
    yaxis=dict(
        title="Feature Importance",
        showline=True,
        linewidth=2,
        linecolor='black',
        ticks='inside',
        tickwidth=2,
        ticklen=5
    ),
    barmode='group',
    width=800,
    height=500,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Helvetica', size=16, color='black'),
    margin=dict(l=10, r=10, b=10, t=10),
    legend=dict(
        title=dict(
            text="Unit Cell Size",
            font=dict(
                family='Helvetica',
                size=16,
                color='black'
            )
        )
    )
)
# Show the plot
fig.show()

In [40]:
combined_features = pd.concat([PHFs, df_MP_asymcell[physical_features]], axis=1)

In [41]:
regressor_df = pd.DataFrame(columns=["mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"])

for i in range(1):
    model, best_params, mae, mse, rmse, r2 = RFregressor(combined_features, target)
    regressor_df = regressor_df.append({"mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                        "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                        "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], "max_features": best_params["max_features"]}, ignore_index=True)

Best Parameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Mean Absolute Error (MAE): 0.022498976267432434
Mean Squared Error (MSE): 0.028854099660026954
Root Mean Squared Error (RMSE): 0.16986494535373375
R-squared (R2) Score: 0.9999272347705195
