In [1]:
import sys
sys.path.append('..')
from cleaned_code import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go

In [3]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, r2_score

def RFclassifier(features, target, folds = 5):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
    
    # Create a Random Forest classifier
    model = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=2, min_samples_split=2)

    # Train the classifier on the training data
    model.fit(x_train, y_train)

    # Predict on the test data
    predictions = model.predict(x_test)
    
    accuracy = accuracy_score(y_test, predictions)
    
    conf_matrix = confusion_matrix(y_test, predictions)
    
    class_report = classification_report(y_test, predictions)
    
    cv_scores = cross_val_score(model, features, target, cv=folds)
    mean_cv_score = np.mean(cv_scores)
    standard_deviation_cv_scores = np.std(cv_scores)
    standard_error = standard_deviation_cv_scores / np.sqrt(folds)
    
    return model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error

# Data Read In

In [4]:
df_MP = pd.read_csv("3DSC_MP.csv", skiprows=1)
df_MP['cif'] = df_MP['cif'].str.replace('data/final/MP/', '')
df_MP.columns = df_MP.columns.str.replace('_2', '')
df_MP.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [5]:
target_tc = df_MP["tc"]

In [6]:
df_MP["cat_crystal_system"] = df_MP["crystal_system"].astype('category').cat.codes
physical_features = ["cell_volume","lata", "latb", "latc", "efermi", "formation_energy_per_atom", "final_energy_per_atom", "exchange_symmetry", "cat_crystal_system"]
df_features = df_MP[physical_features]
df_features.head()

Unnamed: 0,cell_volume,lata,latb,latc,efermi,formation_energy_per_atom,final_energy_per_atom,exchange_symmetry,cat_crystal_system
0,101.453048,4.438672,4.438672,6.030548,4.015543,-0.713482,-4.972544,139,4
1,66.066855,4.53767,4.53767,4.53767,6.066451,-0.497162,-3.862598,225,0
2,101.453048,4.438672,4.438672,6.030548,4.015543,-0.713482,-4.972544,139,4
3,62.043965,4.443633,4.443633,4.443633,6.31406,-0.499851,-3.236343,225,0
4,310.486636,6.771439,6.771439,6.771439,5.202543,-0.299456,-4.603323,212,0


In [7]:
asymcell_features = np.load("featurised_datasets\PHF_AsymCell.npy")

for i, feature in enumerate(asymcell_features.T):
    df_features[f"Feature {i}"] = np.squeeze(feature)
    
df_features

Unnamed: 0,cell_volume,lata,latb,latc,efermi,formation_energy_per_atom,final_energy_per_atom,exchange_symmetry,cat_crystal_system,Feature 0,...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,101.453048,4.438672,4.438672,6.030548,4.015543,-0.713482,-4.972544,139,4,0.573912,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,66.066855,4.537670,4.537670,4.537670,6.066451,-0.497162,-3.862598,225,0,0.000000,...,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,101.453048,4.438672,4.438672,6.030548,4.015543,-0.713482,-4.972544,139,4,0.573912,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,62.043965,4.443633,4.443633,4.443633,6.314060,-0.499851,-3.236343,225,0,0.000000,...,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,310.486636,6.771439,6.771439,6.771439,5.202543,-0.299456,-4.603323,212,0,0.688938,...,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,46.092232,3.585441,3.585441,3.585441,3.144927,-0.366744,-4.229851,221,0,0.000000,...,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,123.042457,3.878974,3.878974,9.704523,1.419946,0.000000,-1.539608,166,6,0.303455,...,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,31.114928,2.626730,2.626731,5.207234,2.851722,0.000000,-1.259744,194,1,0.000000,...,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,101.994903,5.244424,5.244424,5.244424,3.547729,-0.302553,-3.991615,227,0,0.605795,...,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


In [8]:
PHFs = df_features.iloc[:, -18:]
PHFs

Unnamed: 0,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.778744,0.000000,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.721158,0.000000,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,0.688938,1.448728,-1.0,11.0,5.0,0.0,1.718738,0.368862,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.552541,0.000000,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,0.303455,-1.000000,-1.0,2.0,0.0,0.0,2.740350,0.000000,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.506546,0.000000,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,0.605795,-1.000000,-1.0,5.0,0.0,0.0,1.605770,0.000000,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


# Regressor Models

In [9]:
def RFregressor(features, target, 
                param_grid = {
                    'n_estimators': [50, 100, 150],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['log2', 'sqrt']}):
    
    # Split data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

    # Define the RandomForestRegressor model
    model = RandomForestRegressor()

    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

    # Fit the grid search to the data
    grid_search.fit(x_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Use the best model for prediction
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test)
    feature_importances = best_model.feature_importances_

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    return best_model, best_params, mae, mse, rmse, r2, feature_importances

In [10]:
def plot_feature_importances(importances, features):    
    # Get the names of the features
    feature_names = features.columns.tolist()

    import plotly.graph_objects as go

    # Sort the feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # Create the bar plot
    fig = go.Figure(data=go.Bar(
        x=[feature_names[i] for i in indices],
        y=importances[indices],
        marker_color='rgb(33, 145, 140)'
    ))

    # Set the layout
    fig.update_layout(
        xaxis=dict(
            title="Features",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='outside',
            tickson = "boundaries",
            tickwidth=2,
            ticklen=5
        ),
        yaxis=dict(
            title="Feature Importance",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='inside',
            tickwidth=2,
            ticklen=5
        ),
        barmode='group',
        width=800,
        height=500,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=16, color='black'),
        margin=dict(l=10, r=10, b=10, t=10),
        legend=dict(
            title=dict(
                text="Unit Cell Size",
                font=dict(
                    family='Helvetica',
                    size=16,
                    color='black'
                )
            )
        )
    )
    # Show the plot
    fig.show()

## PHFS only

In [11]:
regressor_df_PHFonly = pd.DataFrame(columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"])

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(PHFs, target_tc)
    regressor_df_PHFonly = regressor_df_PHFonly.append({"model":model,
                                                        "mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                                        "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                                        "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], 
                                                        "max_features": best_params["max_features"], 
                                                        "feature_importance": feature_importances}, ignore_index=True)

In [14]:
regressor_df_PHFonly.sort_values("r2")

Unnamed: 0,model,mae,mse,rmse,r2,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,feature_importance
2,"(DecisionTreeRegressor(max_features='log2', ra...",5.101257,124.004034,11.13571,0.614807,50,,2,1,log2,"[0.07083501392275093, 0.09022062637458357, 0.0..."
3,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.068781,105.420295,10.267439,0.657768,150,20.0,2,1,sqrt,"[0.08698127968082194, 0.0778601186408237, 0.03..."
5,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.27414,115.497684,10.746985,0.67812,150,20.0,2,1,log2,"[0.08160045454020266, 0.07892965631473682, 0.0..."
4,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.544898,123.808273,11.126917,0.704808,100,20.0,2,1,log2,"[0.08137468953963223, 0.07287988650320862, 0.0..."
7,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.206239,120.448279,10.974893,0.711572,50,20.0,2,1,log2,"[0.0913145262650642, 0.08340434455267627, 0.04..."
1,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.247793,97.722264,9.885457,0.718862,100,20.0,2,1,log2,"[0.0806315576349741, 0.08043519200516196, 0.04..."
6,"(DecisionTreeRegressor(max_features='log2', ra...",4.986827,99.766216,9.988304,0.722268,100,,2,1,log2,"[0.08699763672753492, 0.07146963879391562, 0.0..."
0,"(DecisionTreeRegressor(max_depth=20, max_featu...",4.975622,96.488384,9.82285,0.733512,100,20.0,2,1,log2,"[0.08203939680540785, 0.07850484027624215, 0.0..."
8,"(DecisionTreeRegressor(max_depth=20, max_featu...",4.556353,83.395693,9.132124,0.755655,150,20.0,2,1,sqrt,"[0.08755461982977651, 0.07506122063649567, 0.0..."
9,"(DecisionTreeRegressor(max_features='sqrt', ra...",4.558946,70.987053,8.425381,0.786968,150,,2,1,sqrt,"[0.08453275264194454, 0.06788654333731113, 0.0..."


In [15]:
regressor_df_PHFonly.describe()

Unnamed: 0,mae,mse,rmse,r2
count,10.0,10.0,10.0,10.0
mean,5.052086,103.753818,10.150606,0.708434
std,0.308397,17.71639,0.893815,0.048991
min,4.556353,70.987053,8.425381,0.614807
25%,4.978423,96.796854,9.838502,0.684792
50%,5.085019,102.593255,10.127871,0.715217
75%,5.237405,119.21063,10.917916,0.730701
max,5.544898,124.004034,11.13571,0.786968


In [24]:
best_model = regressor_df_PHFonly.iloc[regressor_df_PHFonly["r2"].idxmax()]["model"]
predicted_tc = best_model.predict(PHFs)

# Create a scatter plot
fig = go.Figure(data=go.Scatter(x=predicted_tc, y=target_tc, mode='markers'))


# Set the axis labels
fig.update_layout(xaxis=dict(title="Crystal System", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Show the plot
fig.show()


## With Physical Features

In [12]:
regressor_df_all = pd.DataFrame(columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"])

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(df_features, target_tc)
    regressor_df_all = regressor_df_all.append({"model":model,
                                                "mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                                "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                                "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], 
                                                "max_features": best_params["max_features"], 
                                                "feature_importance": feature_importances}, ignore_index=True)