In [1]:
import sys
sys.path.append('..')
from cleaned_code import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go

## Functions to run the code

In [3]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, r2_score

def RFclassifier(features, target, folds = 5):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
    
    # Create a Random Forest classifier
    model = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=2, min_samples_split=2)

    # Train the classifier on the training data
    model.fit(x_train, y_train)

    # Predict on the test data
    predictions = model.predict(x_test)
    
    accuracy = accuracy_score(y_test, predictions)
    
    conf_matrix = confusion_matrix(y_test, predictions)
    
    class_report = classification_report(y_test, predictions)
    
    cv_scores = cross_val_score(model, features, target, cv=folds)
    mean_cv_score = np.mean(cv_scores)
    standard_deviation_cv_scores = np.std(cv_scores)
    standard_error = standard_deviation_cv_scores / np.sqrt(folds)
    
    return model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error

In [4]:
def RFregressor(features, target, 
                param_grid = {
                    'n_estimators': [50, 100, 150],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['log2', 'sqrt']}):
    
    # Split data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

    # Define the RandomForestRegressor model
    model = RandomForestRegressor()

    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

    # Fit the grid search to the data
    grid_search.fit(x_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Use the best model for prediction
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test)
    feature_importances = best_model.feature_importances_

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    return best_model, best_params, mae, mse, rmse, r2, feature_importances

In [44]:
def plot_feature_importances(importances, features, width=800, height=500):    
    # Get the names of the features
    feature_names = features.columns.tolist()

    import plotly.graph_objects as go

    # Sort the feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # Create the bar plot
    fig = go.Figure(data=go.Bar(
        x=[feature_names[i] for i in indices],
        y=importances[indices],
        marker_color='rgb(33, 145, 140)', 
        text= [f'{x:.2f}%' for x in importances[indices]],
        textposition='auto'
    ))

    # Set the layout
    fig.update_layout(
        xaxis=dict(
            title="Features",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='outside',
            tickson = "boundaries",
            tickwidth=2,
            ticklen=5
        ),
        yaxis=dict(
            title="Feature Importance",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='inside',
            tickwidth=2,
            ticklen=5
        ),
        barmode='group',
        width=width,
        height=height,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=18, color='black'),
        margin=dict(l=10, r=10, b=10, t=10),
        showlegend=False,
        )
    # Show the plot
    fig.show()

In [53]:
def plot_predicted_vs_actual(target_tc, predicted_tc, df):    
    fig = go.Figure(data=go.Scatter(x=target_tc, y=predicted_tc, mode='markers', text=df["formula_sc"], hoverinfo='text'))

    # Add y=x line
    fig.add_trace(go.Scatter(x=np.linspace(min(predicted_tc), max(predicted_tc), 100),
                            y=np.linspace(min(predicted_tc), max(predicted_tc), 100),
                            mode='lines',
                            name='y=x',
                            line=dict(color='red', width=5, dash='dash')))


    # Set the axis labels
    fig.update_layout(xaxis=dict(title="Real Value", showline=True, linewidth=2, linecolor='black',
                                ticks='inside', tickwidth=2, ticklen=5, range=[min(target_tc)-0.5, max(target_tc)+10]),
                    yaxis=dict(title="Predicted Value", showline=True, linewidth=2, linecolor='black',
                                ticks='inside', tickwidth=2, ticklen=5, range=[min(predicted_tc)-0.5, max(predicted_tc)+10]),
                    width=800,
                    height=500,
                    plot_bgcolor='white',
                    paper_bgcolor='white', 
                    font=dict(family='Helvetica', size=16, color='black'),
                    margin=dict(l=10, r=10, b=10, t=10))

    # Show the plot
    fig.show()

# Data Read In

In [7]:
df_MP = pd.read_csv("3DSC_MP.csv", skiprows=1)
df_MP['cif'] = df_MP['cif'].str.replace('data/final/MP/', '')
df_MP.columns = df_MP.columns.str.replace('_2', '')
df_MP.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [8]:
target_tc = df_MP["tc"]

In [9]:
df_MP["cat_crystal_system"] = df_MP["crystal_system"].astype('category').cat.codes
physical_features = ['lata', 'latb','latc',
                     'density',
                     'efermi','energy','final_energy','final_energy_per_atom','formation_energy_per_atom',
                     'nsites','total_magnetization','cell_volume','exchange_symmetry',
                     'total_magnetization_normalized_formula_units','true_total_magnetization','totreldiff']
df_features = df_MP[physical_features]
df_features.head()

Unnamed: 0,lata,latb,latc,density,efermi,energy,final_energy,final_energy_per_atom,formation_energy_per_atom,nsites,total_magnetization,cell_volume,exchange_symmetry,total_magnetization_normalized_formula_units,true_total_magnetization,totreldiff
0,4.438672,4.438672,6.030548,7.295677,4.015543,-24.862722,-24.862722,-4.972544,-0.713482,5,0.001259,101.453048,139,0.001259,0.001259,0.008
1,4.53767,4.53767,4.53767,6.19081,6.066451,-7.725196,-7.725196,-3.862598,-0.497162,2,0.0,66.066855,225,0.0,0.0,0.15
2,4.438672,4.438672,6.030548,7.295677,4.015543,-24.862722,-24.862722,-4.972544,-0.713482,5,0.001259,101.453048,139,0.001259,0.001259,0.04
3,4.443633,4.443633,4.443633,6.488053,6.31406,-6.472687,-6.472687,-3.236343,-0.499851,2,0.000182,62.043965,225,0.000182,0.000182,0.1
4,6.771439,6.771439,6.771439,4.13945,5.202543,-55.239871,-55.239871,-4.603323,-0.299456,12,0.0,310.486636,212,0.0,0.0,0.133333


In [10]:
asymcell_features = np.load("featurised_datasets\PHF_AsymCell.npy")

df_features_all = df_features.copy()

for i, feature in enumerate(asymcell_features.T):
    df_features_all[f"Feature {i}"] = np.squeeze(feature)
    
df_features_all

Unnamed: 0,lata,latb,latc,density,efermi,energy,final_energy,final_energy_per_atom,formation_energy_per_atom,nsites,...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,4.438672,4.438672,6.030548,7.295677,4.015543,-24.862722,-24.862722,-4.972544,-0.713482,5,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,4.537670,4.537670,4.537670,6.190810,6.066451,-7.725196,-7.725196,-3.862598,-0.497162,2,...,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,4.438672,4.438672,6.030548,7.295677,4.015543,-24.862722,-24.862722,-4.972544,-0.713482,5,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,4.443633,4.443633,4.443633,6.488053,6.314060,-6.472687,-6.472687,-3.236343,-0.499851,2,...,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,6.771439,6.771439,6.771439,4.139450,5.202543,-55.239871,-55.239871,-4.603323,-0.299456,12,...,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,3.585441,3.585441,3.585441,5.559415,3.144927,-8.459703,-8.459703,-4.229851,-0.366744,2,...,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,3.878974,3.878974,9.704523,7.005867,1.419946,-4.618825,-4.618825,-1.539608,0.000000,3,...,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,2.626730,2.626731,5.207234,6.981485,2.851722,-2.519487,-2.519487,-1.259744,0.000000,2,...,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,5.244424,5.244424,5.244424,7.229958,3.547729,-23.949691,-23.949691,-3.991615,-0.302553,6,...,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


In [11]:
PH_features = df_features_all.iloc[:, -18:]
PH_features

Unnamed: 0,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.778744,0.000000,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.721158,0.000000,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,0.688938,1.448728,-1.0,11.0,5.0,0.0,1.718738,0.368862,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.552541,0.000000,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,0.303455,-1.000000,-1.0,2.0,0.0,0.0,2.740350,0.000000,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.506546,0.000000,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,0.605795,-1.000000,-1.0,5.0,0.0,0.0,1.605770,0.000000,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


# Regressor Models

## PHFS only

In [12]:
regressor_df_PHFonly = pd.DataFrame(columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"])

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(PH_features, target_tc)
    regressor_df_PHFonly = regressor_df_PHFonly.append({"model":model,
                                                        "mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                                        "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                                        "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], 
                                                        "max_features": best_params["max_features"], 
                                                        "feature_importance": feature_importances}, ignore_index=True)

In [13]:
regressor_df_PHFonly.sort_values("r2")

Unnamed: 0,model,mae,mse,rmse,r2,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,feature_importance
4,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.284092,118.962806,10.907007,0.647928,150,20.0,2,1,sqrt,"[0.08419258258806972, 0.09408142987010977, 0.0..."
6,"(DecisionTreeRegressor(max_features='sqrt', ra...",5.010803,108.211507,10.402476,0.701399,100,,2,1,sqrt,"[0.0822957853312113, 0.0863244303733284, 0.037..."
2,"(DecisionTreeRegressor(max_depth=20, max_featu...",4.705414,99.720663,9.986023,0.716373,150,20.0,2,1,sqrt,"[0.07532772217452026, 0.08495892352965362, 0.0..."
8,"(DecisionTreeRegressor(max_depth=20, max_featu...",4.798923,94.050936,9.697986,0.73789,150,20.0,2,1,sqrt,"[0.07516967779611178, 0.09014387051473803, 0.0..."
9,"(DecisionTreeRegressor(max_features='log2', ra...",4.66114,85.200458,9.230409,0.743781,150,,2,1,log2,"[0.08788545441873763, 0.06679393729800429, 0.0..."
1,"(DecisionTreeRegressor(max_depth=20, max_featu...",4.868226,91.092207,9.544224,0.7463,150,20.0,2,1,sqrt,"[0.08858841526901805, 0.07642431276976494, 0.0..."
3,"(DecisionTreeRegressor(max_features='log2', ra...",4.757506,89.313498,9.450582,0.749493,150,,2,1,log2,"[0.08339058269210295, 0.08061926583757612, 0.0..."
5,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.06701,91.378499,9.55921,0.756815,100,20.0,2,1,sqrt,"[0.09471154606246378, 0.07441932159206323, 0.0..."
0,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.082017,95.039439,9.748817,0.762211,150,20.0,2,1,sqrt,"[0.08227872258721541, 0.0872504839069084, 0.02..."
7,"(DecisionTreeRegressor(max_depth=20, max_featu...",5.133106,91.46347,9.563654,0.791283,100,20.0,2,1,sqrt,"[0.090856168704172, 0.0683076720481105, 0.0497..."


In [14]:
regressor_df_PHFonly.describe()

Unnamed: 0,mae,mse,rmse,r2
count,10.0,10.0,10.0,10.0
mean,4.936824,96.443348,9.809039,0.735347
std,0.207646,10.11256,0.501226,0.039248
min,4.66114,85.200458,9.230409,0.647928
25%,4.767861,91.16378,9.54797,0.721752
50%,4.939514,92.757203,9.63082,0.745041
75%,5.078266,98.550357,9.926722,0.754984
max,5.284092,118.962806,10.907007,0.791283


In [89]:
regressor_df_PHFonly.mean()

mae                    4.936824
mse                   96.443348
rmse                   9.809039
r2                     0.735347
n_estimators         135.000000
max_depth             20.000000
min_samples_split      2.000000
min_samples_leaf       1.000000
dtype: float64

Average Hyper Paramters:
- n_estimators         135.0
- max_depth             20.0
- min_samples_split      2.0
- min_samples_leaf       1.0
- max_features           sqrt

In [54]:
best_model_PHFONLY = regressor_df_PHFonly.iloc[regressor_df_PHFonly["r2"].idxmax()]["model"]
predicted_tc = best_model_PHFONLY.predict(PH_features)

plot_predicted_vs_actual(target_tc, predicted_tc, df_MP)

In [45]:
feature_importance_array = np.array(regressor_df_PHFonly["feature_importance"].tolist())
importances = feature_importance_array.mean(axis=0)
plot_feature_importances(importances, PH_features, width=1000, height=600)

## Physical Features Only

In [17]:
regressor_df_PF = pd.DataFrame(columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"])

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(df_features, target_tc)
    regressor_df_PF = regressor_df_PF.append({"model":model,
                                                "mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                                "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                                "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], 
                                                "max_features": best_params["max_features"], 
                                                "feature_importance": feature_importances}, ignore_index=True)

In [18]:
regressor_df_PF.sort_values("r2")

Unnamed: 0,model,mae,mse,rmse,r2,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,feature_importance
5,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.837081,99.649516,9.98246,0.721099,100,10.0,2,1,sqrt,"[0.03376636965702596, 0.09250267171595862, 0.1..."
1,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.886652,94.551464,9.723758,0.722753,50,10.0,5,1,sqrt,"[0.035392453282737615, 0.09030994884476554, 0...."
4,"(DecisionTreeRegressor(max_features='sqrt', mi...",4.925371,102.568115,10.127592,0.72909,50,,2,4,sqrt,"[0.03567669101946515, 0.08242845122186042, 0.1..."
0,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.568932,80.211506,8.956088,0.743928,150,10.0,2,1,sqrt,"[0.03432796638996079, 0.08454265634341426, 0.1..."
7,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.465211,75.709525,8.701122,0.758036,100,10.0,2,1,sqrt,"[0.03472463288020199, 0.08359372162892202, 0.1..."
6,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.621197,77.86313,8.824009,0.758908,150,10.0,2,1,log2,"[0.033306510176404514, 0.09135805758915493, 0...."
3,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.782051,91.224855,9.55117,0.763699,150,10.0,2,1,log2,"[0.03364228094137065, 0.0774969959484921, 0.12..."
9,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.649009,80.358483,8.964289,0.764623,150,10.0,5,1,log2,"[0.0323308141712951, 0.07870980283568668, 0.13..."
2,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.524473,86.688776,9.310681,0.77221,100,10.0,5,1,sqrt,"[0.03396217261558996, 0.08807721365831486, 0.1..."
8,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.982227,87.774481,9.368804,0.779737,50,10.0,5,2,log2,"[0.027365540160768018, 0.07671512620408502, 0...."


In [19]:
regressor_df_PF.describe()

Unnamed: 0,mae,mse,rmse,r2
count,10.0,10.0,10.0,10.0
mean,4.72422,87.659985,9.350997,0.751408
std,0.181618,9.278062,0.493103,0.020963
min,4.465211,75.709525,8.701122,0.721099
25%,4.581999,80.24825,8.958138,0.7328
50%,4.71553,87.231629,9.339742,0.758472
75%,4.874259,93.719812,9.680611,0.764392
max,4.982227,102.568115,10.127592,0.779737


In [90]:
regressor_df_PF.mean()

mae                    4.724220
mse                   87.659985
rmse                   9.350997
r2                     0.751408
n_estimators         105.000000
max_depth             10.000000
min_samples_split      3.200000
min_samples_leaf       1.400000
dtype: float64

Average Hyper Paramters:
- n_estimators         105.0
- max_depth             10.0
- min_samples_split      3.2
- min_samples_leaf       1.4
- max_features           sqrt

In [55]:
best_model = regressor_df_PF.iloc[regressor_df_PF["r2"].idxmax()]["model"]
predicted_tc = best_model.predict(df_features)

plot_predicted_vs_actual(target_tc, predicted_tc, df_MP)

In [47]:
feature_importance_array_A = np.array(regressor_df_PF["feature_importance"].tolist())
importances_A = feature_importance_array_A.mean(axis=0)
plot_feature_importances(importances_A, df_features, width=1000, height=800)

## Combined

In [22]:
regressor_df_all = pd.DataFrame(columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"])

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(df_features_all, target_tc)
    regressor_df_all = regressor_df_all.append({"model":model,
                                                        "mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                                        "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                                        "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], 
                                                        "max_features": best_params["max_features"], 
                                                        "feature_importance": feature_importances}, ignore_index=True)

In [23]:
regressor_df_all.sort_values("r2")

Unnamed: 0,model,mae,mse,rmse,r2,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,feature_importance
0,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.793616,99.417215,9.970818,0.687728,150,10,2,1,sqrt,"[0.03013694931350047, 0.0694904161109185, 0.08..."
7,"(DecisionTreeRegressor(max_depth=20, max_featu...",4.910836,107.72069,10.378858,0.704905,150,20,2,2,sqrt,"[0.02953268437623298, 0.06262187611926957, 0.0..."
3,"(DecisionTreeRegressor(max_depth=10, max_featu...",5.274816,110.049383,10.490442,0.706996,100,10,5,1,sqrt,"[0.028384079542472115, 0.06622548668996493, 0...."
6,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.675404,89.189992,9.444045,0.734008,150,10,5,1,log2,"[0.032124587486332686, 0.07655814493396501, 0...."
5,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.865586,93.705005,9.680135,0.742922,100,10,2,1,log2,"[0.02669252337566738, 0.07579078298320951, 0.0..."
2,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.627754,90.322865,9.503834,0.775665,150,10,2,1,sqrt,"[0.027818744215372053, 0.06967042839481197, 0...."
1,"(DecisionTreeRegressor(max_depth=10, max_featu...",5.064804,93.475932,9.668295,0.786509,100,10,2,1,log2,"[0.028150740239570343, 0.07890198555019147, 0...."
8,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.972707,87.172317,9.336612,0.789139,150,10,2,1,log2,"[0.028198753646947985, 0.08030477286191669, 0...."
9,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.824279,85.514601,9.24741,0.790159,150,10,2,2,sqrt,"[0.030846785247567503, 0.06416689671013975, 0...."
4,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.911735,90.986516,9.538685,0.794423,150,10,2,1,sqrt,"[0.02627076809252274, 0.06317905704878311, 0.0..."


In [24]:
regressor_df_all.describe()

Unnamed: 0,mae,mse,rmse,r2
count,10.0,10.0,10.0,10.0
mean,4.892154,94.755452,9.725914,0.751245
std,0.187193,8.394434,0.42434,0.041043
min,4.627754,85.514601,9.24741,0.687728
25%,4.801282,89.47321,9.458993,0.713749
50%,4.888211,92.231224,9.60349,0.759293
75%,4.957464,97.989163,9.898147,0.788481
max,5.274816,110.049383,10.490442,0.794423


In [91]:
regressor_df_all.mean()

mae                    4.892154
mse                   94.755452
rmse                   9.725914
r2                     0.751245
n_estimators         135.000000
max_depth             11.000000
min_samples_split      2.600000
min_samples_leaf       1.200000
dtype: float64

Average Hyper Paramters:
- n_estimators         135.0
- max_depth             11.0
- min_samples_split      2.6
- min_samples_leaf       1.2
- max_features           sqrt

In [56]:
best_model = regressor_df_all.iloc[regressor_df_all["r2"].idxmax()]["model"]
predicted_tc = best_model.predict(df_features_all)

plot_predicted_vs_actual(target_tc, predicted_tc, df_MP)

In [49]:
feature_importance_array_B = np.array(regressor_df_all["feature_importance"].tolist())
importances_B = feature_importance_array_B.mean(axis=0)
plot_feature_importances(importances_B, df_features_all, width=1200, height=800)

Removing features that have little to no contribution

## With only superconductors

In [50]:
df_MP_nonzero = df_MP[df_MP["tc"] != 0]
target_tc_nonzero = df_MP_nonzero["tc"]
df_features_nonzero = df_features_all.loc[df_MP_nonzero.index]
df_features_nonzero.columns

Index(['lata', 'latb', 'latc', 'density', 'efermi', 'energy', 'final_energy',
       'final_energy_per_atom', 'formation_energy_per_atom', 'nsites',
       'total_magnetization', 'cell_volume', 'exchange_symmetry',
       'total_magnetization_normalized_formula_units',
       'true_total_magnetization', 'totreldiff', 'Feature 0', 'Feature 1',
       'Feature 2', 'Feature 3', 'Feature 4', 'Feature 5', 'Feature 6',
       'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10', 'Feature 11',
       'Feature 12', 'Feature 13', 'Feature 14', 'Feature 15', 'Feature 16',
       'Feature 17'],
      dtype='object')

In [52]:
df_features_nonzero = df_features_nonzero.drop(columns=['Feature 5', 'Feature 8', 'Feature 14', 'Feature 11', 'Feature 17', 'Feature 3', 'Feature 2', "nsites"])
df_features_nonzero.columns

Index(['lata', 'latb', 'latc', 'density', 'efermi', 'energy', 'final_energy',
       'final_energy_per_atom', 'formation_energy_per_atom',
       'total_magnetization', 'cell_volume', 'exchange_symmetry',
       'total_magnetization_normalized_formula_units',
       'true_total_magnetization', 'totreldiff', 'Feature 0', 'Feature 1',
       'Feature 4', 'Feature 6', 'Feature 7', 'Feature 9', 'Feature 10',
       'Feature 12', 'Feature 13', 'Feature 15', 'Feature 16'],
      dtype='object')

In [57]:
regressor_df_nonzero = pd.DataFrame(columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"])

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(df_features_nonzero, target_tc_nonzero)
    regressor_df_nonzero = regressor_df_nonzero.append({"model":model,
                                                        "mae": mae ,"mse": mse, "rmse": rmse, "r2": r2, 
                                                        "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"], 
                                                        "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"], 
                                                        "max_features": best_params["max_features"], 
                                                        "feature_importance": feature_importances}, ignore_index=True)

In [58]:
regressor_df_nonzero.sort_values("r2")

Unnamed: 0,model,mae,mse,rmse,r2,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,feature_importance
3,"(DecisionTreeRegressor(max_depth=10, max_featu...",3.891535,48.384999,6.955933,0.869713,50,10.0,5,1,sqrt,"[0.04727381670344991, 0.0787237644851206, 0.12..."
7,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.016574,56.210681,7.497378,0.869772,150,10.0,2,1,log2,"[0.03311353029460513, 0.0596138155467293, 0.12..."
2,"(DecisionTreeRegressor(max_features='sqrt', mi...",4.127664,64.425507,8.02655,0.869891,150,,5,1,sqrt,"[0.034317697481324586, 0.06727333611788594, 0...."
8,"(DecisionTreeRegressor(max_features='sqrt', mi...",3.861345,60.292064,7.764796,0.884063,100,,10,2,sqrt,"[0.036246904675318554, 0.08343185314839134, 0...."
6,"(DecisionTreeRegressor(max_features='sqrt', mi...",4.001274,54.944824,7.412478,0.884937,100,,10,1,sqrt,"[0.02913687997097901, 0.06737529237982169, 0.1..."
9,"(DecisionTreeRegressor(max_depth=10, max_featu...",3.692528,46.868627,6.846067,0.887912,150,10.0,2,1,sqrt,"[0.03491981001764058, 0.05785021594671695, 0.1..."
4,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.238576,57.444916,7.579242,0.889473,150,10.0,2,1,sqrt,"[0.029283000243658854, 0.07051167707848999, 0...."
5,"(DecisionTreeRegressor(max_features='sqrt', mi...",3.58897,44.199788,6.648292,0.892387,50,,10,1,sqrt,"[0.02907555705672398, 0.0611241236781617, 0.12..."
1,"(DecisionTreeRegressor(max_depth=10, max_featu...",4.068621,50.222969,7.086817,0.898166,100,10.0,2,1,sqrt,"[0.033604360209763644, 0.06858216758719633, 0...."
0,"(DecisionTreeRegressor(max_depth=10, max_featu...",3.814775,45.697565,6.759997,0.903577,100,10.0,2,1,sqrt,"[0.024205829220790315, 0.07758766205243094, 0...."


In [59]:
regressor_df_nonzero.describe()

Unnamed: 0,mae,mse,rmse,r2
count,10.0,10.0,10.0,10.0
mean,3.930186,52.869194,7.257755,0.884989
std,0.199512,6.788606,0.464502,0.012001
min,3.58897,44.199788,6.648292,0.869713
25%,3.826418,47.24772,6.873533,0.873434
50%,3.946404,52.583897,7.249647,0.886424
75%,4.055609,57.136358,7.558776,0.891659
max,4.238576,64.425507,8.02655,0.903577


In [92]:
regressor_df_nonzero.mean()

mae                    3.930186
mse                   52.869194
rmse                   7.257755
r2                     0.884989
n_estimators         110.000000
max_depth             10.000000
min_samples_split      5.000000
min_samples_leaf       1.100000
dtype: float64

Average Hyper Paramters:
- n_estimators         110.0
- max_depth             10.0
- min_samples_split      5.0
- min_samples_leaf       1.1
- max_features           sqrt

In [62]:
best_model = regressor_df_nonzero.iloc[regressor_df_all["r2"].idxmax()]["model"]
predicted_tc = best_model.predict(df_features_nonzero)

plot_predicted_vs_actual(target_tc_nonzero, predicted_tc, df_MP_nonzero)

In [93]:
import plotly.express as px

df_MP_nonzero["predicted_tc"] = predicted_tc

unique_categories = df_MP_nonzero["sc_class"].unique()

colors = px.colors.qualitative.G10

color_map = {
    category: colors[i % len(colors)]
    for i, category in enumerate(unique_categories)
}


traces = []
for category, color in color_map.items():
    filtered_df = df_MP_nonzero[df_MP_nonzero["sc_class"] == category]
    trace = go.Scatter(
        x=filtered_df["tc"], 
        y=filtered_df["predicted_tc"], 
        mode='markers', 
        text=filtered_df["formula_sc"], 
        hoverinfo='text',
        marker=dict(
            color=color,
            size=8,  # Adjust marker size as needed
            opacity=0.8  # Adjust marker opacity as needed
        ),
        name=category  # Use category name for legend
    )
    traces.append(trace)

# Add y=x line
traces.append(go.Scatter(
    x=np.linspace(min(predicted_tc), max(predicted_tc), 100),
    y=np.linspace(min(predicted_tc), max(predicted_tc), 100),
    mode='lines',
    name='y=x',
    line=dict(color='red', width=5, dash='dash')
))

# Create layout
layout = go.Layout(
    xaxis=dict(title="Real Value", showline=True, linewidth=2, linecolor='black',
               ticks='inside', tickwidth=2, ticklen=5, range=[min(target_tc)-0.5, max(target_tc)+10]),
    yaxis=dict(title="Predicted Value", showline=True, linewidth=2, linecolor='black',
               ticks='inside', tickwidth=2, ticklen=5, range=[min(predicted_tc)-0.5, max(predicted_tc)+10]),
    width=1000,
    height=800,
    plot_bgcolor='white',
    paper_bgcolor='white', 
    font=dict(family='Helvetica', size=18, color='black'),
    margin=dict(l=10, r=10, b=10, t=10),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)

# Create figure
fig = go.Figure(data=traces, layout=layout)

# Show the plot
fig.show()

In [61]:
feature_importance_array_B = np.array(regressor_df_nonzero["feature_importance"].tolist())
importances_B = feature_importance_array_B.mean(axis=0)
plot_feature_importances(importances_B, df_features_nonzero, width=1200, height=800)