In [None]:
import sys
# Add the parent directory to the module search path so `cleaned_code` can be imported.
sys.path.append('..')
# NOTE(review): the star import hides which names come from `cleaned_code`; helpers that are
# called later but never defined in this notebook presumably originate here — confirm.
from cleaned_code import *

import warnings
# Suppresses every warning for the rest of the session (including deprecation warnings).
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
from pymatgen.io.cif import CifParser

## Functions to run the code

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, r2_score

def RFclassifier(features, target, folds = 5, random_state = None):
    """Fit a random-forest classifier and report hold-out and cross-validated scores.

    Parameters
    ----------
    features : array-like or DataFrame
        Feature matrix, one row per sample.
    target : array-like
        Class label for each row of ``features``.
    folds : int or CV splitter, default 5
        Passed to ``cross_val_score`` as ``cv``.
    random_state : int or None, default None
        Seed forwarded to the train/test split and to the forest. ``None`` keeps the
        original unseeded (non-reproducible) behaviour.

    Returns
    -------
    tuple
        ``(model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error)``.
        ``model`` is fitted on the 80 % training split; accuracy, confusion matrix and
        classification report are computed on the remaining 20 %; the CV statistics come
        from ``cross_val_score`` on the full data set.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state
    )

    # Fixed (untuned) hyperparameters for the classifier.
    model = RandomForestClassifier(
        n_estimators=50,
        max_depth=5,
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=random_state,
    )
    model.fit(x_train, y_train)

    # Hold-out evaluation.
    predictions = model.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    class_report = classification_report(y_test, predictions)

    # cross_val_score clones the estimator, so the fitted `model` above is left untouched.
    cv_scores = cross_val_score(model, features, target, cv=folds)
    mean_cv_score = np.mean(cv_scores)
    # Divide by the number of scores actually returned rather than by `folds`, so the
    # standard error is also correct when `folds` is a splitter object instead of an int.
    standard_error = np.std(cv_scores) / np.sqrt(len(cv_scores))

    return model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error

In [None]:
def RFregressor(features, target, param_grid = None, random_state = None):
    """Grid-search a random-forest regressor and evaluate the best model on a hold-out split.

    Parameters
    ----------
    features : array-like or DataFrame
        Feature matrix, one row per sample.
    target : array-like
        Regression target for each row of ``features``.
    param_grid : dict or None, default None
        Hyperparameter grid for ``GridSearchCV``. ``None`` selects the default grid
        defined below (same values as before; a ``None`` sentinel avoids sharing one
        mutable dict between calls).
    random_state : int or None, default None
        Seed forwarded to the train/test split and to the forest. ``None`` keeps the
        original unseeded (non-reproducible) behaviour.

    Returns
    -------
    tuple
        ``(best_model, best_params, mae, mse, rmse, r2, feature_importances)`` where the
        four metrics are computed on the 20 % hold-out split and ``feature_importances``
        is ``best_model.feature_importances_``.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['log2', 'sqrt']}

    # Hold out 20 % of the data; the grid search only ever sees the training part.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state
    )

    model = RandomForestRegressor(random_state=random_state)

    # 10-fold cross-validated search over the grid, scored by R^2.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring="r2")
    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_

    # best_estimator_ is the model refitted with the winning parameters.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test)
    feature_importances = best_model.feature_importances_

    # Hold-out metrics.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return best_model, best_params, mae, mse, rmse, r2, feature_importances

In [None]:
def plot_feature_importances(importances, features, width=800, height=500):
    """Show a bar chart of feature importances, sorted from largest to smallest.

    Parameters
    ----------
    importances : array-like of float
        One importance per column of ``features``, in column order. Values are treated
        as fractions and labelled as percentages (``0.12`` -> ``12.00%``).
        NOTE(review): assumes fractional importances such as scikit-learn's
        ``feature_importances_`` — confirm for other callers.
    features : pandas.DataFrame
        Only ``features.columns`` is read; underscores in the names are shown as spaces.
    width, height : int
        Figure size in pixels.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Accept lists as well as arrays: the fancy indexing below needs an ndarray.
    importances = np.asarray(importances)
    feature_names = [str(name).replace('_', ' ') for name in features.columns]

    # Indices that order the importances from largest to smallest.
    order = np.argsort(importances)[::-1]
    sorted_importances = importances[order]

    fig = go.Figure(data=go.Bar(
        x=[feature_names[i] for i in order],
        y=sorted_importances,
        marker_color='rgb(33, 145, 140)',
        # '.2%' scales by 100 before appending '%'; the previous '.2f' + '%' label
        # printed the raw fraction with a percent sign (0.12 shown as "0.12%").
        text=[f'{value:.2%}' for value in sorted_importances],
        textposition='auto'
    ))

    fig.update_layout(
        xaxis=dict(
            title="Features",
            showline=True,
            linewidth=5,
            linecolor='black',
            ticks='outside',
            tickson = "boundaries",
            tickwidth=3,
            ticklen=5
        ),
        yaxis=dict(
            title="Feature Importance",
            showline=True,
            linewidth=5,
            linecolor='black',
            ticks='inside',
            tickwidth=3,
            ticklen=5
        ),
        barmode='group',
        width=width,
        height=height,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=24, color='black'),
        margin=dict(l=10, r=10, b=10, t=10),
        showlegend=False,
        )
    fig.show()

In [None]:
def plot_predicted_vs_actual(target_tc, predicted_tc, df):
    """Show a predicted-vs-actual scatter plot, coloured by superconductor class.

    Parameters
    ----------
    target_tc : array-like of float
        Actual values; used for the x-axis range and the y=x reference line.
    predicted_tc : array-like of float
        Predicted values, one per row of ``df`` (same order).
    df : pandas.DataFrame
        Must provide the columns ``tc`` (x values), ``sc_class`` (grouping) and
        ``formula_sc`` (hover text). The frame is NOT modified: the plot works on a
        copy, so it is safe to pass a filtered slice of another frame.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Map raw class labels to display names.
    name_mapping = {
        'Other': 'Other',
        'Not_supercon': 'Not Superconductor',
        'Cuprate': 'Cuprate',
        'Ferrite': 'Ferrite',
        'Heavy_fermion': 'Heavy Fermion',
        'Oxide': 'Oxide',
        'Chevrel': 'Chevrel',
        'Carbon': 'Carbon',
        'Heavy_fermionChevrel': 'Heavy Fermion Chevrel',
        'OxideHeavy_fermion': 'Oxide Heavy Fermion'
    }

    # Labels missing from the mapping keep their raw name instead of becoming NaN;
    # NaN categories match no rows and would silently drop those points from the plot.
    class_names = df['sc_class'].map(name_mapping).fillna(df['sc_class'])

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    plot_df = df.assign(predicted_tc=predicted_tc, sc_class_name=class_names)

    unique_categories = plot_df["sc_class_name"].dropna().unique()

    # Cycle through the palette if there are more classes than colours.
    colors = px.colors.qualitative.G10
    color_map = {
        category: colors[i % len(colors)]
        for i, category in enumerate(unique_categories)
    }

    # One scatter trace per class so each gets its own legend entry.
    traces = []
    for category, color in color_map.items():
        filtered_df = plot_df[plot_df["sc_class_name"] == category]
        traces.append(go.Scatter(
            x=filtered_df["tc"],
            y=filtered_df["predicted_tc"],
            mode='markers',
            text=filtered_df["formula_sc"],
            hoverinfo='text',
            marker=dict(color=color, size=8, opacity=0.8),
            name=category
        ))

    actual_min, actual_max = min(target_tc), max(target_tc)
    predicted_min, predicted_max = min(predicted_tc), max(predicted_tc)

    # y=x reference line covering the union of both ranges, so it does not stop short
    # when the predictions span a narrower interval than the actual values.
    diagonal = np.linspace(min(actual_min, predicted_min), max(actual_max, predicted_max), 100)
    traces.append(go.Scatter(
        x=diagonal,
        y=diagonal,
        mode='lines',
        name='y=x',
        line=dict(color='black', width=5, dash='dash')
    ))

    layout = go.Layout(
        xaxis=dict(title="Real Value", showline=True, linewidth=5, linecolor='black',
                ticks='inside', tickwidth=4, ticklen=5, range=[actual_min-0.5, actual_max+10]),
        yaxis=dict(title="Predicted Value", showline=True, linewidth=5, linecolor='black',
                ticks='inside', tickwidth=4, ticklen=5, range=[predicted_min-0.5, predicted_max+10]),
        width=1200,
        height=800,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=24, color='black'),
        margin=dict(l=10, r=10, b=10, t=10),
        legend=dict(orientation="v", yanchor="bottom", y=0, xanchor="right", x=1.5)
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()

# Data Read In

In [None]:
# Load the dataset; the first line of the file is skipped so the header is read from line 2.
df_MP = pd.read_csv("3DSC_MP.csv", skiprows=1)
# Both replacements are literal substrings, so regex=False is passed explicitly: the default
# for `regex` differs between pandas versions.
df_MP['cif'] = df_MP['cif'].str.replace('data/final/MP/', '', regex=False)
df_MP.columns = df_MP.columns.str.replace('_2', '', regex=False)
df_MP.head()

In [None]:
# Parse each distinct CIF file once and store the average atomic weight of its composition.
average_weight_by_cif = {}
for cif_path in df_MP["cif"].unique():
    parser = CifParser(cif_path)
    structure = parser.get_structures()[0]  # Assuming there's only one structure in the file

    composition = structure.composition

    # `composition.weight` is the TOTAL weight of all atoms in the composition; divide by
    # the atom count to get the per-atom average that the column name promises.
    average_weight_by_cif[cif_path] = float(composition.weight) / composition.num_atoms

# A single vectorised lookup replaces the per-row boolean mask (which was O(n^2)).
df_MP["average_atomic_weight"] = df_MP["cif"].map(average_weight_by_cif)

In [None]:
# Rows with a non-zero tc. `.copy()` makes this an independent frame, so columns added to it
# later do not trigger chained-assignment problems against df_MP.
df_MP_nonzero = df_MP[df_MP["tc"] != 0].copy()

In [None]:
# Regression target: the tc column of the full dataset.
target_tc = df_MP.loc[:, "tc"]

In [None]:
# Columns of df_MP used as the "physical" feature set.
physical_features = [
    'num_elements_sc',
    'lata',
    'latb',
    'latc',
    'density',
    'e_above_hull',
    'efermi',
    'final_energy',
    'final_energy_per_atom',
    'formation_energy_per_atom',
    'nsites',
    'cell_volume',
    'exchange_symmetry',
    'true_total_magnetization',
    'average_atomic_weight',
    'totreldiff',
]

df_features = df_MP.loc[:, physical_features]
df_features.head()

In [None]:
# Forward slash instead of a backslash: "\P" is an invalid escape sequence and the
# backslash form only resolves on Windows. Forward slashes work on every platform.
asymcell_features = np.load("featurised_datasets/PHF_AsymCell.npy")
# NOTE(review): rows are assumed to be in the same order as df_MP — confirm.

df_features_all = df_features.copy()

# Append each column of the loaded array as "Feature <i>" after the physical features.
for i, feature in enumerate(asymcell_features.T):
    df_features_all[f"Feature {i}"] = np.squeeze(feature)

df_features_all

In [None]:
# Every column that follows the physical features, i.e. the "Feature <i>" columns appended
# from the loaded array. Derived from df_features' width instead of a hard-coded 18 so the
# selection stays correct if the number of loaded features changes.
PH_features = df_features_all.iloc[:, df_features.shape[1]:]
PH_features

# Regressor Models

## PHFS only

In [None]:
# Repeat the grid search on 10 random train/test splits and record each run.
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
regressor_records_PHFonly = []

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(PH_features, target_tc)
    regressor_records_PHFonly.append({"model": model,
                                      "mae": mae, "mse": mse, "rmse": rmse, "r2": r2,
                                      "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"],
                                      "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"],
                                      "max_features": best_params["max_features"],
                                      "feature_importance": feature_importances})

regressor_df_PHFonly = pd.DataFrame(
    regressor_records_PHFonly,
    columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"],
)

In [None]:
# Show the runs ordered by hold-out R^2 (ascending, so the best run is last).
regressor_df_PHFonly.sort_values("r2")

In [None]:
# Summary statistics across the repeated runs.
regressor_df_PHFonly.describe()

In [None]:
# Column-wise mean across the repeated runs.
regressor_df_PHFonly.mean()

Average Hyperparameters:
- n_estimators         135.0
- max_depth             20.0
- min_samples_split      2.0
- min_samples_leaf       1.0
- max_features           sqrt

In [None]:
# Pick the run with the highest hold-out R^2. idxmax() returns an index *label*, so it is
# paired with .loc (label-based) rather than .iloc (position-based).
best_model_PHFONLY = regressor_df_PHFonly.loc[regressor_df_PHFonly["r2"].idxmax(), "model"]
# Predictions cover the full dataset, including rows the model was trained on.
predicted_tc = best_model_PHFONLY.predict(PH_features)

plot_predicted_vs_actual(target_tc, predicted_tc, df_MP)

In [None]:
# One row per run, one column per feature; average the importances over the runs.
feature_importance_array = np.array(
    [run_importances for run_importances in regressor_df_PHFonly["feature_importance"]]
)
importances = feature_importance_array.mean(axis=0)
plot_feature_importances(importances, PH_features, width=1000, height=600)

In [None]:
# NOTE(review): `cross_validate_model` is not defined in this notebook; presumably it is
# provided by the star import of `cleaned_code` — confirm.
cross_validate_model(best_model_PHFONLY, PH_features, target_tc)

## Physical Features Only

In [None]:
# Repeat the grid search on 10 random train/test splits and record each run.
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
regressor_records_PF = []

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(df_features, target_tc)
    regressor_records_PF.append({"model": model,
                                 "mae": mae, "mse": mse, "rmse": rmse, "r2": r2,
                                 "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"],
                                 "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"],
                                 "max_features": best_params["max_features"],
                                 "feature_importance": feature_importances})

regressor_df_PF = pd.DataFrame(
    regressor_records_PF,
    columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"],
)

In [None]:
# Show the runs ordered by hold-out R^2 (ascending, so the best run is last).
regressor_df_PF.sort_values("r2")

In [None]:
# Summary statistics across the repeated runs.
regressor_df_PF.describe()

In [None]:
# Column-wise mean across the repeated runs.
regressor_df_PF.mean()

Average Hyperparameters:
- n_estimators         105.0
- max_depth             10.0
- min_samples_split      3.2
- min_samples_leaf       1.4
- max_features           sqrt

In [None]:
# Pick the run with the highest hold-out R^2. idxmax() returns an index *label*, so it is
# paired with .loc (label-based) rather than .iloc (position-based).
best_model = regressor_df_PF.loc[regressor_df_PF["r2"].idxmax(), "model"]
# Predictions cover the full dataset, including rows the model was trained on.
predicted_tc = best_model.predict(df_features)

plot_predicted_vs_actual(target_tc, predicted_tc, df_MP)

In [None]:
# One row per run, one column per feature; average the importances over the runs.
feature_importance_array_A = np.array(
    [run_importances for run_importances in regressor_df_PF["feature_importance"]]
)
importances_A = feature_importance_array_A.mean(axis=0)
plot_feature_importances(importances_A, df_features, width=1000, height=800)

In [None]:
# NOTE(review): `cross_validate_model` is not defined in this notebook; presumably it is
# provided by the star import of `cleaned_code` — confirm.
cross_validate_model(best_model, df_features, target_tc)

## Combined

In [None]:
# Repeat the grid search on 10 random train/test splits and record each run.
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
regressor_records_all = []

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(df_features_all, target_tc)
    regressor_records_all.append({"model": model,
                                  "mae": mae, "mse": mse, "rmse": rmse, "r2": r2,
                                  "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"],
                                  "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"],
                                  "max_features": best_params["max_features"],
                                  "feature_importance": feature_importances})

regressor_df_all = pd.DataFrame(
    regressor_records_all,
    columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"],
)

In [None]:
# Show the runs ordered by hold-out R^2 (ascending, so the best run is last).
regressor_df_all.sort_values("r2")

In [None]:
# Summary statistics across the repeated runs.
regressor_df_all.describe()

In [None]:
# Column-wise mean across the repeated runs.
regressor_df_all.mean()

Average Hyperparameters:
- n_estimators         135.0
- max_depth             11.0
- min_samples_split      2.6
- min_samples_leaf       1.2
- max_features           sqrt

In [None]:
# Pick the run with the highest hold-out R^2. idxmax() returns an index *label*, so it is
# paired with .loc (label-based) rather than .iloc (position-based).
best_model = regressor_df_all.loc[regressor_df_all["r2"].idxmax(), "model"]
# Predictions cover the full dataset, including rows the model was trained on.
predicted_tc = best_model.predict(df_features_all)

plot_predicted_vs_actual(target_tc, predicted_tc, df_MP)

In [None]:
# One row per run, one column per feature; average the importances over the runs.
feature_importance_array_B = np.array(
    [run_importances for run_importances in regressor_df_all["feature_importance"]]
)
importances_B = feature_importance_array_B.mean(axis=0)
plot_feature_importances(importances_B, df_features_all, width=1200, height=800)

Removing features that have little to no contribution

## With only superconductors

In [None]:
# Restrict target and features to the rows kept in df_MP_nonzero (matched by index label).
target_tc_nonzero = df_MP_nonzero.loc[:, "tc"]
df_features_nonzero = df_features_all.loc[df_MP_nonzero.index, :]
df_features_nonzero.columns

In [None]:
# Repeat the grid search on 10 random train/test splits and record each run.
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
regressor_records_nonzero = []

for i in range(10):
    model, best_params, mae, mse, rmse, r2, feature_importances = RFregressor(df_features_nonzero, target_tc_nonzero)
    regressor_records_nonzero.append({"model": model,
                                      "mae": mae, "mse": mse, "rmse": rmse, "r2": r2,
                                      "n_estimators": best_params["n_estimators"], "max_depth": best_params["max_depth"],
                                      "min_samples_split": best_params["min_samples_split"], "min_samples_leaf": best_params["min_samples_leaf"],
                                      "max_features": best_params["max_features"],
                                      "feature_importance": feature_importances})

regressor_df_nonzero = pd.DataFrame(
    regressor_records_nonzero,
    columns=["model", "mae" ,"mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features", "feature_importance"],
)

In [None]:
# Show the runs ordered by hold-out R^2 (ascending, so the best run is last).
regressor_df_nonzero.sort_values("r2")

In [None]:
# Summary statistics across the repeated runs.
regressor_df_nonzero.describe()

In [None]:
# Column-wise mean across the repeated runs.
regressor_df_nonzero.mean()

Average Hyperparameters:
- n_estimators         110.0
- max_depth             10.0
- min_samples_split      5.0
- min_samples_leaf       1.1
- max_features           sqrt

In [None]:
# Pick the best run of THIS experiment. The row was previously chosen with
# regressor_df_all["r2"], i.e. the ranking of a different experiment, so the selected model
# was not necessarily the best superconductor-only run.
# idxmax() returns an index *label*, hence .loc rather than .iloc.
best_model = regressor_df_nonzero.loc[regressor_df_nonzero["r2"].idxmax(), "model"]
# Predictions cover every non-zero-tc row, including rows the model was trained on.
predicted_tc = best_model.predict(df_features_nonzero)

plot_predicted_vs_actual(target_tc_nonzero, predicted_tc, df_MP_nonzero)

In [None]:
# One row per run, one column per feature; average the importances over the runs.
feature_importance_array_B = np.array(
    [run_importances for run_importances in regressor_df_nonzero["feature_importance"]]
)
importances_B = feature_importance_array_B.mean(axis=0)
plot_feature_importances(importances_B, df_features_nonzero, width=1200, height=800)

In [None]:
# NOTE(review): `cross_validate_model` is not defined in this notebook; presumably it is
# provided by the star import of `cleaned_code` — confirm.
cross_validate_model(best_model, df_features_nonzero, target_tc_nonzero)

# Classification Model

In [None]:
# Binary label: rows with tc == 0 are "Not_supercon", everything else is "Supercon".
df_MP["super_or_not"] = np.where(df_MP["tc"].eq(0), "Not_supercon", "Supercon")
# Integer codes of the categorical version of that label.
df_MP["supercon_or_not_cat"] = df_MP["super_or_not"].astype("category").cat.codes
df_MP.loc[:, ["supercon_or_not_cat", "super_or_not", "formula_sc", "tc"]]

In [None]:
# Train and evaluate the superconductor / non-superconductor classifier on all features.
(
    model,
    accuracy,
    conf_matrix,
    class_report,
    mean_cv_score,
    standard_error,
) = RFclassifier(features=df_features_all, target=df_MP["supercon_or_not_cat"])

In [None]:
# Report the classifier metrics, one item per line.
print(
    f"Accuracy: {accuracy}",
    f"Mean CV Score: {mean_cv_score}",
    f"Standard Error: {standard_error}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)