with physical features. without, asymmetric. 2 unit cells wide, feature importance, without PHFs included

Regressor and Classifier

In [None]:
import sys
sys.path.append('..')
from cleaned_code import *

import warnings
warnings.filterwarnings("ignore")

: 

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, r2_score


def RFclassifier(features, target, folds = 5):
    """Fit a fixed-hyperparameter random forest classifier and score it.

    The data is split 80/20 into train and test parts, the forest is fitted
    on the training part and evaluated on the held-out part, and the same
    estimator is then cross-validated on the complete ``features``/``target``.

    Parameters
    ----------
    features, target
        Inputs and labels, passed straight to ``train_test_split`` and
        ``cross_val_score``.
    folds : int
        Value handed to ``cross_val_score`` as ``cv`` and used as the sample
        count when computing the standard error.

    Returns
    -------
    tuple
        ``(model, accuracy, conf_matrix, class_report, mean_cv_score,
        standard_error)`` where ``model`` is the forest fitted on the
        training split.
    """
    # Hold out 20 % of the rows for the single train/test evaluation.
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

    forest_settings = dict(n_estimators=50, max_depth=5, min_samples_leaf=2, min_samples_split=2)
    model = RandomForestClassifier(**forest_settings)
    model.fit(x_train, y_train)

    # Held-out metrics.
    held_out_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, held_out_pred)
    conf_matrix = confusion_matrix(y_test, held_out_pred)
    class_report = classification_report(y_test, held_out_pred)

    # Cross-validation over the full data set; the standard error is the
    # spread of the fold scores divided by sqrt(folds).
    fold_scores = cross_val_score(model, features, target, cv=folds)
    mean_cv_score = np.mean(fold_scores)
    standard_error = np.std(fold_scores) / np.sqrt(folds)

    return model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error

def RFregressor(features, target, param_grid=None):
    """Grid-search a random forest regressor and report held-out metrics.

    The data is split 80/20; ``GridSearchCV`` (5-fold) is run on the training
    part only, and the estimator it exposes as ``best_estimator_`` is scored
    on the held-out part. The best parameters and the metrics are printed.

    Parameters
    ----------
    features, target
        Inputs and regression target, passed to ``train_test_split``.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. When omitted, the default grid over
        ``n_estimators``, ``max_depth``, ``min_samples_split``,
        ``min_samples_leaf`` and ``max_features`` defined below is used.

    Returns
    -------
    tuple
        ``(model, best_params, mae, mse, rmse, r2)``. ``model`` is the tuned
        estimator selected by the grid search (previously the untouched,
        unfitted base regressor was returned, so callers that went on to fit
        it were silently using default hyperparameters).
    """
    # A None sentinel avoids sharing one mutable dict between all calls.
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['log2', 'sqrt'],
        }

    # Split data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

    # Search the grid using the training split only, so the test split stays unseen.
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5)
    grid_search.fit(x_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

    # Use the best model for prediction
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Print evaluation metrics
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R2) Score:", r2)

    # Hand back the tuned estimator, not the unfitted template.
    return best_model, best_params, mae, mse, rmse, r2

: 

# Data Work

Getting the data here and cleaning it. The `_2` suffix comes from the fact that the CID is also a dataset, but we're only looking at the MP dataset, as it's open source and a bit smaller and so easier to manage.

In [None]:
import pandas as pd

# Read the MP table, skipping the first line of the CSV file.
df_MP = pd.read_csv("3DSC_MP.csv", skiprows=1)

# Remove every occurrence of the directory text from the cif entries.
df_MP = df_MP.assign(cif=df_MP['cif'].str.replace('data/final/MP/', ''))

# Remove every occurrence of '_2' from the column names.
df_MP.columns = df_MP.columns.str.replace('_2', '')

df_MP.head()

: 

In [None]:
# Work on a copy so that df_MP itself does not gain the feature columns.
df_MP_asymcell = df_MP.copy()

# Forward slash instead of a backslash: "\P" is an invalid escape sequence in
# a normal string literal, and a backslash separator only resolves on Windows.
asymcell_features = np.load("featurised_datasets/PHF_AsymCell.npy")

# One new column, "Feature <i>", per entry along the transposed first axis.
# NOTE(review): assumes the rows of the array line up with the rows of df_MP
# -- confirm against the featurisation step.
for i, feature in enumerate(asymcell_features.T):
    df_MP_asymcell[f"Feature {i}"] = np.squeeze(feature)

df_MP_asymcell

: 

In [None]:
# Walk over every column of df_MP that is neither object- nor bool-typed,
# print its dtype, and remember the ones holding at least one missing value.
NA_values_columns = []
for column in df_MP.columns:
    series = df_MP[column]
    if series.dtype == 'object' or series.dtype == 'bool':
        continue
    print(f"Column '{column}' has type: {series.dtype}")
    if series.isna().any():
        NA_values_columns.append(column)


: 

In [None]:
# Show which of the scanned columns contain missing values.
print(NA_values_columns)

: 

So there are no materials in here that need to be excluded

In [None]:
# Print the value counts of every bool-typed column, each followed by a blank line.
bool_columns = [column for column in df_MP.columns if df_MP[column].dtype == 'bool']
for column in bool_columns:
    print(df_MP[column].value_counts(), "\n")

: 

In [None]:
# List the names of the object-typed columns of df_MP.
for column in df_MP.columns:
    if df_MP[column].dtype != 'object':
        continue
    print(f"Column {column}")

: 

In [None]:
# Count how many rows fall under each magnetic_type value.
df_MP["magnetic_type"].value_counts()

: 

Interesting, but does not show me that I need to remove anything from this dataset

In [None]:
# Regression target: the "tc" column.
target = df_MP_asymcell.loc[:, "tc"]

# The PHF inputs are taken positionally as the trailing 18 columns of the frame.
PHF_COUNT = 18
PHFs = df_MP_asymcell.iloc[:, -PHF_COUNT:]
PHFs

: 

In [None]:
# Names of the df_MP_asymcell columns used as the "physical feature" inputs,
# kept in the original order.
physical_features = [
    "lata",
    "latb",
    "latc",
    "cell_volume",
    "band_gap",
    "density",
    "weight",
    "e_above_hull",
    "efermi",
    "encut",
    "energy",
    "energy_per_atom",
    "final_energy",
    "final_energy_per_atom",
    "formation_energy_per_atom",
    "nsites",
    "total_magnetization",
    "exchange_symmetry",
    "num_unique_magnetic_sites",
    "total_magnetization_normalized_vol",
    "total_magnetization_normalized_formula_units",
    "num_magnetic_sites",
    "true_total_magnetization",
]

: 

In [None]:
def corr_matrix(dataframe, features):
    """Show a heatmap of the correlations between ``features`` and "tc".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every column named in ``features`` and a "tc" column.
    features : iterable of str
        Column names to correlate. The argument is left untouched.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Build a fresh list instead of appending to the caller's list: mutating
    # the argument would leave "tc" inside the caller's feature list, where it
    # could later be mistaken for a model input. "tc" is placed last, once.
    columns = [name for name in features if name != "tc"] + ["tc"]

    # Compute the correlation matrix
    correlation_matrix = dataframe[columns].corr()

    # Get the correlation values
    correlation_values = correlation_matrix.values.round(2)

    # Create the heatmap trace
    heatmap = go.Heatmap(
        z=correlation_values,
        x=correlation_matrix.columns,
        y=correlation_matrix.index,
        colorscale='Viridis',
        # Nested title.side replaces the older flat ``titleside`` attribute.
        colorbar=dict(
            title=dict(text='Pearson Coefficient', side='right'),
            tickvals=[-1, -0.5, 0, 0.5, 1],
            ticktext=[-1, -0.5, 0, 0.5, 1],
        ),
        text=correlation_values,
        texttemplate="%{text}",
        textfont={"size": 8}
    )

    # Both axes share the same line and tick styling.
    axis_style = dict(
        title="",
        showline=True,
        linewidth=2,
        linecolor='black',
        ticks='inside',
        tickwidth=2,
        ticklen=5
    )

    # Create the layout
    layout = go.Layout(
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
        width=1200,
        height=1000,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=16, color='black'),
        margin=dict(l=5, r=5, b=5, t=10)
    )

    # Create the figure
    fig = go.Figure(data=[heatmap], layout=layout)

    # Show the figure
    fig.show()
    
# Correlation heatmap for the physical feature columns together with tc.
corr_matrix(df_MP_asymcell, physical_features)

: 

In [None]:
# Correlation heatmap for the PHF columns together with tc.
corr_matrix(df_MP_asymcell, PHFs.columns.tolist())

: 

Not sure if these correlation matrices are showing anything particularly significant; I think a feature importance will be better.

In [None]:
# One row per RFregressor run on the PHF inputs: held-out metrics plus the
# hyperparameters chosen by the grid search.
# Rows are collected in a list and the frame is built once at the end,
# because DataFrame.append was removed in pandas 2.0.
regressor_columns = ["mae", "mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"]
regressor_rows = []

for i in range(1):
    model, best_params, mae, mse, rmse, r2 = RFregressor(PHFs, target)
    regressor_rows.append({
        "mae": mae,
        "mse": mse,
        "rmse": rmse,
        "r2": r2,
        "n_estimators": best_params["n_estimators"],
        "max_depth": best_params["max_depth"],
        "min_samples_split": best_params["min_samples_split"],
        "min_samples_leaf": best_params["min_samples_leaf"],
        "max_features": best_params["max_features"],
    })

regressor_df = pd.DataFrame(regressor_rows, columns=regressor_columns)

: 

In [None]:
# Display the collected metrics and selected hyperparameters.
regressor_df

: 

In [None]:
# Summary statistics over the collected runs.
regressor_df.describe()

: 

In [None]:
# Mean of the n_estimators values selected across the runs.
regressor_df["n_estimators"].mean()

: 

### Hyperparameters should be:
- n_estimators = 100
- max_depth = 20
- min_samples_split = 2
- min_samples_leaf = 1
- max_features = sqrt

In [None]:
import plotly.graph_objects as go

# Fit `model` on all PHF rows and read back its feature_importances_.
model.fit(PHFs, target)
importances = model.feature_importances_

# Column names in the same order as the importances.
feature_names = PHFs.columns.tolist()

# Positions of the importances, largest first.
indices = np.argsort(importances)[::-1]

# Bar chart with the features ordered by decreasing importance.
ranked_names = [feature_names[i] for i in indices]
fig = go.Figure(data=go.Bar(
    x=ranked_names,
    y=importances[indices],
    marker_color='rgb(49, 104, 142)'
))

# Styling shared by both axes and by both text elements.
axis_style = dict(showline=True, linewidth=2, linecolor='black', tickwidth=2, ticklen=5)
text_font = dict(family='Helvetica', size=16, color='black')

fig.update_layout(
    xaxis=dict(axis_style, title="Features", ticks='outside', tickson="boundaries"),
    yaxis=dict(axis_style, title="Feature Importance", ticks='inside'),
    barmode='group',
    width=800,
    height=500,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(text_font),
    margin=dict(l=10, r=10, b=10, t=10),
    legend=dict(title=dict(text="Unit Cell Size", font=dict(text_font)))
)

# Show the plot
fig.show()

: 

In [None]:
# Join the PHF columns with the physical feature columns.
# "tc" is the regression target, so it is filtered out of the inputs here:
# if it has ended up in physical_features, training on it would leak the
# answer into the model and inflate every score.
physical_inputs = [name for name in physical_features if name != "tc"]
combined_features = pd.concat([PHFs, df_MP_asymcell[physical_inputs]], axis=1)

: 

In [None]:
# One row per RFregressor run on the combined inputs: held-out metrics plus
# the hyperparameters chosen by the grid search.
# Rows are collected in a list and the frame is built once at the end,
# because DataFrame.append was removed in pandas 2.0.
regressor_columns = ["mae", "mse", "rmse", "r2", "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"]
regressor_rows = []

for i in range(1):
    model, best_params, mae, mse, rmse, r2 = RFregressor(combined_features, target)
    regressor_rows.append({
        "mae": mae,
        "mse": mse,
        "rmse": rmse,
        "r2": r2,
        "n_estimators": best_params["n_estimators"],
        "max_depth": best_params["max_depth"],
        "min_samples_split": best_params["min_samples_split"],
        "min_samples_leaf": best_params["min_samples_leaf"],
        "max_features": best_params["max_features"],
    })

regressor_df = pd.DataFrame(regressor_rows, columns=regressor_columns)

: 