In [1]:
import sys
sys.path.append('..')
from cleaned_code import *
import glob

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def RFclassifier(features, target, folds = 5, test_size=0.2, random_state=None):
    """Fit a random forest on a hold-out split and cross-validate the same configuration.

    Parameters
    ----------
    features : array-like or DataFrame
        Feature matrix; passed to ``train_test_split`` and ``cross_val_score``.
    target : array-like or Series
        Class labels, one per row of ``features``.
    folds : int or cross-validation splitter, default 5
        Forwarded unchanged as ``cv`` to ``cross_val_score``.
    test_size : float or int, default 0.2
        Forwarded as ``test_size`` to ``train_test_split``.
    random_state : int, RandomState or None, default None
        Seed forwarded to both the train/test split and the forest. ``None``
        keeps the previous behaviour (a different split and forest per call).

    Returns
    -------
    tuple
        ``(model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error)``
        where ``model`` is the forest fitted on the training split, the next
        three values are computed on the held-out split, and the last two
        summarise the cross-validation scores obtained on the full data.
        ``standard_error`` is the population standard deviation of the scores
        (NumPy's default ``ddof=0``) divided by the square root of their count.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = RandomForestClassifier(
        n_estimators=50,
        max_depth=5,
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=random_state,
    )
    model.fit(x_train, y_train)

    # Hold-out evaluation on the rows the forest has not seen.
    predictions = model.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    class_report = classification_report(y_test, predictions)

    # Cross-validation runs on the full data, independently of the split above;
    # cross_val_score fits clones, so the returned `model` is left as fitted.
    cv_scores = cross_val_score(model, features, target, cv=folds)
    mean_cv_score = np.mean(cv_scores)
    standard_deviation_cv_scores = np.std(cv_scores)
    # Divide by the number of scores actually returned: `folds` may be a
    # splitter object or an iterable of splits rather than an integer.
    standard_error = standard_deviation_cv_scores / np.sqrt(len(cv_scores))

    return model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error

Load each dataset, clean it, and fit the model for analysis.

In [3]:
# Run the classifier repeatedly on every dataset and collect one record per run.
N_REPEATS = 100   # each dataset is modelled this many times; the runs are averaged later
RANDOM_SEED = 0   # seeds NumPy's global RNG, which scikit-learn uses when random_state is None

np.random.seed(RANDOM_SEED)

RESULT_COLUMNS = ['Length', 'Size', "Accuracy", "Cross val mean", "Cross val error"]
records = []

# sorted() so the files (and therefore the random stream) are consumed in the
# same order on every run; glob alone returns them in arbitrary order.
for file_path in sorted(glob.glob('dummy_datasets/*.csv')):
    # The unit cell size and dataset size are read from the underscore-separated
    # file name: third-from-last token and last token (extension stripped).
    name_parts = file_path.split('_')
    unit_cell_size = name_parts[-3]
    dataset_size = name_parts[-1].split('.')[0]

    dataset = pd.read_csv(file_path)

    # Everything from the eighth column onwards is used as model input.
    features = dataset[dataset.columns[7:]]
    target = dataset["structure type"].astype('category').cat.codes

    for run in range(N_REPEATS):
        model, accuracy, conf_matrix, class_report, mean_cv_score, standard_error = RFclassifier(features, target)

        # Appending to a list and building the frame once avoids
        # DataFrame.append, which was removed in pandas 2.0 and copied the
        # whole frame on every call.
        records.append({'Length': unit_cell_size, 'Size': dataset_size, "Accuracy": accuracy,
                        "Cross val mean": mean_cv_score, "Cross val error": standard_error})

data_df = pd.DataFrame(records, columns=RESULT_COLUMNS)
    

In [4]:
# Display the collected results: one row per model run.
data_df

Unnamed: 0,Length,Size,Accuracy,Cross val mean,Cross val error
0,2,14000,0.825714,0.832000,0.001013
1,2,14000,0.842857,0.828071,0.003033
2,2,14000,0.829643,0.825571,0.004698
3,2,14000,0.821786,0.831000,0.002363
4,2,14000,0.842500,0.827571,0.002860
...,...,...,...,...,...
1495,4,7000,0.829286,0.828857,0.002700
1496,4,7000,0.837143,0.832571,0.001287
1497,4,7000,0.837143,0.828857,0.001745
1498,4,7000,0.825714,0.827571,0.002438


In [5]:
def _summarise_runs(runs, statistic):
    """Aggregate the per-run metrics for every (Size, Length) combination.

    Parameters
    ----------
    runs : DataFrame
        One row per model run with ``Size`` and ``Length`` key columns.
    statistic : str
        Name of the GroupBy aggregation to apply, e.g. ``"mean"`` or ``"std"``.

    Returns
    -------
    DataFrame
        One row per (Size, Length) pair, with integer keys, ordered by Size and
        then Length, and a fresh 0..n-1 index.
    """
    return (
        runs
        # Cast the keys first so groups are ordered numerically, not as strings.
        .astype({"Size": int, "Length": int})
        .groupby(["Size", "Length"])
        .agg(statistic)
        .reset_index()
        # Sort on both keys: the plots derive trace order (and colour) from the
        # order in which Length values appear, so it must not depend on how the
        # sort algorithm breaks ties between equal sizes.
        .sort_values(["Size", "Length"])
        .reset_index(drop=True)
    )


mean_grouped_data = _summarise_runs(data_df, "mean")
std_grouped_data = _summarise_runs(data_df, "std")

In [6]:
import plotly.graph_objs as go

# Grouped bar chart of the averaged cross-validation score per dataset size:
# one trace per unit-cell length, error bars = standard deviation across runs.
traces = []
colors = {0: 'rgb(53, 183, 121)', 1: 'rgb(49, 104, 142)', 2: 'rgb(72, 28, 110)'}  # Specify the colors you want

for i, length in enumerate(mean_grouped_data['Length'].unique()):
    mean_filtered_data = mean_grouped_data[mean_grouped_data['Length'] == length]
    std_filtered_data = std_grouped_data[std_grouped_data['Length'] == length]

    # Look the spread up by dataset size so every error bar is paired with its
    # own bar even if the two summary frames are not in the same row order.
    # (The averaged per-run standard error, mean_filtered_data['Cross val error'],
    # can be used here instead of the standard deviation.)
    spread = (
        std_filtered_data
        .set_index('Size')['Cross val mean']
        .reindex(mean_filtered_data['Size'])
    )

    number = mean_filtered_data['Size'].astype(str)
    trace = go.Bar(
        x=number,
        y=mean_filtered_data['Cross val mean'],
        name=f'Size: {length}',
        error_y=dict(type='data', array=spread.to_numpy(), visible=True),
        # Cycle through the palette so a fourth length does not raise KeyError.
        marker=dict(color=colors[i % len(colors)]),
    )
    traces.append(trace)

# Line and tick styling shared by both axes.
axis_style = dict(showline=True, linewidth=2, linecolor='black', tickwidth=2, ticklen=5)

layout = go.Layout(
    xaxis=dict(title="Dataset Size", ticks='outside', tickson="boundaries", **axis_style),
    yaxis=dict(title="Average Cross Val Mean", ticks='inside', **axis_style),
    barmode='group',
    width=800,
    height=500,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Helvetica', size=16, color='black'),
    margin=dict(l=10, r=10, b=10, t=10),
    legend=dict(
        title=dict(
            text="Unit Cell Size",
            font=dict(family='Helvetica', size=16, color='black'),
        )
    ),
)

# Assemble and render the figure.
fig = go.Figure(data=traces, layout=layout)
fig.show()


Neither of these factors changes the outcome drastically, but the larger the dataset size, the more accurate the cross-validation score is. Additionally, these high cross-validation scores suggest that the model is not overfitting and is producing very good results.

This is a much better representation of how well PHFs can work in a model. Our superconductor dataset contains around 3500 entries, and since the unit cell size does not drastically change the results, I will only test the asymmetric unit cell against unit cell size 2.

In [7]:
# Grouped bar chart of the averaged hold-out accuracy per dataset size:
# one trace per unit-cell length, error bars = standard deviation across runs.
traces = []
colors = {0: 'rgb(53, 183, 121)', 1: 'rgb(49, 104, 142)', 2: 'rgb(72, 28, 110)'}  # Specify the colors you want

for i, length in enumerate(mean_grouped_data['Length'].unique()):
    mean_filtered_data = mean_grouped_data[mean_grouped_data['Length'] == length]
    std_filtered_data = std_grouped_data[std_grouped_data['Length'] == length]

    # Look the spread up by dataset size so every error bar is paired with its
    # own bar even if the two summary frames are not in the same row order.
    # (The averaged per-run standard error, mean_filtered_data['Cross val error'],
    # can be used here instead of the standard deviation.)
    accuracy_spread = (
        std_filtered_data
        .set_index('Size')['Accuracy']
        .reindex(mean_filtered_data['Size'])
    )

    number = mean_filtered_data['Size'].astype(str)
    trace = go.Bar(
        x=number,
        y=mean_filtered_data['Accuracy'],
        name=f'Size: {length}',
        error_y=dict(type='data', array=accuracy_spread.to_numpy(), visible=True),
        # Cycle through the palette so a fourth length does not raise KeyError.
        marker=dict(color=colors[i % len(colors)]),
    )
    traces.append(trace)

# Line and tick styling shared by both axes.
axis_style = dict(showline=True, linewidth=2, linecolor='black', tickwidth=2, ticklen=5)

layout = go.Layout(
    xaxis=dict(title="Dataset Size", ticks='outside', tickson="boundaries", **axis_style),
    yaxis=dict(title="Average Accuracy", ticks='inside', **axis_style),
    barmode='group',
    width=800,
    height=500,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Helvetica', size=16, color='black'),
    margin=dict(l=10, r=10, b=10, t=10),
    legend=dict(
        title=dict(
            text="Unit Cell Size",
            font=dict(family='Helvetica', size=16, color='black'),
        )
    ),
)

# Assemble and render the figure.
fig = go.Figure(data=traces, layout=layout)
fig.show()

The smaller dataset size of 700 shows a lot of variability, but the unit cell size still makes no difference.

For dataset sizes above 3500, the accuracy is roughly the same.