### RFC (random forest classifier) notebook: SVC cross-validation and RFC predictions of artefact source

### import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
import matplotlib.pyplot

# Show every row and column when displaying DataFrames.
# Use the fully qualified option names: the short patterns 'max.rows' /
# 'max.columns' are treated as regular expressions and, on recent pandas,
# also match the 'styler.render.*' options, which raises an OptionError.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC

%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [2]:
# Restore the variables that were persisted with %store in another notebook.
%store -r train_data
%store -r test_data
%store -r my_data
%store -r uniques


In [3]:
# When True, the plotting cells below also write their figures to PNG files.
save_plots = False

In [4]:
# Names of the columns used as features: position 9 up to, but excluding, the last column.
train_data.columns[9:-1].values

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232', 'U238'], dtype=object)

### Turn feature data and class to be predicted into numpy arrays

In [5]:
# Feature matrix: the columns from position 9 up to, but excluding, the last column.
feature_columns = train_data.columns.values[9:-1]
X = np.array(train_data.loc[:, feature_columns])

# Target vector: the 'class' column.
y = np.array(train_data.loc[:, 'class'])

### Carry out 10-fold stratified cross-validation; per-class F1 scores, weighted-average F1 scores and accuracy are calculated for each fold

In [None]:
# 10-fold stratified cross-validation of an SVC on (X, y).
# For every fold this records:
#   * f1_dict['round<k>']  - array of per-class F1 scores
#   * macro_f1_scores      - F1 score with average='weighted'
#   * accuracy_scores      - plain accuracy
svc = SVC(verbose=3)
skf = StratifiedKFold(n_splits=10)

# Score every fold against the same sorted set of class labels. Without an
# explicit `labels=` argument, f1_score(average=None) only reports classes
# that occur in that fold's y_test / y_pred, so the per-fold arrays can have
# different lengths and no longer line up when combined into a DataFrame.
class_labels = np.unique(y)

class_f1_scores = []
macro_f1_scores = []
accuracy_scores = []
feat_imp = []  # kept for compatibility; nothing in this cell populates it
f1_dict = {}

count = 0
for count, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print('making model:')
    key = 'round' + str(count)
    print(count)

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # The same estimator object is refit from scratch on each fold.
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)

    class_f1_scores = f1_score(y_test, y_pred, labels=class_labels, average=None)
    f1_dict[key] = class_f1_scores

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    macro_f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
   

making model:
1
[LibSVM]

In [None]:
# One column per cross-validation round, one row per entry of the per-class F1 arrays.
f1_df = pd.DataFrame(f1_dict)


In [None]:
# Print how many per-class scores each round produced.
for fold_scores in f1_dict.values():
    print(len(fold_scores))

### Below are the encodings for the class variable

In [None]:
# Encoded class values found in the training data, followed by the entries of `uniques`.
encoded_classes = train_data['class'].unique()
print(encoded_classes)
print([*uniques])

In [None]:
# Append the entries of `uniques` as an extra column (named 0) next to the per-round scores.
class_name_column = pd.Series(uniques)
f1_df_final = pd.concat([f1_df, class_name_column], axis=1)

In [None]:
# Name the appended column 'class' and make it the index.
f1_df_final = (
    f1_df_final
    .rename(columns={0: 'class'})
    .set_index('class', drop=True)
)

### Boxplot showing the distribution of class f1 scores from 10 models

In [None]:
# One box per class, built from that class's F1 scores across the rounds.
title_font = {'fontsize': 14}
axis_font = {'fontsize': 11}

sns.set(rc={'figure.figsize': (11.7, 8.27)})
plot = sns.boxplot(data=f1_df_final.transpose())
plot.set_title('F1 scores for each site', fontdict=title_font)
plot.set_ylabel('F1 score', fontdict=axis_font)
plot.set_xlabel("Bedrock site or superficial deposit 'region'", fontdict=axis_font)

# Optionally write the figure to disk.
if save_plots == True:
    fig = plot.get_figure()
    fig.savefig('site_specific_f1_scores.png')

### Boxplot showing the macro F1 score with weighted averages

In [None]:
# Distribution of the per-fold F1 scores computed with average='weighted'.
# The scores are passed explicitly as `x`: a positional vector is bound to
# `data` in recent seaborn releases and triggered a FutureWarning in older
# ones. `x=` keeps the box horizontal so it matches the x-axis label.
plot = sns.boxplot(x=macro_f1_scores)
plot.set_title('Average-weighted macro-f1 score', fontdict={'fontsize': 14})
plot.set_xlabel("F1-score", fontdict={'fontsize': 11})

# Optionally write the figure to disk.
if save_plots:
    fig = plot.get_figure()
    fig.savefig('macro_f1_scores.png')

### Boxplot showing accuracy scores

In [None]:
# Distribution of the per-fold accuracy scores.
# Passed as `x=` so the call behaves the same across seaborn versions
# (a positional vector is interpreted as `data` in recent releases).
plot = sns.boxplot(x=accuracy_scores)
plot.set_title('Accuracy scores', fontdict={'fontsize': 14})
plot.set_xlabel('Accuracy', fontdict={'fontsize': 11});

### Get feature importances

In [None]:
# Build a DataFrame from the feature importance dictionary and preview it.
# NOTE(review): `feat_imp_dict` is not assigned in any of the visible cells above
# (the cross-validation cell only creates an empty `feat_imp` list) - confirm where
# it is meant to come from before running this section.
feat_imp_df = pd.DataFrame(data = feat_imp_dict)
feat_imp_df.head()

In [None]:
# Attach the feature (element) names - columns 9 up to, but excluding, the last
# column of my_data - as a column called 'element'.
element_names = pd.Series(my_data.columns.values[9:-1])
feat_imp_df_final = pd.concat([feat_imp_df, element_names], axis=1)
feat_imp_df_final = feat_imp_df_final.rename(columns={0: 'element'})
feat_imp_df_final.head()

In [None]:
# Index the importance table by element name.
feat_imp_df_final = feat_imp_df_final.set_index('element')


In [None]:
# Transpose so that each element becomes a column (one box per element when plotted).
feat_imp_df_final_plot = feat_imp_df_final.transpose()

In [None]:
# Mean importance of every element, in column order.
elements = feat_imp_df_final_plot.columns.values
mean_feature_importance = [
    feat_imp_df_final_plot[element].mean() for element in list(elements)
]
    

In [None]:
# Two-column frame: element names (column 0) and their mean importance (column 1).
element_series = pd.Series(elements)
importance_series = pd.Series(mean_feature_importance)
mean_feature_importance_df = pd.concat([element_series, importance_series], axis=1)

In [None]:
# Give the two positional columns descriptive names.
mean_feature_importance_df = mean_feature_importance_df.rename(
    columns={0: 'elements', 1: 'mean_importance'}
)

In [None]:
# Highest mean importance first.
mean_feature_importance_df = mean_feature_importance_df.sort_values(
    by='mean_importance', ascending=False
)

In [None]:
# Element names ordered by decreasing mean importance.
ordered_col_names = [name for name in mean_feature_importance_df['elements']]

In [None]:
# Boxplot of feature importances, one box per element, ordered by mean importance.
# The style is set in the same call as the figure size: calling sns.set(rc=...)
# after sns.set_style("whitegrid") resets the style to seaborn's default and
# silently discards the whitegrid setting.
sns.set(style="whitegrid", rc={'figure.figsize': (20, 20)})

plot = sns.boxplot(data=feat_imp_df_final_plot[ordered_col_names])
plot.set_xticklabels(plot.get_xticklabels(), rotation=90, ha='left')
plot.set_title('Feature (element) importance', fontdict={'fontsize': 20})
plot.set_ylabel('Feature importance', fontdict={'fontsize': 15})
plot.set_xlabel("Element", fontdict={'fontsize': 15})

# Optionally write the figure to disk.
if save_plots:
    fig = plot.get_figure()
    fig.savefig('feature_importances.png')

### Model is built for predicting source of artefacts 

In [None]:
# Final random forest used to predict the artefacts' source.
# A fixed random_state makes the fitted forest, and therefore the predictions
# and probabilities derived from it, reproducible across kernel restarts.
RFC_final = RandomForestClassifier(n_estimators=800, random_state=42)

In [None]:
# Fit the final model on the complete X / y arrays (no held-out fold).
RFC_final.fit(X, y)

In [None]:
# Encoded class values in the training data, followed by `uniques` itself.
encoded_classes = train_data['class'].unique()
print(encoded_classes)
print(uniques)

In [None]:
# Work on a deep copy of the test data and pull out the 'Analysis' column as identifiers.
df_for_identifiers = test_data.copy(deep=True)
identifiers = df_for_identifiers.loc[:, 'Analysis']

### Predictions are made for the artefacts

In [None]:
# Select the artefact features by the names of the training feature columns
# rather than by position. The training matrix is built from columns 9:-1 of
# train_data; a different positional slice on test_data only matches when the
# two frames happen to be laid out differently, whereas selecting by name
# guarantees the same features in the same order (and fails loudly with a
# KeyError if one is missing).
feature_columns = train_data.columns.values[9:-1]
X_artefacts = np.array(test_data[feature_columns])

# Predicted class codes and per-class probabilities for every artefact.
y_pred = RFC_final.predict(X_artefacts)

y_pred_proba = RFC_final.predict_proba(X_artefacts)


In [None]:
# Label the probability columns with the names of the classes the model was
# actually fitted on. predict_proba returns one column per entry of
# RFC_final.classes_ (in that order), which is not necessarily every entry
# of `uniques`; the class codes are used as positions into `uniques`.
proba_class_names = np.asarray(uniques)[RFC_final.classes_]
probabilities_df = pd.DataFrame(data=y_pred_proba, columns=proba_class_names)

# Append the artefact identifiers as an extra (unnamed, i.e. 0) column.
probabilities_df_final = pd.concat([probabilities_df, pd.Series(list(identifiers))], axis=1)

In [None]:
# Name the appended identifier column.
probabilities_df_final = probabilities_df_final.rename(columns={0: 'identifier'})

In [None]:
# Put the predicted class codes (column 0) in front of the probabilities and identifiers.
predicted_codes = pd.Series(y_pred)
final_pred_df = pd.concat([predicted_codes, probabilities_df_final], axis=1)

In [None]:
# Name the predicted-code column.
final_pred_df = final_pred_df.rename(columns={0: 'class_number'})

In [None]:
# Write the prediction table to 'predictions.csv' in the working directory.
# NOTE(review): this runs before the 'class_predictions' column is added in a
# later cell, so the file does not contain that column.
final_pred_df.to_csv('predictions.csv')


In [None]:
uniques_list = list(uniques)


def get_pred_names(row):
    """Return the class name for the class code stored in ``row['class_number']``.

    The code is cast to ``int`` so the lookup also works when the row has been
    upcast to a float dtype.
    """
    return uniques_list[int(row['class_number'])]


# Look the names up from the single 'class_number' column instead of applying
# a function to every full row of the frame: it avoids building a Series per
# row and does not depend on the dtypes of the other columns.
final_pred_df['class_predictions'] = [
    uniques_list[int(code)] for code in final_pred_df['class_number']
]

In [None]:
#final_pred_df_modal = final_pred_df.groupby(by = 'class_number')

In [None]:
# Display the final prediction table (the pandas options set at the top of the
# notebook remove the row/column display limits, so the full frame is rendered).
final_pred_df