### RFC

### import modules and configure notebook

In [53]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
import matplotlib.pyplot
import pickle

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [87]:
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats


In [82]:
save_plots = False
random_seed_state = 42
classify_bedrock_only = True
pickle_model = False
pickle_model_name = 'grouped'

### I label encode the class column again because just bedrock is being classified and when class was label encoded in data reproccessing script label encoding was done for both bedrock sites and superficial regions

In [89]:
if classify_bedrock_only:
    
    test_data_superficial = train_data_formodel[train_data_formodel['Geology']=='Superficial']
    train_data_formodel = train_data_formodel[train_data_formodel['Geology']=='Bedrock']
    train_data_formodel['class'], uniques = pd.factorize(train_data_formodel['class'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [88]:
train_data_formodel.head()

Unnamed: 0,Analysis,Geology,Province,Region,Site,SubSite,Formation,Band,Nodule,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,Sc45,Ti47,V51,Cr52,Mn55,Fe56,Co59,Ni60,Cu63,Zn68,Ga69,Ge72,As75,Rb85,Sr88,Y89,Zr90,Nb93,Mo95,Cd111,In115,Sn118,Cs133,Ba137,La139,Ce140,Pr141,Nd146,Sm147,Eu153,Gd157,Tb159,Dy163,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238,class
0,10_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,15.63,0.12,48.36,154.63,943.71,464944.18,50.28,538.57,455.94,712.39,0.42,15.58,0.27,3.3,0.69,8.46,0.05,0.8,1.62,10.82,0.25,1.22,0.16,0.43,12.94,0.88,1.51,0.09,0.05,0.02,0.0,0.05,0.01,6.54,0.84,0.95,0.23,0.87,0.16,0.04,0.16,0.02,0.11,0.03,0.06,0.01,0.02,0.0,0.04,0.01,0.24,0.07,0.05,FH
1,11_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,11.5,0.09,44.77,22.42,1077.11,465010.94,70.91,438.2,387.82,515.24,0.44,18.47,0.29,3.45,1.01,11.59,0.11,0.36,0.53,8.93,0.34,0.85,0.1,0.45,13.22,0.95,1.74,0.07,0.01,0.02,0.0,0.04,0.02,8.04,0.92,1.01,0.23,0.98,0.18,0.04,0.18,0.02,0.13,0.03,0.06,0.01,0.04,0.01,0.05,0.0,0.07,0.08,0.04,FH
2,12_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,20.05,0.06,44.88,42.7,620.21,465295.41,104.47,372.66,363.71,957.89,0.76,19.89,0.55,3.25,1.21,87.99,0.21,1.68,1.53,11.98,0.25,1.71,0.13,0.43,8.52,0.87,0.93,0.1,0.02,0.02,0.0,0.05,0.01,3.13,0.9,1.08,0.26,0.84,0.15,0.04,0.19,0.02,0.14,0.02,0.07,0.01,0.06,0.0,0.02,0.01,0.46,0.05,0.05,FH
3,13_FH1_1_2,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_2,11.16,0.73,47.06,162.42,1143.19,465099.89,56367.93,1075.89,547.55,2174.3,0.43,42.3,0.67,152.42,4.84,145.34,0.3,2.45,5.02,17.15,0.35,2.13,0.84,0.76,13.16,0.97,2.0,0.1,0.29,0.18,0.01,0.78,0.04,8.74,0.93,0.95,0.21,0.75,0.13,0.04,0.25,0.02,0.09,0.03,0.05,0.0,0.03,0.0,0.08,0.0,0.64,0.05,0.03,FH
4,14_FH1_1_2,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_2,17.71,0.32,48.26,33.52,547.22,465027.11,44.44,464.78,278.25,1551.63,0.71,11.18,0.27,2.56,1.73,25.38,0.05,0.8,0.55,9.8,0.41,1.41,0.12,0.28,9.9,0.9,0.9,0.08,0.04,0.1,0.0,0.09,0.01,2.74,0.97,1.09,0.27,1.0,0.17,0.04,0.19,0.02,0.15,0.03,0.05,0.01,0.05,0.01,0.02,0.01,0.59,0.06,0.09,FH


In [57]:
train_data_formodel.columns.values[9:-1]

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232', 'U238'], dtype=object)

In [90]:
uniques

Index(['FH', 'ER', 'WW', 'TC', 'BC_CS', 'KQ', 'AR', 'SL', 'FG', 'WB_BX', 'PF',
       'BM', 'WH', 'SQ_BP', 'WN', 'BH', 'PH', 'LB'],
      dtype='object')

In [59]:
y = np.array(train_data_formodel['class'])

In [60]:
train_data_feats = train_data_formodel[train_data_formodel.columns.values[9:-1]]

In [61]:
train_data_feats.shape

(808, 53)

### Turn feature data and class to be predicited into numpy arrays

In [62]:
X = np.array(train_data_feats)

In [63]:
print(X.shape)
print(y.shape)

(808, 53)
(808,)


### Carry out 10-f0ld stratified cross validation, class f1 scores and macro f1 scores with weighted averages are calculated

In [64]:
RFC = RandomForestClassifier(n_estimators=2000, random_state=random_seed_state)
skf = StratifiedKFold(n_splits=10, random_state=random_seed_state)
skf.get_n_splits(X, y)
class_f1_scores = []
macro_f1_scores = []
accuracy_scores = []
feat_imp =[]
f1_dict = {}
feat_imp_dict = {}
count = 0
for train_index, test_index in skf.split(X, y):
    count = count + 1
    print('making model:')
    key = 'round' + str(count)
    print(count)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    RFC.fit(X_train, y_train)
    y_pred = RFC.predict(X_test)
    class_f1_scores = f1_score(y_test, y_pred, average = None)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    macro_f1_scores.append(f1_score(y_test, y_pred, average = 'weighted'))
    f1_dict[key] = class_f1_scores 
    feat_imp_dict[key] = RFC.feature_importances_

making model:
1


KeyboardInterrupt: 

In [None]:
f1_df = pd.DataFrame(data = f1_dict)


In [None]:
for key in f1_dict:
    print(len(f1_dict[key]))

### Below are the encodings for the class variable

In [None]:
print(train_data_formodel['class'].unique())
print(list(uniques))

In [None]:
f1_df_final = pd.concat([f1_df, pd.Series(uniques)], axis = 1)

In [None]:
f1_df_final.rename(columns={0:'class'}, inplace=True)
f1_df_final.set_index('class', drop = True, inplace = True)

### Boxplot showing the distribution of class f1 scores from 10 models

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
plot = sns.boxplot(data = f1_df_final.T)
plot.set_title('F1 scores for each site', fontdict={'fontsize': 14})
plot.set_ylabel('F1 score', fontdict={'fontsize': 11})
plot.set_xlabel("Bedrock site or superficial site", fontdict={'fontsize': 11})

if save_plots == True:
    fig = plot.get_figure()
    fig.savefig('output/site_specific_f1_scores.png')

In [None]:
pd.DataFrame(data = f1_df_final.T.median()).to_csv('output/median_class_f1_scores.csv')

### Boxplot showing the macro F1 score with weighted averages

In [None]:
plot = sns.boxplot(macro_f1_scores)
plot.set_title('Average-weighted macro-f1 score', fontdict={'fontsize': 14})
plot.set_xlabel("F1-score", fontdict={'fontsize': 11})

if save_plots == True:
    fig = plot.get_figure()
    fig.savefig('output/macro_f1_scores.png')

In [None]:
pd.Series(pd.Series(macro_f1_scores).median()).to_csv('output/median_macro_f1.csv')

### Boxplot showing accuracy scores

In [None]:
sns.boxplot(accuracy_scores)

### Get feature importances

In [None]:
feat_imp_df = pd.DataFrame(data = feat_imp_dict)
feat_imp_df.head()

In [None]:
feat_imp_df_final = pd.concat([feat_imp_df, pd.Series(my_data[my_data.columns.values[9:-1]].columns.values)], axis = 1)
feat_imp_df_final.rename(columns = {0:'element'}, inplace = True )
feat_imp_df_final.head()

In [None]:
feat_imp_df_final.set_index('element', inplace=True)


In [None]:
feat_imp_df_final_plot = feat_imp_df_final.T

In [None]:
feat_imp_df_final_plot

elements = feat_imp_df_final_plot.columns.values 
mean_feature_importance = []
for col in list(feat_imp_df_final_plot.columns.values):
    mean_feature_importance.append(feat_imp_df_final_plot[col].mean())
    

In [None]:
mean_feature_importance_df = pd.concat([pd.Series(elements), pd.Series(mean_feature_importance)], axis = 1)

In [None]:
mean_feature_importance_df.rename(columns={0:'elements', 1:'mean_importance'}, inplace=True)

In [None]:
mean_feature_importance_df.sort_values(by='mean_importance', ascending=False, inplace=True)

In [None]:
ordered_col_names = list(mean_feature_importance_df['elements'])

In [None]:
sns.set_style("whitegrid")
sns.set_style()
sns.set(rc={'figure.figsize':(20,20)})
plot = sns.boxplot(data = feat_imp_df_final_plot[ordered_col_names])
plot.set_xticklabels(plot.get_xticklabels(),rotation=90, ha = 'left')
plot.set_title('Feature (element) importance', fontdict={'fontsize': 20})
plot.set_ylabel('Feature importance', fontdict={'fontsize': 15})
plot.set_xlabel("Element", fontdict={'fontsize': 15})

if save_plots == True:
    fig = plot.get_figure()
    fig.savefig('output/feature_importances.png')

### Model is built for predicting source of artefacts 

In [65]:
RFC_final = RandomForestClassifier(n_estimators=2000, random_state = random_seed_state)

In [66]:
RFC_final.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [67]:
if pickle_model == True:
    pickle.dump(RFC_final, open('models/' + pickle_model_name + '_' + 'rfc_model.sav', 'wb'))

In [68]:
print(train_data_formodel['class'].unique())
print(uniques)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], dtype='int64')


In [69]:
df_for_identifiers = test_data_superficial.copy(deep = True)
identifiers =  df_for_identifiers['Analysis']

### Predictions are made for the artefacts

In [70]:


test_data_superficial.columns.values[9:-1]

y_pred = RFC_final.predict(np.array(test_data_superficial[test_data.columns.values[9:-1]]))

y_pred_proba = RFC_final.predict_proba(np.array(test_data_superficial[test_data.columns.values[9:-1]]))


In [71]:
test_data_superficial.head()

Unnamed: 0,Analysis,Geology,Province,Region,Site,SubSite,Formation,Band,Nodule,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,Sc45,Ti47,V51,Cr52,Mn55,Fe56,Co59,Ni60,Cu63,Zn68,Ga69,Ge72,As75,Rb85,Sr88,Y89,Zr90,Nb93,Mo95,Cd111,In115,Sn118,Cs133,Ba137,La139,Ce140,Pr141,Nd146,Sm147,Eu153,Gd157,Tb159,Dy163,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238,class
808,005_AB_1,Superficial,,AB,AB,AB,,AB,AB_1,19.35,0.02,58.85,114.31,306.53,461376.55,258.5,487.95,217.05,6947.3,1.0,11.26,1.03,6.31,5.91,73.56,0.05,1.03,0.24,12.06,0.16,0.8,0.49,0.52,9.6,0.21,0.24,0.04,0.04,0.44,0.01,0.11,0.04,1.06,0.18,0.24,0.04,0.17,0.03,0.01,0.01,0.01,0.04,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.56,0.05,0.21,18
809,006_AB_1,Superficial,,AB,AB,AB,,AB,AB_1,2.61,0.05,85.03,41.1,413.97,465895.71,63.29,486.15,353.39,180.54,1.25,13.83,0.6,4.78,0.49,55.86,0.05,1.02,0.11,9.73,0.29,0.64,0.32,0.53,1.18,0.21,0.3,0.05,0.04,0.04,0.0,0.06,0.02,0.58,0.16,0.3,0.06,0.17,0.02,0.01,0.03,0.0,0.03,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.24,0.06,0.51,18
810,007_AB_1,Superficial,,AB,AB,AB,,AB,AB_1,2.29,0.05,80.04,44.56,364.5,465926.09,64.45,480.47,329.78,213.17,1.12,11.18,0.6,4.62,0.57,84.15,0.05,0.86,0.22,11.18,0.27,0.67,0.39,0.47,1.25,0.21,0.25,0.04,0.04,0.05,0.01,0.06,0.02,0.49,0.2,0.27,0.07,0.24,0.03,0.01,0.03,0.0,0.03,0.01,0.02,0.0,0.0,0.0,0.02,0.01,0.48,0.05,0.63,18
811,008_AB_2,Superficial,,AB,AB,AB,,AB,AB_2,6.86,0.29,18.37,10.69,139.26,466203.58,54.53,482.84,90.89,225.6,1.46,11.29,0.83,4.46,1.66,355.62,0.07,1.22,0.31,9.66,0.19,0.71,0.13,0.15,4.35,0.67,0.24,0.09,0.04,0.03,0.0,0.09,0.0,4.74,0.8,1.04,0.2,0.63,0.15,0.02,0.13,0.01,0.1,0.02,0.07,0.01,0.04,0.0,0.0,0.01,3.11,0.06,0.03,18
812,009_AB_2,Superficial,,AB,AB,AB,,AB,AB_2,8.45,0.19,17.89,18.67,180.19,466244.39,56.98,506.4,122.66,130.84,1.47,7.65,0.59,4.65,0.91,245.17,0.06,1.22,0.24,10.16,0.19,0.98,0.16,0.23,3.85,0.57,0.3,0.05,0.04,0.02,0.01,0.08,0.01,4.22,0.7,1.02,0.18,0.57,0.11,0.03,0.12,0.01,0.1,0.01,0.05,0.0,0.04,0.0,0.01,0.0,3.04,0.06,0.06,18


In [72]:
uniques.shape

(18,)

In [93]:
probabilities_df = pd.DataFrame(data = y_pred_proba, columns = uniques)
probabilities_df_final = pd.concat([probabilities_df, pd.Series(list(identifiers))], axis = 1)

In [94]:
probabilities_df_final.rename(columns = {0:'identifier'}, inplace=True)

In [95]:
final_pred_df = pd.concat([pd.Series(y_pred), probabilities_df_final], axis = 1)

In [96]:
final_pred_df.rename(columns={0:'class_number'}, inplace = True)

In [102]:
final_pred_df.to_csv('supertobed_predictions.csv')


In [98]:
uniques_list = list(uniques)
def get_pred_names(row):
    return(uniques_list[row['class_number']])
final_pred_df['class_predictions'] = final_pred_df.apply(get_pred_names, axis = 1)

In [99]:
#final_pred_df_modal = final_pred_df.groupby(by = 'class_number')

In [100]:
final_pred_df.head()

Unnamed: 0,class_number,FH,ER,WW,TC,BC_CS,KQ,AR,SL,FG,WB_BX,PF,BM,WH,SQ_BP,WN,BH,PH,LB,identifier,class_predictions
0,16,0.0575,0.0085,0.024,0.002,0.063,0.034,0.009,0.0485,0.065,0.1275,0.0025,0.033,0.072,0.05,0.086,0.055,0.25,0.0125,005_AB_1,PH
1,9,0.069,0.006,0.0315,0.0425,0.044,0.038,0.001,0.058,0.0155,0.333,0.0,0.044,0.0145,0.015,0.0105,0.079,0.182,0.0165,006_AB_1,WB_BX
2,9,0.037,0.007714,0.031,0.0435,0.0495,0.0385,0.001,0.0725,0.0115,0.373571,0.0,0.0375,0.0125,0.013,0.0115,0.0865,0.158214,0.015,007_AB_1,WB_BX
3,2,0.2255,0.028286,0.2285,0.0275,0.1275,0.014,0.002,0.032,0.032,0.1075,0.0005,0.0375,0.009,0.0145,0.037,0.003,0.065214,0.0085,008_AB_2,WW
4,2,0.201375,0.028813,0.2515,0.019,0.099,0.024,0.002,0.028,0.0505,0.107625,0.001,0.0495,0.0095,0.013,0.0445,0.0035,0.057188,0.01,009_AB_2,WW
