### predictions

### import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
import matplotlib.pyplot
import pickle

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN

%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [2]:
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats
%store -r best_feats_ranks
%store -r X_test_labeled_df



### configurations
* save_plots -> True|False
* random_seed_state -> number, sets random state for model and for stratified splits 
* classify_bedrock_only -> True|False
* pickle_model -> True|False, wether model should be serialised and saved
* pickle_model_name -> string, name of serialised model
* grid_search -> True|False, if set to true then grid search is performed to identify optimum hyperparamaters for model 
* scale -> True|False if set to True then features scaled to all have mean value 0 and standard deviation 1
* pickle_file_path -> string,  filepath for serialised model to be saved to

In [3]:
save_plots = False
random_seed_state = 42
classify_bedrock_only = False
grid_search = False
scale = False
save_predictions = False
modelName = 'rfc'

### if only bedrock sites are classified then classes are label encoded, if bedrock sites alone are not being classified then the class sites would have already been label encoded in the 1 data_preproccessing notebook 

In [4]:
if classify_bedrock_only:
    train_data_formodel['class'], uniques = pd.factorize(train_data_formodel['class'])
    train_data_formodel = train_data_formodel[train_data_formodel['Geology']=='Bedrock']

### counts of instances in all classes before oversampling

In [5]:
train_data_formodel['class'].value_counts()

19    148
4     135
21    105
15    100
20     74
16     61
22     60
0      53
9      47
11     45
14     36
13     36
12     36
2      36
10     30
7      30
6      30
8      27
5      27
18     27
1      24
17     18
3      18
Name: class, dtype: int64

### The class column is stored as the variable y 

In [6]:
y = np.array(train_data_formodel['class'])

### The variables identified as best by the 2 feature_selection notebook are used as features

In [7]:
train_data_feats = train_data_formodel[best_feats]

### address class imbalance using synthetic minority oversampling technique (SMOTE) algorithm

In [8]:
if scale:
    my_scaler = StandardScaler()
    X = np.array(my_scaler.fit_transform(np.array(train_data_feats)))
else:
    X = np.array(np.array(train_data_feats))

### the dimensions of the class and features are checked

In [9]:
print(X.shape)
print(y.shape)

(1203, 30)
(1203,)


### grid search is utilised to identify optimum hyperparamaters

In [None]:
X_post_smote, y_post_smote = SMOTE(random_state=42).fit_sample(X, y)

esti = RandomForestClassifier(n_estimators=100, random_state = random_seed_state)


max_depth = [80, 100, 120]
min_samples_split = [2, 3, 4]
min_samples_leaf = [1, 2, 3]

param_grid = {
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
                }

clf = GridSearchCV(estimator = esti, param_grid= param_grid,
                                  n_jobs=-1, scoring='f1_macro', cv = 10, verbose=3)
clf.fit(X_post_smote, y_post_smote)
esti_final = clf.best_estimator_


### Model is built for predicting source of artefacts 

In [33]:
esti_final.fit(X_post_smote, y_post_smote)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [34]:
print(train_data_formodel['class'].unique())
print(uniques)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
Index(['FH', 'ER', 'WW', 'TC', 'BC_CS', 'KQ', 'AR', 'SL', 'FG', 'WB_BX', 'PF',
       'WH', 'SQ_BP', 'WN', 'BH', 'PH', 'LB', 'AB', 'LV', 'SV_SE', 'BA', 'WA',
       'MM'],
      dtype='object')


In [35]:
identifiers =  X_test_labeled_df['Analysis']

### Predictions are made for the artefacts

In [36]:
X_test_labeled_df.columns.values[1:-1]

array(['Zr90', 'Nd146', 'Ba137', 'Sr88', 'Rb85', 'Ge72', 'Fe56', 'Cr52',
       'Sc45', 'U238', 'Ca42', 'B11', 'S33', 'P31', 'Mg24', 'Al27'],
      dtype=object)

In [37]:
y_pred = esti_final.predict(np.array(X_test_labeled_df[X_test_labeled_df.columns.values[1:-1]]))

y_pred_proba = esti_final.predict_proba(np.array(X_test_labeled_df[X_test_labeled_df.columns.values[1:-1]]))


ValueError: Number of features of the model must match the input. Model n_features is 30 and input n_features is 16 

In [None]:
probabilities_df = pd.DataFrame(data = y_pred_proba, columns = uniques)
probabilities_df_final = pd.concat([probabilities_df, identifiers], axis = 1)

In [None]:
probabilities_df_final.head()

In [None]:
final_pred_df = pd.concat([pd.Series(y_pred), probabilities_df_final], axis = 1).rename(columns={0:'class_number'})

In [None]:
final_pred_df.head()

### labels outputted by local outlier factor model are appended to predictions

In [None]:
final_predictions_df = pd.concat([final_pred_df, X_test_labeled_df['inlierLabel']], axis = 1)

In [None]:
final_predictions_df.head()

In [None]:
uniques_list = list(uniques)
def get_pred_names(row):
    return(uniques_list[row['class_number']])
final_predictions_df['class_predictions'] = final_predictions_df.apply(get_pred_names, axis = 1)

In [None]:
final_predictions_df.head()

### final predictions including assignments from local outlier factor model are added

In [None]:
def outlierAssigner(row):
    if row['inlierLabel'] == -1:
        return('other')
    else:
        return(row['class_predictions'])
    
final_predictions_df['class_predictions'] = final_predictions_df.swifter.apply(outlierAssigner, axis = 1)

In [None]:
final_predictions_df.head()

### predictions are stored as a variable into memory

In [None]:
%store final_predictions_df

### predictions are outputted as csv file

In [None]:
if save_predictions:
    final_predictions_df.to_csv('predictions.csv')
