### RFC

### import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
import matplotlib.pyplot

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [5]:
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats

### configurations
* save_plots -> True|False
* random_seed_state -> number, sets random state for model and for stratified splits 
* classify_bedrock_only -> True|False
* pickle_model -> True|False, wether model should be serialised and saved
* pickle_model_name -> string, name of serialised model
* grid_search -> True|False, if set to true then grid search is performed to identify optimum hyperparamaters for model 
* scale -> True|False if set to True then features scaled to all have mean value 0 and standard deviation 1
* pickle_file_path -> string,  filepath for serialised model to be saved to

In [2]:
save_plots = False
random_seed_state = 42
classify_bedrock_only = False
pickle_model = False
pickle_model_name = 'grouped'
pickle_file_path = '../../../model'
grid_search = False
scale = False

### if only bedrock sites are classified then classes are label encoded, if bedrock sites alone are not being classified then the class sites would have already been label encoded in the 1 data_preproccessing notebook 

In [3]:
if classify_bedrock_only:
    train_data_formodel['class'], uniques = pd.factorize(train_data_formodel['class'])
    train_data_formodel = train_data_formodel[train_data_formodel['Geology']=='Bedrock']

### counts of instances in all classes before oversampling

In [6]:
train_data_formodel['class'].value_counts()

22    120
4     105
23    105
16    100
21     74
17     61
24     60
0      53
12     45
14     36
2      36
15     36
6      30
11     30
10     30
7      30
20     28
5      27
8      27
19     27
1      24
13     21
3      18
18     18
9      17
Name: class, dtype: int64

### The class column is stored as the variable y 

In [7]:
y_train = np.array(train_data_formodel['class'])

### The variables identified as best by the 2 feature_selection notebook are used as features

In [8]:
train_data_feats = train_data_formodel[best_feats]

In [9]:
test_data_feats = test_data[best_feats]
test_data_identifiers = test_data['Analysis']

In [10]:
train_data_feats.head()

Unnamed: 0,Zr90,Nd146,Ba137,Sr88,Rb85,Ge72,Fe56,Cr52,Sc45,U238,Ca42,B11,S33,P31,Mg24,Al27
0,1.51,0.87,6.54,12.94,0.43,1.22,8.46,3.3,0.42,0.05,712.39,48.36,538.57,50.28,33.658126,943.71
1,1.74,0.98,8.04,13.22,0.45,0.85,11.59,3.45,0.44,0.04,515.24,44.77,438.2,70.91,33.658126,1077.11
2,0.93,0.84,3.13,8.52,0.43,1.71,87.99,3.25,0.76,0.05,957.89,44.88,372.66,104.47,42.7,620.21
3,2.0,0.75,8.74,13.16,0.76,2.13,145.34,152.42,0.43,0.03,2174.3,47.06,1075.89,2386.415832,33.658126,1143.19
4,0.9,1.0,2.74,9.9,0.28,1.41,25.38,2.56,0.71,0.09,1551.63,48.26,464.78,44.44,33.52,547.22


In [11]:
if scale:
    my_scaler = StandardScaler()
    X_test = np.array(my_scaler.fit_transform(test_data_feats))
    X_train = np.array(my_scaler.fit_transform(train_data_feats))
else:
    X_test = np.array(test_data_feats)
    X_train = np.array(train_data_feats)

### the dimensions of the class and features are checked

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(1158, 16)
(363, 16)
(1158,)


In [13]:
lof = LocalOutlierFactor(novelty=True)

In [14]:
lof.fit(X_train)



LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=20, novelty=True, p=2)

In [15]:
predictions = lof.predict(X_test)

In [16]:
nInliers = list(pd.Series(predictions).value_counts())[0]
nOutliers = list(pd.Series(predictions).value_counts())[1]


In [18]:
pd.Series(predictions).value_counts()

 1    328
-1     35
dtype: int64

In [19]:
print('there are {0} inliers and {1} outliers, so the proportion of inliers is {2}'.format(nInliers, nOutliers, (nInliers)/(X_test.shape[0])))

there are 328 inliers and 35 outliers, so the proportion of inliers is 0.9035812672176309


In [20]:
X_test_df = pd.DataFrame(data = X_test, columns = test_data_feats.columns.values)

In [23]:
test_data_identifiers

1243                06_DH1_1
1244                07_DH1_2
1245                08_DH1_3
1246                09_DH2_1
1247                10_DH2_2
1248                11_DH2_3
1249                12_DH3_1
1250                13_DH3_2
1251                14_DH3_3
1252                15_DH4_1
1253                16_DH4_2
1254                17_DH4_3
1255                18_DH5_1
1256                19_DH5_2
1257                20_DH5_3
1258      06_NMAGGold_8260_1
1259      07_NMAGGold_8260_2
1260      08_NMAGGold_8260_3
1261      09_NMAGGold_4848_1
1262      10_NMAGGold_4848_2
1263      11_NMAGGold_4848_3
1264      12_NMAGGold_4816_1
1265      13_NMAGGold_4816_2
1266      14_NMAGGold_4816_3
1267      15_NMAGGold_4869_1
1268      16_NMAGGold_4869_2
1269      17_NMAGGold_4869_3
1270      18_NMAGGold_4852_1
1271      19_NMAGGold_4852_2
1272      20_NMAGGold_4852_3
1273      26_NMAGGold_6591_1
1274      27_NMAGGold_6591_2
1275      28_NMAGGold_6591_3
1276      29_NMAGGold_6525_1
1277      30_N

In [24]:
X_test_labeled_df = pd.concat([test_data_identifiers.reset_index(drop = True), X_test_df, pd.Series(predictions)], axis =1).rename(columns = {0:'inlierLabel'})

In [25]:
X_test_labeled_df.head()

Unnamed: 0,Analysis,Zr90,Nd146,Ba137,Sr88,Rb85,Ge72,Fe56,Cr52,Sc45,U238,Ca42,B11,S33,P31,Mg24,Al27,inlierLabel
0,06_DH1_1,1.43,0.34,5.95,2.0,0.72,1.37,35.38,5.18,1.08,0.305667,119.05,93.21,806.55,35.07,27.83,635.24,1
1,07_DH1_2,1.47,0.24,7.51,1.84,0.77,1.93,49.23,5.43,1.09,0.305667,119.74,95.19,807.55,31.65,27.99,639.09,1
2,08_DH1_3,1.16,0.08,4.4,1.93,0.71,2.14,16.46,2.61,1.17,0.305667,140.99,93.63,837.73,25.43,26.43,670.48,1
3,09_DH2_1,1.45,0.25,12.48,4.29,1.62,1.59,16.76,2.87,1.19,0.08,141.7,63.8,750.62,24.63,31.98,628.024063,1
4,10_DH2_2,1.99,0.51,14.4,4.44,0.739,0.81,62.16,5.92,1.22,0.09,130.52,60.96,796.21,22.59,44.95,628.024063,1


In [26]:
%store X_test_labeled_df

Stored 'X_test_labeled_df' (DataFrame)
