### RFC

### import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
import matplotlib.pyplot
import pickle

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.neighbors import LocalOutlierFactor

%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [2]:
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats



### configurations
* save_plots -> True|False
* random_seed_state -> number, sets random state for model and for stratified splits 
* classify_bedrock_only -> True|False
* pickle_model -> True|False, wether model should be serialised and saved
* pickle_model_name -> string, name of serialised model
* grid_search -> True|False, if set to true then grid search is performed to identify optimum hyperparamaters for model 
* scale -> True|False if set to True then features scaled to all have mean value 0 and standard deviation 1
* pickle_file_path -> string,  filepath for serialised model to be saved to

In [4]:
save_plots = False
random_seed_state = 42
classify_bedrock_only = False
pickle_model = False
pickle_model_name = 'grouped'
pickle_file_path = '../../../model'
grid_search = False
scale = True

### if only bedrock sites are classified then classes are label encoded, if bedrock sites alone are not being classified then the class sites would have already been label encoded in the 1 data_preproccessing notebook 

In [5]:
if classify_bedrock_only:
    train_data_formodel['class'], uniques = pd.factorize(train_data_formodel['class'])
    train_data_formodel = train_data_formodel[train_data_formodel['Geology']=='Bedrock']

### counts of instances in all classes before oversampling

In [6]:
train_data_formodel['class'].value_counts()

22    120
4     105
23    105
16    100
21     74
17     61
24     60
0      53
12     45
14     36
2      36
15     36
6      30
11     30
10     30
7      30
20     28
5      27
8      27
19     27
1      24
13     21
3      18
18     18
9      17
Name: class, dtype: int64

### The class column is stored as the variable y 

In [7]:
y_pre_smote = np.array(train_data_formodel['class'])

### The variables identified as best by the 2 feature_selection notebook are used as features

In [8]:
train_data_feats = train_data_formodel[best_feats]

In [9]:
test_data_feats = test_data[best_feats]

In [10]:
train_data_feats.head()

Unnamed: 0,Li7,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,Sc45,V51,Cr52,Mn55,Fe56,Ni60,Cu63,Zn68,Ga69,Ge72,As75,Rb85,Sr88,Y89,Zr90,Cd111,Ba137,La139,Ce140,Pr141,Nd146,U238
0,15.63,48.36,33.658126,943.71,464944.18,50.28,538.57,455.94,712.39,0.42,0.27,3.3,0.69,8.46,0.8,1.62,10.82,0.25,1.22,0.16,0.43,12.94,0.88,1.51,0.02,6.54,0.84,0.95,0.23,0.87,0.05
1,11.5,44.77,33.658126,1077.11,465010.94,70.91,438.2,387.82,515.24,0.44,0.29,3.45,1.01,11.59,0.36,0.53,8.93,0.34,0.85,0.1,0.45,13.22,0.95,1.74,0.02,8.04,0.92,1.01,0.23,0.98,0.04
2,20.05,44.88,42.7,620.21,465295.41,104.47,372.66,363.71,957.89,0.76,0.55,3.25,1.21,87.99,1.68,1.53,11.98,0.25,1.71,0.13,0.43,8.52,0.87,0.93,0.02,3.13,0.9,1.08,0.26,0.84,0.05
3,11.16,47.06,33.658126,1143.19,462214.304116,2386.415832,1075.89,547.55,2174.3,0.43,0.67,152.42,4.84,145.34,2.45,5.02,17.15,0.35,2.13,0.84,0.76,13.16,0.97,2.0,0.18,8.74,0.93,0.95,0.21,0.75,0.03
4,17.71,48.26,33.52,547.22,465027.11,44.44,464.78,278.25,1551.63,0.71,0.27,2.56,1.73,25.38,0.8,0.55,9.8,0.41,1.41,0.12,0.28,9.9,0.9,0.9,0.1,2.74,0.97,1.09,0.27,1.0,0.09


### address class imbalance using synthetic minority oversampling technique (SMOTE) algorithm

In [11]:

X_post_smote, y = SMOTE(random_state=42).fit_sample(np.array(train_data_feats), y_pre_smote)

In [12]:
if scale:
    my_scaler = StandardScaler()
    X_train = np.array(my_scaler.fit_transform(X_post_smote))
    X_test = np.array(my_scaler.fit_transform(test_data_feats))
else:
    X = np.array(X_post_smote)
    X_test = np.array(test_data_feats)

### the dimensions of the class and features are checked

In [13]:
print(X_train.shape)
print(X_test.shape)
print(y.shape)

(3000, 31)
(363, 31)
(3000,)


In [14]:
estimator = OneClassSVM(random_state = 42)
estimator.fit(X_train)



OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.5, random_state=42, shrinking=True,
      tol=0.001, verbose=False)

In [15]:
predictions = estimator.predict(X_test)

In [16]:
len(predictions)

363

In [17]:
pd.Series(predictions).value_counts()

-1    186
 1    177
dtype: int64

In [18]:
lof = LocalOutlierFactor(novelty=True)

In [20]:
lof.fit(X_train)



LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=20, novelty=True, p=2)

In [21]:
predictions = lof.predict(X_test)

In [24]:
pd.Series(predictions).value_counts()

 1    215
-1    148
dtype: int64

In [25]:
215/(215+148)

0.5922865013774105