### RFC

### import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)
%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [2]:
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats

### configurations
* save_plots -> boolean
* random_seed_state -> number, sets random state for model and for stratified splits 
* pickle_model -> True|False, wether model should be serialised and saved
* pickle_model_name -> string, name of serialised model
* scale -> True|False if set to True then features scaled to all have mean value 0 and standard deviation 1
* pickle_file_path -> string,  filepath for serialised model to be saved to

In [3]:
save_plots = False
random_seed_state = 42
pickle_model = False
pickle_model_name = 'grouped'
pickle_file_path = '../../../model'
scale = False

### counts of instances in all classes before oversampling

In [4]:
train_data_formodel['class'].value_counts()

4     105
17    100
18     61
0      53
10     47
13     45
15     36
16     36
2      36
12     30
11     30
8      30
7      30
5      30
6      27
9      27
1      24
14     21
3      18
Name: class, dtype: int64

### The class column is stored as the variable y 

In [5]:
y_train = np.array(train_data_formodel['class'])

### The variables identified as best by feature selection are used as features

In [6]:
train_data_feats = train_data_formodel[best_feats]
test_data_feats = test_data[best_feats]
test_data_identifiers = test_data['Analysis']

### features are standardised

In [7]:
if scale:
    my_scaler = StandardScaler()
    X_test = np.array(my_scaler.fit_transform(test_data_feats))
    X_train = np.array(my_scaler.fit_transform(train_data_feats))
else:
    X_test = np.array(test_data_feats)
    X_train = np.array(train_data_feats)

### the dimensions of the class and features are checked

In [8]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(786, 25)
(363, 25)
(786,)


### Local Outlier Factor model is fitted on data that will be used for training the final classifier model

In [9]:
lof = LocalOutlierFactor(novelty=True, )

### print details of model

In [10]:
print(lof)

LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=20, novelty=True, p=2)


In [11]:
lof.fit(X_train)



LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=20, novelty=True, p=2)

### Local Outlier Factor model predicts whether the artefact samples are inliers or outliers encoded with 1 and -1 respectively

In [12]:
predictions = lof.predict(X_test)

In [13]:
pd.Series(predictions).value_counts()

-1    182
 1    181
dtype: int64

In [14]:
nInliers = list(pd.Series(predictions).value_counts())[0]
nOutliers = list(pd.Series(predictions).value_counts())[1]

In [15]:
print('there are {0} inliers and {1} outliers, so the proportion of inliers is {2}'.format(nInliers, nOutliers, (nInliers)/(X_test.shape[0])))

there are 182 inliers and 181 outliers, so the proportion of inliers is 0.5013774104683195


### column encoding inlier status is added to artefact-sample dataset 

In [16]:
X_test_df = pd.DataFrame(data = X_test, columns = test_data_feats.columns.values)

In [17]:
X_test_labeled_df = pd.concat([test_data_identifiers.reset_index(drop = True), X_test_df, pd.Series(predictions)], axis =1).rename(columns = {0:'inlierLabel'})

In [18]:
X_test_labeled_df.head()

Unnamed: 0,Analysis,Li7,Nd146,Pr141,La139,Ba137,Y89,Sr88,Rb85,As75,Ge72,Ga69,Zn68,Cu63,Fe56,Zr90,Cr52,B11,Mg24,Mn55,P31,S33,K39,Al27,Sc45,V51,inlierLabel
0,06_DH1_1,2.79,0.34,0.07,0.18,5.95,0.21,2.0,0.72,0.44,1.37,0.69,20.75,11.14,35.38,1.43,5.18,93.21,27.83,3.27,35.07,806.55,443.1,635.24,1.08,0.51,1
1,07_DH1_2,2.37,0.24,0.06,0.19,7.51,0.21,1.84,0.77,0.44,1.93,0.86,18.15,8.64,49.23,1.47,5.43,95.19,27.99,2.3,31.65,807.55,442.1,639.09,1.09,0.6,1
2,08_DH1_3,2.46,0.08,0.03,0.11,4.4,0.13,1.93,0.71,0.42,2.14,0.79,17.54,3.44,16.46,1.16,2.61,93.63,26.43,1.4,25.43,837.73,444.28,670.48,1.17,0.62,1
3,09_DH2_1,18.47,0.25,0.06,0.23,12.48,0.2,4.29,1.62,0.46,1.59,0.75,14.33,1.1,16.76,1.45,2.87,63.8,31.98,1.09,24.63,750.62,383.353245,620.245528,1.19,1.52,1
4,10_DH2_2,19.98,0.51,0.12,0.48,14.4,0.27,4.44,0.73562,0.43,0.81,0.76,14.59,1.23,62.16,1.99,5.92,60.96,44.95,1.26,22.59,796.21,383.353245,620.245528,1.22,2.44,1


### dataset is stored

In [19]:
%store X_test_labeled_df

Stored 'X_test_labeled_df' (DataFrame)
