### RFC

### import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

# Show every row/column when displaying DataFrames.
# The fully-qualified option names are used on purpose: pandas treats the key
# as a regex, and the abbreviated 'max.rows' / 'max.columns' patterns become
# ambiguous (OptionError) on pandas versions that also define
# 'styler.render.max_rows' / 'styler.render.max_columns'.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [5]:
# Reload variables that were previously persisted with IPython's %store magic.
# NOTE: keep comments on their own lines; a line magic consumes the rest of its line.
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats

### configurations
* save_plots -> boolean
* random_seed_state -> number, sets random state for model and for stratified splits 
* pickle_model -> True|False, whether the model should be serialised and saved
* pickle_model_name -> string, name of serialised model
* scale -> True|False, if set to True the features are scaled so that each has mean 0 and standard deviation 1
* pickle_file_path -> string,  filepath for serialised model to be saved to

In [2]:
# --- output / persistence switches ---
save_plots = False                      # whether plots are saved
pickle_model = False                    # whether the model is serialised and saved
pickle_model_name = 'grouped'           # name of the serialised model
pickle_file_path = '../../../model'     # where the serialised model is saved

# --- modelling switches ---
random_seed_state = 42                  # random state for model and stratified splits
scale = False                           # standardise features to mean 0 / std 1 when True

### counts of instances in all classes before oversampling

In [6]:
# Number of training instances per class label (most frequent first).
train_data_formodel['class'].value_counts()

22    120
4     105
23    105
16    100
21     74
17     61
24     60
0      53
12     45
14     36
2      36
15     36
6      30
11     30
10     30
7      30
20     28
5      27
8      27
19     27
1      24
13     21
3      18
18     18
9      17
Name: class, dtype: int64

### The class column is stored as the variable y 

In [7]:
# Class labels of the training set as a NumPy array.
y_train = np.array(train_data_formodel['class'])

### The variables identified as best by feature selection are used as features

In [8]:
# Restrict both datasets to the columns listed in best_feats.
train_data_feats, test_data_feats = (
    frame[best_feats] for frame in (train_data_formodel, test_data)
)
# Keep the 'Analysis' column of the test data so rows can be identified later.
test_data_identifiers = test_data['Analysis']

### features are standardised (only when `scale` is True) and converted to arrays

In [11]:
# Build the train/test feature matrices, optionally standardised.
if scale:
    my_scaler = StandardScaler()
    # Fit the scaler on the training data only, then apply the SAME
    # transformation to the test data. Re-fitting on the test set would
    # standardise the two sets with different means/variances and use
    # test-set statistics, making them incomparable for the model.
    X_train = np.array(my_scaler.fit_transform(train_data_feats))
    X_test = np.array(my_scaler.transform(test_data_feats))
else:
    # No scaling requested: use the raw feature values.
    X_train = np.array(train_data_feats)
    X_test = np.array(test_data_feats)

### the dimensions of the class and features are checked

In [12]:
# One shape per line: training features, test features, training labels.
print(X_train.shape, X_test.shape, y_train.shape, sep='\n')

(1158, 16)
(363, 16)
(1158,)


### Local Outlier Factor model is instantiated; it is fitted below on the data that will be used for training the final classifier model

In [13]:
# novelty=True so the fitted model can later call predict() on data it was not fitted on.
lof = LocalOutlierFactor(novelty=True)

### print details of model

In [None]:
# Show the estimator and its parameters.
print(lof)

In [14]:
# Fit the outlier model on the training features only (labels are not used).
lof.fit(X_train)



LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=20, novelty=True, p=2)

### Local Outlier Factor model predicts whether the artefact samples are inliers or outliers encoded with 1 and -1 respectively

In [15]:
# One label per test row: 1 for inliers, -1 for outliers.
predictions = lof.predict(X_test)

In [16]:
# Count by label value (1 = inlier, -1 = outlier) instead of by position in
# value_counts(): that result is ordered by frequency, so position 0 is only
# the inlier count when inliers happen to be the majority, and position 1
# does not exist at all when every sample receives the same label.
nInliers = int(np.count_nonzero(predictions == 1))
nOutliers = int(np.count_nonzero(predictions == -1))

In [19]:
# Report the counts and the fraction of test samples flagged as inliers.
print(
    'there are {0} inliers and {1} outliers, '
    'so the proportion of inliers is {2}'.format(
        nInliers,
        nOutliers,
        nInliers / X_test.shape[0],
    )
)

there are 328 inliers and 35 outliers, so the proportion of inliers is 0.9035812672176309


### column encoding inlier status is added to artefact-sample dataset 

In [20]:
# Wrap the test feature matrix in a DataFrame, reusing the feature column names.
X_test_df = pd.DataFrame(
    X_test,
    columns=test_data_feats.columns.values,
)

In [24]:
# Side-by-side join of: sample identifiers, feature values, LOF labels.
# The identifier index is reset so all three pieces align on a 0..n-1 index;
# the unnamed prediction Series arrives as column 0 and is renamed.
X_test_labeled_df = pd.concat(
    [
        test_data_identifiers.reset_index(drop=True),
        X_test_df,
        pd.Series(predictions),
    ],
    axis=1,
).rename(columns={0: 'inlierLabel'})

In [25]:
# Preview the first rows of the labelled test data.
X_test_labeled_df.head()

Unnamed: 0,Analysis,Zr90,Nd146,Ba137,Sr88,Rb85,Ge72,Fe56,Cr52,Sc45,U238,Ca42,B11,S33,P31,Mg24,Al27,inlierLabel
0,06_DH1_1,1.43,0.34,5.95,2.0,0.72,1.37,35.38,5.18,1.08,0.305667,119.05,93.21,806.55,35.07,27.83,635.24,1
1,07_DH1_2,1.47,0.24,7.51,1.84,0.77,1.93,49.23,5.43,1.09,0.305667,119.74,95.19,807.55,31.65,27.99,639.09,1
2,08_DH1_3,1.16,0.08,4.4,1.93,0.71,2.14,16.46,2.61,1.17,0.305667,140.99,93.63,837.73,25.43,26.43,670.48,1
3,09_DH2_1,1.45,0.25,12.48,4.29,1.62,1.59,16.76,2.87,1.19,0.08,141.7,63.8,750.62,24.63,31.98,628.024063,1
4,10_DH2_2,1.99,0.51,14.4,4.44,0.739,0.81,62.16,5.92,1.22,0.09,130.52,60.96,796.21,22.59,44.95,628.024063,1


### dataset is stored

In [26]:
# Persist the labelled test data with IPython's %store so other notebooks can reload it via `%store -r`.
%store X_test_labeled_df

Stored 'X_test_labeled_df' (DataFrame)
