### RFC

### import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)
%matplotlib inline

### Load variables stored by data_preproccessing notebook

In [2]:
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats

### configurations
* save_plots -> boolean
* scale -> True|False if set to True then features scaled to all have mean value 0 and standard deviation 1

In [3]:
save_plots = False

#the data should be scaled because LOF model uses KNN
scale = True

### counts of instances in all classes before oversampling

In [4]:
train_data_formodel['class'].value_counts()

4     105
15    100
16     61
0      53
11     45
13     36
14     36
2      36
10     30
7      30
6      30
5      27
8      27
1      24
12     21
3      18
9      17
Name: class, dtype: int64

### The class column is stored as the variable y 

In [5]:
y_train = np.array(train_data_formodel['class'])

### The variables identified as best by feature selection are used as features

In [6]:
train_data_feats = train_data_formodel[best_feats]
test_data_feats = test_data[best_feats]
test_data_identifiers = test_data['Analysis']

### features are standardised

In [7]:
if scale:
    my_scaler = StandardScaler()
    X_test = np.array(my_scaler.fit_transform(test_data_feats))
    X_train = np.array(my_scaler.fit_transform(train_data_feats))
else:
    X_test = np.array(test_data_feats)
    X_train = np.array(train_data_feats)

### the dimensions of the class and features are checked

In [8]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(696, 15)
(363, 15)
(696,)


### Local Outlier Factor model is fitted on data that will be used for training the final classifier model
* The model is used for the purpose of novelty detection

In [9]:
lof = LocalOutlierFactor(novelty=True,n_neighbors=25 )
#set novelty to true, this is suitable because we are trying to assess whether the artefcats come from the same distribution of the geological samples, refer to sklearn documentation for more details 

### print details of model

In [10]:
print(lof)

LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=25, novelty=True, p=2)


In [11]:
#train model on geological samples
lof.fit(X_train)



LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=25, novelty=True, p=2)

### Local Outlier Factor model predicts whether the artefact samples are inliers or outliers 

* inlier = 1
* outier = -1

In [12]:
predictions = lof.predict(X_test)

In [13]:
pd.Series(predictions).value_counts()

 1    212
-1    151
dtype: int64

In [14]:
nInliers = list(pd.Series(predictions).value_counts())[0]
nOutliers = list(pd.Series(predictions).value_counts())[1]

In [15]:
print('there are {0} inliers and {1} outliers, so the proportion of inliers is {2}'.format(nInliers, nOutliers, (nInliers)/(X_test.shape[0])))

there are 212 inliers and 151 outliers, so the proportion of inliers is 0.5840220385674931


### column encoding inlier status is added to artefact-sample dataset 

In [16]:
X_test_df = pd.DataFrame(data = X_test, columns = test_data_feats.columns.values)
X_test_labeled_df = pd.concat([test_data_identifiers.reset_index(drop = True), X_test_df, pd.Series(predictions)], axis =1).rename(columns = {0:'inlierLabel'})

In [17]:
X_test_labeled_df.head()

Unnamed: 0,Analysis,Zr90,Ba137,Sr88,Ge72,Cr52,S33,U238,Al27,B11,Mg24,Nd146,Sc45,K39,Pr141,Li7,inlierLabel
0,06_DH1_1,0.415835,0.368775,-0.637129,0.040125,-0.562542,-0.742558,0.219193,-0.023444,1.53357,0.438877,-0.364648,1.545144,0.828576,-0.680255,-0.575952,1
1,07_DH1_2,0.466569,0.72632,-0.776295,1.494307,-0.407728,-0.730983,0.219193,-0.011073,1.684246,0.456902,-0.704044,1.583953,0.82079,-0.805314,-0.675981,-1
2,08_DH1_3,0.073374,0.013522,-0.698014,2.039624,-2.154028,-0.381635,0.219193,0.089794,1.565532,0.281163,-1.247078,1.894423,0.837763,-1.180494,-0.654546,-1
3,09_DH2_1,0.441202,1.865423,1.354689,0.611411,-1.993021,-1.389974,-1.161815,2.804214,-0.704501,0.906387,-0.670104,1.972041,0.252682,-0.805314,3.158491,1
4,10_DH2_2,1.126124,2.305478,1.485157,-1.414056,-0.104293,-0.862249,-1.080618,0.219742,-0.920622,2.367498,0.212325,2.088467,0.252682,-0.054956,3.518122,1


### dataset is stored

In [18]:
%store X_test_labeled_df

Stored 'X_test_labeled_df' (DataFrame)
