# data preproccessing


## import modules and configure notebook

In [8]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
import matplotlib.pyplot


pd.set_option('max.rows', None)
pd.set_option('max.columns', None)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


%matplotlib inline

### Configurations

In [9]:
data_input_path = '../../data/AllData_2_All_Details_3_OutliersRem_1_SupDep_Regions.csv'
classify_bedrock_only = False
group_superficial = True

### Import data

In [10]:
my_data = pd.read_csv(data_input_path)

In [11]:
my_data['Site'].unique()

array(['FH', 'ER', 'WW', 'TC', 'CS', 'BC', 'KQ', 'AR', 'SL', 'FG', 'WB',
       'BX', 'PF', 'BM', 'WH', 'SQ', 'BP', 'WN', 'BH', 'PH', 'LB', 'AB',
       'LV', 'BR', 'KY', 'BF', 'ST', 'SH', 'CF', 'BG', 'AC', 'CR', 'GH',
       'PX', 'WF', 'DH', 'NMAG_Gold', 'NMW_Gold', 'NMWGwern', 'UBSS',
       'Cefn', 'Stockley', 'Pucha', 'Woodbury', 'Pimple', 'Wellington',
       'Lyonshall', 'SymondsYatE', 'Madawg'], dtype=object)

### make labels for classification

In [12]:

def make_classes_grouped(row):
    if row['Geology'] == 'Bedrock':
        if row['Site'] == 'WB' or row['Site'] == 'BX':
            return('WB_BX')
        elif row['Site'] == 'BC' or row['Site'] == 'CS':
            return('BC_CS')
        elif row['Site'] == 'SQ' or row['Site'] == 'BP':
            return('SQ_BP')
        else:
            return(row['Site'])
    elif row['Geology'] == 'Superficial':
        if row['Region'] == 'SV' or row['Region'] == 'SE':
            return('SV_SE')
        else:
            return(row['Region'])

def make_classes_raw(row):
    if row['Geology'] == 'Bedrock':
        return(row['Site'])
    elif row['Geology'] == 'Superficial':
        return(row['Region'])



In [13]:
my_data['class'] = 'init'   

if group_superficial:
    my_data['class'] = my_data.apply(make_classes_grouped, axis = 1)
else:
    my_data['class'] = my_data.apply(make_classes_raw, axis = 1)

### Remove '<' signs.

In [14]:
for column_name in my_data.columns.values[9:-1]:
    def fill_less_than(row):
        if '<' in str(row[column_name]):
            return(float(row[column_name].replace('<', '').replace(',','')))
        else:
            return(float(row[column_name]))
    my_data[column_name] = my_data.swifter.apply(fill_less_than, axis = 1)

Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 18142.44it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 20197.75it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 19631.25it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 19878.63it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 17916.37it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 19211.50it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 20841.49it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 13394.66it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 18184.90it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 17822.18it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 19430.56it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 15798.79it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 22006.27it/s]
Pandas Apply: 100%|██████████| 1606/1606 [00:00<00:00, 17401.82it/s]
Pandas Apply: 100%|██████████| 160

### Impute na values with variable median, this is more resistant to the effect of outliers

In [15]:
for column_name in my_data.columns.values[9:-1]:
    my_data[column_name] = my_data[column_name].fillna(my_data[column_name].median()) 

In [16]:
my_data.head()

Unnamed: 0,Analysis,Geology,Province,Region,Site,SubSite,Formation,Band,Nodule,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,Sc45,Ti47,V51,Cr52,Mn55,Fe56,Co59,Ni60,Cu63,Zn68,Ga69,Ge72,As75,Rb85,Sr88,Y89,Zr90,Nb93,Mo95,Cd111,In115,Sn118,Cs133,Ba137,La139,Ce140,Pr141,Nd146,Sm147,Eu153,Gd157,Tb159,Dy163,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238,class
0,10_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,15.63,0.12,48.36,154.63,943.71,464944.18,50.28,538.57,455.94,712.39,0.42,15.58,0.27,3.3,0.69,8.46,0.05,0.8,1.62,10.82,0.25,1.22,0.16,0.43,12.94,0.88,1.51,0.09,0.05,0.02,0.0,0.05,0.01,6.54,0.84,0.95,0.23,0.87,0.16,0.04,0.16,0.02,0.11,0.03,0.06,0.01,0.02,0.0,0.04,0.01,0.24,0.07,0.05,FH
1,11_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,11.5,0.09,44.77,22.42,1077.11,465010.94,70.91,438.2,387.82,515.24,0.44,18.47,0.29,3.45,1.01,11.59,0.11,0.36,0.53,8.93,0.34,0.85,0.1,0.45,13.22,0.95,1.74,0.07,0.01,0.02,0.0,0.04,0.02,8.04,0.92,1.01,0.23,0.98,0.18,0.04,0.18,0.02,0.13,0.03,0.06,0.01,0.04,0.01,0.05,0.0,0.07,0.08,0.04,FH
2,12_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,20.05,0.06,44.88,42.7,620.21,465295.41,104.47,372.66,363.71,957.89,0.76,19.89,0.55,3.25,1.21,87.99,0.21,1.68,1.53,11.98,0.25,1.71,0.13,0.43,8.52,0.87,0.93,0.1,0.02,0.02,0.0,0.05,0.01,3.13,0.9,1.08,0.26,0.84,0.15,0.04,0.19,0.02,0.14,0.02,0.07,0.01,0.06,0.0,0.02,0.01,0.46,0.05,0.05,FH
3,13_FH1_1_2,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_2,11.16,0.73,47.06,162.42,1143.19,465099.89,56367.93,1075.89,547.55,2174.3,0.43,42.3,0.67,152.42,4.84,145.34,0.3,2.45,5.02,17.15,0.35,2.13,0.84,0.76,13.16,0.97,2.0,0.1,0.29,0.18,0.01,0.78,0.04,8.74,0.93,0.95,0.21,0.75,0.13,0.04,0.25,0.02,0.09,0.03,0.05,0.0,0.03,0.0,0.08,0.0,0.64,0.05,0.03,FH
4,14_FH1_1_2,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_2,17.71,0.32,48.26,33.52,547.22,465027.11,44.44,464.78,278.25,1551.63,0.71,11.18,0.27,2.56,1.73,25.38,0.05,0.8,0.55,9.8,0.41,1.41,0.12,0.28,9.9,0.9,0.9,0.08,0.04,0.1,0.0,0.09,0.01,2.74,0.97,1.09,0.27,1.0,0.17,0.04,0.19,0.02,0.15,0.03,0.05,0.01,0.05,0.01,0.02,0.01,0.59,0.06,0.09,FH


### Filter for known data bedrock data for training the model

In [17]:
train_data = my_data[(my_data['Geology']== 'Bedrock') | (my_data['Geology'] == 'Superficial')]
test_data = my_data[my_data['Geology']=='Artefacts']


### label encode the class to be predicted

In [18]:
train_data_formodel = train_data.copy(deep = True)
if classify_bedrock_only == False: 
    train_data_formodel['class'], uniques = pd.factorize(train_data_formodel['class'])


### order of class labels as numbers 0 through 20

In [19]:
print(uniques)

Index(['FH', 'ER', 'WW', 'TC', 'BC_CS', 'KQ', 'AR', 'SL', 'FG', 'WB_BX', 'PF',
       'BM', 'WH', 'SQ_BP', 'WN', 'BH', 'PH', 'LB', 'AB', 'LV', 'SV_SE', 'BA',
       'WA', 'MM'],
      dtype='object')


In [20]:
%store train_data_formodel
%store train_data
%store test_data
%store my_data
%store uniques

Stored 'train_data_formodel' (DataFrame)
Stored 'train_data' (DataFrame)
Stored 'test_data' (DataFrame)
Stored 'my_data' (DataFrame)
Stored 'uniques' (Index)


In [21]:
train_data_bedrock = train_data[train_data['Geology'] == 'Bedrock']
train_data_superficial = train_data[train_data['Geology'] == 'Superficial']

In [22]:
train_data_bedrock.columns.values[9:-1]

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232', 'U238'], dtype=object)

In [23]:
element_data_train = train_data[train_data.columns.values[9:-1]]
element_data_train_bedrock = train_data_bedrock[train_data.columns.values[9:-1]]
element_data_train_superficial = train_data_superficial[train_data.columns.values[9:-1]]
element_data_test = test_data[test_data.columns.values[9:-1]]

In [24]:
my_scaler_train = StandardScaler()
my_scaler_train_bedrock = StandardScaler()
my_scaler_train_superficial = StandardScaler()
my_scaler_test = StandardScaler()

In [25]:
element_data_train_scaled = my_scaler_train.fit_transform(element_data_train)
element_data_train_bedrock_scaled = my_scaler_train_bedrock.fit_transform(element_data_train_bedrock)
element_data_train_superficial_scaled = my_scaler_train_superficial.fit_transform(element_data_train_superficial)
element_data_test_scaled = my_scaler_test.fit_transform(element_data_test)

In [26]:
my_pca_train = PCA(n_components=element_data_train_scaled.shape[1])

my_pca_train_bedrock = PCA(n_components=element_data_train_bedrock_scaled.shape[1])
my_pca_train_superficial = PCA(n_components=element_data_train_superficial_scaled.shape[1])

my_pca_test = PCA(n_components=element_data_test_scaled.shape[1])

element_data_train_pca = my_pca_train.fit_transform(element_data_train_scaled)

element_data_train_bedrock_pca = my_pca_train_bedrock.fit_transform(element_data_train_bedrock_scaled)
element_data_train_superficial_pca = my_pca_train_superficial.fit_transform(element_data_train_superficial_scaled)

element_data_test_pca = my_pca_test.fit_transform(element_data_test_scaled)

In [27]:
no_PCs = element_data_train_scaled.shape[1]
PC_names = []
for i in range(0, no_PCs):
    number = i + 1
    column_name = 'PC' + str(number)
    PC_names.append(column_name)

In [28]:
PC_df_train = pd.DataFrame(data = element_data_train_pca, columns = PC_names)

PC_df_bedrock_train = pd.DataFrame(data = element_data_train_bedrock_pca, columns = PC_names)
PC_df_superficial_train = pd.DataFrame(data = element_data_train_superficial_pca, columns = PC_names)

PC_df_test = pd.DataFrame(data = element_data_test_pca, columns = PC_names)

In [29]:
my_tsne_train = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_train_scaled)

my_tsne_bedrock_train = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_train_bedrock_scaled)
my_tsne_superficial_train = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_train_superficial_scaled)

my_tsne_test = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_test_scaled)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1243 samples in 0.022s...
[t-SNE] Computed neighbors for 1243 samples in 0.382s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1243
[t-SNE] Computed conditional probabilities for sample 1243 / 1243
[t-SNE] Mean sigma: 1.771276
[t-SNE] Computed conditional probabilities in 0.134s
[t-SNE] Iteration 50: error = 75.3701096, gradient norm = 0.1138786 (50 iterations in 7.723s)
[t-SNE] Iteration 100: error = 77.6965408, gradient norm = 0.0726715 (50 iterations in 5.498s)
[t-SNE] Iteration 150: error = 77.8678818, gradient norm = 0.0701772 (50 iterations in 5.684s)
[t-SNE] Iteration 200: error = 79.7563553, gradient norm = 0.0623521 (50 iterations in 5.626s)
[t-SNE] Iteration 250: error = 80.4718094, gradient norm = 0.0539096 (50 iterations in 6.193s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 80.471809
[t-SNE] Iteration 300: error = 1.8182570, gradient norm = 0.0007026 (50 iterations in 7.888s)

In [30]:
tsne_df_train = pd.DataFrame(data = my_tsne_train, columns = ['tsne1', 'tsne2', 'tsne3'])

tsne_df_bedrock_train = pd.DataFrame(data = my_tsne_bedrock_train, columns = ['tsne1', 'tsne2', 'tsne3'])
tsne_df_superficial_train = pd.DataFrame(data = my_tsne_superficial_train, columns = ['tsne1', 'tsne2', 'tsne3'])

tsne_df_test = pd.DataFrame(data = my_tsne_test, columns = ['tsne1', 'tsne2', 'tsne3'])

In [31]:
%store PC_df_train
%store my_pca_train

%store PC_df_bedrock_train
%store my_pca_train_bedrock

%store PC_df_superficial_train
%store my_pca_train_superficial

%store PC_df_test
%store my_pca_test

%store tsne_df_train

%store tsne_df_bedrock_train
%store tsne_df_superficial_train


%store tsne_df_test


Stored 'PC_df_train' (DataFrame)
Stored 'my_pca_train' (PCA)
Stored 'PC_df_bedrock_train' (DataFrame)
Stored 'my_pca_train_bedrock' (PCA)
Stored 'PC_df_superficial_train' (DataFrame)
Stored 'my_pca_train_superficial' (PCA)
Stored 'PC_df_test' (DataFrame)
Stored 'my_pca_test' (PCA)
Stored 'tsne_df_train' (DataFrame)
Stored 'tsne_df_bedrock_train' (DataFrame)
Stored 'tsne_df_superficial_train' (DataFrame)
Stored 'tsne_df_test' (DataFrame)


import pandas as pd
%store -r tsne_df_train
tsne_df_train.to_csv('tsne_df_train.csv', index = False)
%store -r tsne_df_test
tsne_df_test.to_csv('tsne_df_test.csv', index=False)

%store -r PC_df_train
PC_df_train.to_csv(' PC_df_train.csv', index=False)
%store -r PC_df_test
PC_df_test.to_csv('PC_df_test.csv', index=False)