# data preproccessing


## import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
pd.set_option('max.rows', None)
pd.set_option('max.columns', None)


from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
%matplotlib inline

### configurations
* data_input_path -> string, filepath to data to be read in

* grouping -> boolean, if set to True them many sites are grouped
* reduced_grouping -> boolean, if set to True then less sites grouped, only bedrock sites 'WB' and 'BX are grouped into one class and superficial sites 'SV' and 'SE' are grouped into one class.
* raw -> boolean, if set to True then no grouping done
* drop_semi_bedrock ->  True|False, if set to True then some bedrock sites deemed to be semi-bedrock sites are not used for classification



In [2]:
data_input_path = '../data/raw_data.csv'

drop_semi_bedrock = True

In [3]:
%store -r X_test_labeled_df

### Import data

In [4]:
my_data = pd.read_csv(data_input_path)

In [5]:
my_data.Geology.value_counts()

Bedrock        808
Superficial    435
Artefacts      363
Name: Geology, dtype: int64

### features

In [6]:
my_data.columns.values[9:-1]

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232'], dtype=object)

### raw sample names including sample sites and artefacts

In [7]:
my_data['Site'].unique()

array(['FH', 'ER', 'WW', 'TC', 'CS', 'BC', 'KQ', 'AR', 'SL', 'FG', 'WB',
       'BX', 'PF', 'BM', 'WH', 'SQ', 'BP', 'WN', 'BH', 'PH', 'LB', 'AB',
       'LV', 'BR', 'KY', 'BF', 'ST', 'SH', 'CF', 'BG', 'AC', 'CR', 'GH',
       'PX', 'WF', 'DH', 'NMAG_Gold', 'NMW_Gold', 'NMWGwern', 'UBSS',
       'Cefn', 'Stockley', 'Pucha', 'Woodbury', 'Pimple', 'Wellington',
       'Lyonshall', 'SymondsYatE', 'Madawg', nan], dtype=object)

### define functions for making target classes for classification

In [8]:
def make_classes_grouped(row):
    if row['Geology'] == 'Bedrock':
        return(row['Site'])
    elif row['Geology'] == 'Superficial':
        if row['Region'] == 'SV' or row['Region'] == 'SE':
            return('SV_SE')
        else:
            return(row['Region'])
       

### targets for classification are made

In [9]:
my_data['class'] = 'init'   

my_data['class'] = my_data.apply(make_classes_grouped, axis = 1)


In [10]:
if drop_semi_bedrock:
    my_data = my_data[(my_data['class'] != 'BM') & (my_data['class'] != 'BC') & (my_data['class'] != 'BP') ]

In [11]:
my_data = my_data[my_data['Site']!='BP']
my_data = my_data[my_data['Site']!='BX']

### remove '<' signs and commas from feature values

In [12]:
for column_name in my_data.columns.values[9:-1]:
    def fill_less_than(row):
        if 'DL' in  str(row[column_name]):
            return(np.nan)
        if '<' in str(row[column_name]):
            return(float(row[column_name].replace('<', '').replace(',', '')))
        else:
            return(float(row[column_name]))
    my_data[column_name] = my_data.swifter.apply(fill_less_than, axis = 1)

Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 22856.10it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 22353.17it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 21714.85it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 21508.74it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 20988.21it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 19877.54it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 21340.06it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 21904.93it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 20521.12it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 20884.46it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 20415.72it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 21661.43it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 21753.62it/s]
Pandas Apply: 100%|██████████| 1522/1522 [00:00<00:00, 21737.92it/s]
Pandas Apply: 100%|██████████| 152

### remove rows where there are all element abundances are na values 

In [13]:
my_data = my_data.dropna(subset=my_data.columns.values[9:-1], how = 'all' , axis = 0)

### Impute na values with feature mean

In [14]:
for column_name in my_data.columns.values[9:-1]:
    my_data[column_name] = my_data[column_name].fillna(my_data[column_name].mean()) 

### Outliers defined as any values that exceed 2 standard deviations from the mean, such values are changed to the mean for that variable

In [15]:
std_dict = {}
mean_dict = {}
median_dict = {}

for col in my_data.columns.values[9:-1]:
    std_dict[col] = my_data[col].std()
    
for col in my_data.columns.values[9:-1]:
    mean_dict[col] = my_data[col].mean()
    
for col in my_data.columns.values[9:-1]:
    median_dict[col] = my_data[col].median()
    

In [16]:
for col_name in my_data.columns.values[9:-1]:
    def impute_outliers(row):
        if np.abs(row[col_name] - mean_dict[col_name]) > 2*(std_dict[col_name]):
            return(mean_dict[col_name])
        else:
            return(row[col_name])
    my_data[col_name]= my_data.swifter.apply(impute_outliers, axis = 1)

Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 23960.75it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 23275.58it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 24399.76it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 24445.73it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 24034.88it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 23246.17it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 23382.54it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 23056.25it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 23524.06it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 23690.71it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 22985.49it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 20504.56it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 22223.87it/s]
Pandas Apply: 100%|██████████| 1491/1491 [00:00<00:00, 22096.97it/s]
Pandas Apply: 100%|██████████| 149

### split data into 'train_data' and 'test_data', the former consists of samples from known geological sites and the latter from flint artefacts fow which the original geological source site is unknown and to be predicted.

In [17]:
my_data.isna().any()

Analysis     False
Geology      False
Province      True
Region        True
Site         False
SubSite       True
Formation     True
Band          True
Nodule       False
Li7          False
Be9          False
B11          False
Mg24         False
Al27         False
Si28         False
P31          False
S33          False
K39          False
Ca42         False
Sc45         False
Ti47         False
V51          False
Cr52         False
Mn55         False
Fe56         False
Co59         False
Ni60         False
Cu63         False
Zn68         False
Ga69         False
Ge72         False
As75         False
Rb85         False
Sr88         False
Y89          False
Zr90         False
Nb93         False
Mo95         False
Cd111        False
In115        False
Sn118        False
Cs133        False
Ba137        False
La139        False
Ce140        False
Pr141        False
Nd146        False
Sm147        False
Eu153        False
Gd157        False
Tb159        False
Dy163        False
Ho165       

In [18]:
train_data = my_data[(my_data['Geology']== 'Bedrock') | (my_data['Geology'] == 'Superficial')]
test_data = my_data[my_data['Geology']=='Artefacts']


### label encode the class to be predicted

### 'training_data' is split into two datasets, one consisting of samples from superficial sites and one containing samples from bedrock sites.

In [19]:
train_data_bedrock = train_data[train_data['Geology'] == 'Bedrock']
train_data_superficial = train_data[train_data['Geology'] == 'Superficial']

### four datasets are created, one containing all train data (bedrock and superficial types), one containing just bedrock types, one containing just superficial types and one containing the artefacts

In [20]:
element_data_train = train_data[train_data.columns.values[9:-1]]
element_data_train_bedrock = train_data_bedrock[train_data.columns.values[9:-1]]
element_data_train_superficial = train_data_superficial[train_data.columns.values[9:-1]]
element_data_test = test_data[test_data.columns.values[9:-1]]
element_data_everything = my_data[my_data.columns.values[9:-1]]

In [21]:
element_data_everything.shape

(1491, 53)

In [22]:
element_data_train.shape

(1128, 53)

In [23]:
element_data_everything.head()

Unnamed: 0,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,Sc45,Ti47,V51,Cr52,Mn55,Fe56,Co59,Ni60,Cu63,Zn68,Ga69,Ge72,As75,Rb85,Sr88,Y89,Zr90,Nb93,Mo95,Cd111,In115,Sn118,Cs133,Ba137,La139,Ce140,Pr141,Nd146,Sm147,Eu153,Gd157,Tb159,Dy163,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238
0,15.63,0.12,48.36,154.63,943.71,464944.18,50.28,538.57,455.94,712.39,0.42,15.58,0.27,3.3,0.69,8.46,0.05,0.8,1.62,10.82,0.25,1.22,0.16,0.43,12.94,0.88,1.51,0.09,0.05,0.02,0.0,0.05,0.01,6.54,0.84,0.95,0.23,0.87,0.16,0.04,0.16,0.02,0.11,0.03,0.06,0.01,0.02,0.0,0.04,0.01,0.24,0.07,0.05
1,11.5,0.09,44.77,33.872347,1077.11,465010.94,70.91,438.2,387.82,515.24,0.44,18.47,0.29,3.45,1.01,11.59,0.11,0.36,0.53,8.93,0.34,0.85,0.1,0.45,13.22,0.95,1.74,0.07,0.01,0.02,0.0,0.04,0.02,8.04,0.92,1.01,0.23,0.98,0.18,0.04,0.18,0.02,0.13,0.03,0.06,0.01,0.04,0.01,0.05,0.0,0.07,0.08,0.04
2,20.05,0.06,44.88,42.7,620.21,465295.41,104.47,372.66,363.71,957.89,0.76,19.89,0.55,3.25,1.21,87.99,0.21,1.68,1.53,11.98,0.25,1.71,0.13,0.43,8.52,0.87,0.93,0.1,0.02,0.02,0.0,0.05,0.01,3.13,0.9,1.08,0.26,0.84,0.15,0.04,0.19,0.02,0.14,0.02,0.07,0.01,0.06,0.0,0.02,0.01,0.46,0.05,0.05
3,11.16,0.73,47.06,33.872347,1143.19,462172.810919,2420.945164,1075.89,547.55,2174.3,0.43,42.3,0.67,152.42,4.84,145.34,0.3,2.45,5.02,17.15,0.35,2.13,0.84,0.76,13.16,0.97,2.0,0.1,0.29,0.18,0.01,0.78,0.04,8.74,0.93,0.95,0.21,0.75,0.13,0.04,0.25,0.02,0.09,0.03,0.05,0.0,0.03,0.0,0.08,0.0,0.64,0.05,0.03
4,17.71,0.32,48.26,33.52,547.22,465027.11,44.44,464.78,278.25,1551.63,0.71,11.18,0.27,2.56,1.73,25.38,0.05,0.8,0.55,9.8,0.41,1.41,0.12,0.28,9.9,0.9,0.9,0.08,0.04,0.1,0.0,0.09,0.01,2.74,0.97,1.09,0.27,1.0,0.17,0.04,0.19,0.02,0.15,0.03,0.05,0.01,0.05,0.01,0.02,0.01,0.59,0.06,0.09


### features are standardised

In [24]:
my_scaler_train = StandardScaler()
my_scaler_train_bedrock = StandardScaler()
my_scaler_train_superficial = StandardScaler()
my_scaler_test = StandardScaler()
my_scaler_everything = StandardScaler()

In [25]:
element_data_train_scaled = my_scaler_train.fit_transform(element_data_train)
element_data_train_bedrock_scaled = my_scaler_train_bedrock.fit_transform(element_data_train_bedrock)
element_data_train_superficial_scaled = my_scaler_train_superficial.fit_transform(element_data_train_superficial)
element_data_test_scaled = my_scaler_test.fit_transform(element_data_test)
element_data_everything_scaled = my_scaler_everything.fit_transform(element_data_everything)

### The four datasets are transformed using Principal component analysis 

In [26]:
my_pca_train = PCA(n_components=element_data_train_scaled.shape[1])
my_pca_train_bedrock = PCA(n_components=element_data_train_bedrock_scaled.shape[1])
my_pca_train_superficial = PCA(n_components=element_data_train_superficial_scaled.shape[1])
my_pca_test = PCA(n_components=element_data_test_scaled.shape[1])
my_pca_everything = PCA(n_components=element_data_everything_scaled.shape[1])

element_data_train_pca = my_pca_train.fit_transform(element_data_train_scaled)
element_data_train_bedrock_pca = my_pca_train_bedrock.fit_transform(element_data_train_bedrock_scaled)
element_data_train_superficial_pca = my_pca_train_superficial.fit_transform(element_data_train_superficial_scaled)
element_data_test_pca = my_pca_test.fit_transform(element_data_test_scaled)
element_data_everything_pca = my_pca_everything.fit_transform(element_data_everything)

### print details of PCA

In [27]:
print(my_pca_train)

PCA(copy=True, iterated_power='auto', n_components=53, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)


### the principal components for the four datasets are put into dataframes

In [28]:
no_PCs = element_data_train_scaled.shape[1]
PC_names = []
for i in range(0, no_PCs):
    number = i + 1
    column_name = 'PC' + str(number)
    PC_names.append(column_name)

In [29]:
PC_df_train = pd.DataFrame(data = element_data_train_pca, columns = PC_names)
PC_df_bedrock_train = pd.DataFrame(data = element_data_train_bedrock_pca, columns = PC_names)
PC_df_superficial_train = pd.DataFrame(data = element_data_train_superficial_pca, columns = PC_names)
PC_df_test = pd.DataFrame(data = element_data_test_pca, columns = PC_names)
PC_df_everything = pd.DataFrame(data = element_data_everything_pca, columns = PC_names)

### T-Distributed Stochastic Neighbour Embedding is done on the four datasets

In [None]:
my_tsne_train = TSNE(n_components=3, n_iter=10000, verbose=3).fit_transform(element_data_train_scaled)
my_tsne_bedrock_train = TSNE(n_components=3, n_iter=10000, verbose=3).fit_transform(element_data_train_bedrock_scaled)
my_tsne_superficial_train = TSNE(n_components=3, n_iter=10000, verbose=3).fit_transform(element_data_train_superficial_scaled)
my_tsne_test = TSNE(n_components=3, n_iter=10000, verbose=3).fit_transform(element_data_test_scaled)
my_tsne_everything = TSNE(n_components=3, n_iter=10000, verbose=3).fit_transform(element_data_everything_scaled)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1128 samples in 0.002s...
[t-SNE] Computed neighbors for 1128 samples in 0.183s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1128
[t-SNE] Computed conditional probabilities for sample 1128 / 1128
[t-SNE] Mean sigma: 2.025902
[t-SNE] Computed conditional probabilities in 0.064s
[t-SNE] Iteration 50: error = 71.7872467, gradient norm = 0.1449712 (50 iterations in 4.767s)
[t-SNE] Iteration 100: error = 73.1344299, gradient norm = 0.1326146 (50 iterations in 4.345s)
[t-SNE] Iteration 150: error = 73.2917786, gradient norm = 0.1184866 (50 iterations in 3.046s)
[t-SNE] Iteration 200: error = 74.2052460, gradient norm = 0.0961905 (50 iterations in 2.948s)
[t-SNE] Iteration 250: error = 74.1814117, gradient norm = 0.1043922 (50 iterations in 2.781s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 74.181412
[t-SNE] Iteration 300: error = 1.5108082, gradient norm = 0.0005867 (50 iterations in 3.001s)

### the t-SNE dimensions for the four datasets are put into dataframes

In [None]:
tsne_df_train = pd.DataFrame(data = my_tsne_train, columns = ['tsne1', 'tsne2', 'tsne3'])
tsne_df_bedrock_train = pd.DataFrame(data = my_tsne_bedrock_train, columns = ['tsne1', 'tsne2', 'tsne3'])
tsne_df_superficial_train = pd.DataFrame(data = my_tsne_superficial_train, columns = ['tsne1', 'tsne2', 'tsne3'])
tsne_df_test = pd.DataFrame(data = my_tsne_test, columns = ['tsne1', 'tsne2', 'tsne3'])
tsne_df_everything = pd.DataFrame(data = my_tsne_everything, columns = ['tsne1', 'tsne2', 'tsne3'])

In [None]:
bedrock_data = train_data[train_data['Geology'] == 'Bedrock']
superficial_data = train_data[train_data['Geology'] == 'Superficial'].reset_index(drop = True)
artefact_data = pd.concat([test_data.reset_index(drop = True), X_test_labeled_df['inlierLabel'].reset_index(drop = True)], axis = 1)

In [None]:
def addartefactclass(row):
    if row['Geology'] == 'Bedrock':
        return(row['class'])
    elif row['Geology'] == 'Superficial':
        return(row['class'])
    else:
        return('Artefact')
my_data['class'] = my_data.swifter.apply(addartefactclass, axis = 1)

In [None]:
artefact_data.head()

In [None]:
my_data_mod = my_data[my_data['Geology'] != 'Artefacts']

In [None]:
my_data_mod_final = pd.concat([my_data_mod, artefact_data], axis = 0).reset_index(drop = True)

In [None]:
def inlier_label(row):
    if row['Geology'] == 'Bedrock':
        return(row['class'])
    elif row['Geology'] == 'Superficial':
        return(row['class'])
    else:
        return(row['inlierLabel'])
my_data_mod_final['class'] = my_data_mod_final.swifter.apply(inlier_label, axis = 1)

In [None]:
my_data_mod_final.tail()

In [None]:
tsne_df_train_labelled = pd.concat([tsne_df_train.reset_index(drop = True), train_data['class'].reset_index(drop = True)], axis = 1)
tsne_df_bedrock_train_labelled = pd.concat([tsne_df_bedrock_train.reset_index(drop = True), bedrock_data['class'].reset_index(drop = True)], axis = 1)
tsne_df_superficial_train_labelled = pd.concat([tsne_df_superficial_train, superficial_data['class']], axis = 1)
tsne_df_test['class'] = 1
tsne_df_everything_labelled = pd.concat([tsne_df_everything.reset_index(drop = True), my_data_mod_final['class'].reset_index(drop = True)], axis = 1)

In [None]:
PC_df_train_labelled = pd.concat([PC_df_train.reset_index(drop = True), train_data['class'].reset_index(drop = True)], axis = 1)
PC_df_superficial_train_labelled = pd.concat([PC_df_superficial_train.reset_index(drop = True), superficial_data['class']], axis = 1)
PC_df_bedrock_train_labelled = pd.concat([PC_df_bedrock_train.reset_index(drop = True), bedrock_data['class'].reset_index(drop = True)], axis = 1)
PC_df_everything_labelled = pd.concat([PC_df_everything.reset_index(drop = True), my_data_mod_final['class'].reset_index(drop = True)], axis = 1)

In [None]:
tsne_df_everything_labelled.tail()

In [None]:
my_data['class'].unique()

In [None]:
#tsne_df_train_labelled.to_csv('website/tsne_both.csv')
#tsne_df_bedrock_train_labelled.to_csv('website/tsne_bedrock.csv')
#tsne_df_superficial_train_labelled.to_csv('website/tsne_superficial.csv')
#tsne_df_test.to_csv('website/tsne_artefacts.csv')
#tsne_df_everything_labelled.to_csv('website/tsne_samples_and_artefacts.csv')

In [None]:
#PC_df_train_labelled.to_csv('website/pca_both.csv')
#PC_df_superficial_train_labelled.to_csv('website/pca_superficial.csv')
#PC_df_bedrock_train_labelled.to_csv('website/pca_bedrock.csv')
#PC_df_everything_labelled.to_csv('website/pca_samples_and_artefacts.csv')