# data preproccessing


## import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import swifter

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)


from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


### configurations
* classify_bedrock_only -> True|False, if set to True then classes for classification are bedrock sites only
* group_superficial -> True|False, if set to True then some superficial sites are grouped together to raise class-specific F1 scores for the associated sites
* drop_fake_bedrock ->  True|False, if set to True then some bedrock sites deemed not to be true bedrock sites are not used for classification
* export_dimensions ->  True|False, if set to True then exports t-SNE and PCA dimensions as .csv files

In [2]:
data_input_path = '../data/raw_data.csv'

group_superficial = True
drop_fake_bedrock = True
classify_bedrock_only = False
reduced_sites = False

### Import data

In [3]:
my_data = pd.read_csv(data_input_path)

In [4]:
my_data.columns.values[9:-1]

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232'], dtype=object)

In [5]:
my_data['Site'].unique()

array(['FH', 'ER', 'WW', 'TC', 'CS', 'BC', 'KQ', 'AR', 'SL', 'FG', 'WB',
       'BX', 'PF', 'BM', 'WH', 'SQ', 'BP', 'WN', 'BH', 'PH', 'LB', 'AB',
       'LV', 'BR', 'KY', 'BF', 'ST', 'SH', 'CF', 'BG', 'AC', 'CR', 'GH',
       'PX', 'WF', 'DH', 'NMAG_Gold', 'NMW_Gold', 'NMWGwern', 'UBSS',
       'Cefn', 'Stockley', 'Pucha', 'Woodbury', 'Pimple', 'Wellington',
       'Lyonshall', 'SymondsYatE', 'Madawg', nan], dtype=object)

### make labels for classification

In [6]:

def make_classes_grouped(row):
    if row['Geology'] == 'Bedrock':
        if row['Site'] == 'WB' or row['Site'] == 'BX':
            return('WB_BX')
        elif row['Site'] == 'BC' or row['Site'] == 'CS':
            return('BC_CS')
        elif row['Site'] == 'SQ' or row['Site'] == 'BP':
            return('SQ_BP')
        else:
            return(row['Site'])
    elif row['Geology'] == 'Superficial':
        if row['Region'] == 'SV' or row['Region'] == 'SE':
            return('SV_SE')
        else:
            return(row['Region'])
        
def make_classes_grouped_reduced(row):
    if row['Geology'] == 'Bedrock':
        if row['Site'] == 'WB' or row['Site'] == 'BX':
            return('WB_BX')
        else:
            return(row['Site'])
    elif row['Geology'] == 'Superficial':
        if row['Region'] == 'SV' or row['Region'] == 'SE':
            return('SV_SE')
        else:
            return(row['Region'])

def make_classes_raw(row):
    if row['Geology'] == 'Bedrock':
        return(row['Site'])
    elif row['Geology'] == 'Superficial':
        return(row['Region'])



In [7]:
my_data['class'] = 'init'   


if drop_fake_bedrock and not group_superficial:
    my_data['class'] = my_data.apply(make_classes_grouped, axis = 1)
    
if group_superficial and not drop_fake_bedrock:
    my_data['class'] = my_data.apply(make_classes_grouped, axis = 1)
else:
    my_data['class'] = my_data.apply(make_classes_raw, axis = 1)
if reduced_sites == True:
    my_data['class'] = my_data.apply(make_classes_grouped_reduced, axis = 1)
    

In [8]:
if drop_fake_bedrock:
    my_data = my_data[(my_data['class'] != 'BM') & (my_data['class'] != 'BC') & (my_data['class'] != 'BP') ]

### Remove '<' signs.

In [9]:
for column_name in my_data.columns.values[9:-1]:
    def fill_less_than(row):
        if 'DL' in  str(row[column_name]):
            return(np.nan)
        if '<' in str(row[column_name]):
            return(float(row[column_name].replace('<', '').replace(',', '')))
        else:
            return(float(row[column_name]))
    my_data[column_name] = my_data.swifter.apply(fill_less_than, axis = 1)

Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 24103.77it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 21796.11it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 21340.30it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 21739.18it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 21773.14it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 24089.23it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 24433.82it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 24055.76it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 24417.14it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 23928.33it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 23360.73it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 22218.37it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 22606.64it/s]
Pandas Apply: 100%|██████████| 1552/1552 [00:00<00:00, 24487.86it/s]
Pandas Apply: 100%|██████████| 155

In [10]:

my_data = my_data.dropna(thresh = 15 , axis = 0)

### Impute na values with variable mean

In [11]:
for column_name in my_data.columns.values[9:-1]:
    my_data[column_name] = my_data[column_name].fillna(my_data[column_name].mean()) 

### Outliers defined as any values that exceed 2 standard deviations from the mean, such values are changed to the mean for that variable

In [12]:
std_dict = {}
mean_dict = {}
median_dict = {}
for col in my_data.columns.values[9:-1]:
    std_dict[col] = my_data[col].std()
    
for col in my_data.columns.values[9:-1]:
    mean_dict[col] = my_data[col].mean()
    
for col in my_data.columns.values[9:-1]:
    median_dict[col] = my_data[col].median()
    

In [13]:
for col_name in my_data.columns.values[9:-1]:
    def impute_outliers(row):
        if np.abs(row[col_name] - mean_dict[col_name]) > 2*(std_dict[col_name]):
            return(mean_dict[col_name])
        else:
            return(row[col_name])
    my_data[col_name]= my_data.swifter.apply(impute_outliers, axis = 1)

Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 25621.14it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 25410.30it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 26526.25it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 25790.18it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 21932.53it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 26525.15it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 27672.63it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 26936.51it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 28519.67it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 24693.58it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 26384.29it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 28119.29it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 27309.31it/s]
Pandas Apply: 100%|██████████| 1521/1521 [00:00<00:00, 27697.25it/s]
Pandas Apply: 100%|██████████| 152

### Filter for samples for which the source is known for training the model and those for which the source is not known (artefacts) for predictions

In [14]:
train_data = my_data[(my_data['Geology']== 'Bedrock') | (my_data['Geology'] == 'Superficial')]
test_data = my_data[my_data['Geology']=='Artefacts']


### label encode the class to be predicted

In [15]:
train_data_formodel = train_data.copy(deep = True)
if classify_bedrock_only == False: 
    train_data_formodel['class'], uniques = pd.factorize(train_data_formodel['class'])


### datasets are stored for modelling

In [16]:
%store train_data_formodel
%store train_data
%store test_data
%store my_data
%store uniques

Stored 'train_data_formodel' (DataFrame)
Stored 'train_data' (DataFrame)
Stored 'test_data' (DataFrame)
Stored 'my_data' (DataFrame)
Stored 'uniques' (Index)


### training data is split into two datasets, one superficial and one bedrock

In [17]:
train_data_bedrock = train_data[train_data['Geology'] == 'Bedrock']
train_data_superficial = train_data[train_data['Geology'] == 'Superficial']

### the columns containimg the mass spectrometry data are stored as variables, 4 datsets are stored, one containing all train data (bedrock and superficial), one containing just bedrock, one containing just superficial and one containing the artefacts

In [18]:
train_data_bedrock.columns.values[9:-1]

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232', 'U238'], dtype=object)

In [19]:
element_data_train = train_data[train_data.columns.values[9:-1]]
element_data_train_bedrock = train_data_bedrock[train_data.columns.values[9:-1]]
element_data_train_superficial = train_data_superficial[train_data.columns.values[9:-1]]
element_data_test = test_data[test_data.columns.values[9:-1]]

### I scale the mass spectrometry data to have a mean 0 and standard deviation of 1

In [20]:
my_scaler_train = StandardScaler()
my_scaler_train_bedrock = StandardScaler()
my_scaler_train_superficial = StandardScaler()
my_scaler_test = StandardScaler()

In [21]:
element_data_train_scaled = my_scaler_train.fit_transform(element_data_train)
element_data_train_bedrock_scaled = my_scaler_train_bedrock.fit_transform(element_data_train_bedrock)
element_data_train_superficial_scaled = my_scaler_train_superficial.fit_transform(element_data_train_superficial)
element_data_test_scaled = my_scaler_test.fit_transform(element_data_test)

### PCA is utilised on the four datasets

In [22]:
my_pca_train = PCA(n_components=element_data_train_scaled.shape[1])

my_pca_train_bedrock = PCA(n_components=element_data_train_bedrock_scaled.shape[1])
my_pca_train_superficial = PCA(n_components=element_data_train_superficial_scaled.shape[1])

my_pca_test = PCA(n_components=element_data_test_scaled.shape[1])

element_data_train_pca = my_pca_train.fit_transform(element_data_train_scaled)

element_data_train_bedrock_pca = my_pca_train_bedrock.fit_transform(element_data_train_bedrock_scaled)
element_data_train_superficial_pca = my_pca_train_superficial.fit_transform(element_data_train_superficial_scaled)

element_data_test_pca = my_pca_test.fit_transform(element_data_test_scaled)

### the principal components for the four datasets are put into dataframes

In [23]:
no_PCs = element_data_train_scaled.shape[1]
PC_names = []
for i in range(0, no_PCs):
    number = i + 1
    column_name = 'PC' + str(number)
    PC_names.append(column_name)

In [24]:
PC_df_train = pd.DataFrame(data = element_data_train_pca, columns = PC_names)

PC_df_bedrock_train = pd.DataFrame(data = element_data_train_bedrock_pca, columns = PC_names)
PC_df_superficial_train = pd.DataFrame(data = element_data_train_superficial_pca, columns = PC_names)

PC_df_test = pd.DataFrame(data = element_data_test_pca, columns = PC_names)

### t-SNE is utilised on the four datasets

In [25]:
my_tsne_train = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_train_scaled)

my_tsne_bedrock_train = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_train_bedrock_scaled)
my_tsne_superficial_train = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_train_superficial_scaled)

my_tsne_test = TSNE(n_components=3, n_iter=750, verbose=3).fit_transform(element_data_test_scaled)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1158 samples in 0.002s...
[t-SNE] Computed neighbors for 1158 samples in 0.187s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1158
[t-SNE] Computed conditional probabilities for sample 1158 / 1158
[t-SNE] Mean sigma: 2.027810
[t-SNE] Computed conditional probabilities in 0.071s
[t-SNE] Iteration 50: error = 72.9022751, gradient norm = 0.1194551 (50 iterations in 5.077s)
[t-SNE] Iteration 100: error = 73.8381500, gradient norm = 0.0987646 (50 iterations in 3.793s)
[t-SNE] Iteration 150: error = 74.3681564, gradient norm = 0.1034126 (50 iterations in 2.729s)
[t-SNE] Iteration 200: error = 74.3389893, gradient norm = 0.0824283 (50 iterations in 2.519s)
[t-SNE] Iteration 250: error = 74.7075882, gradient norm = 0.0842570 (50 iterations in 2.994s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 74.707588
[t-SNE] Iteration 300: error = 1.5202278, gradient norm = 0.0010219 (50 iterations in 2.766s)

### the t-SNE dimensions for the four datasets are put into dataframes

In [26]:
tsne_df_train = pd.DataFrame(data = my_tsne_train, columns = ['tsne1', 'tsne2', 'tsne3'])

tsne_df_bedrock_train = pd.DataFrame(data = my_tsne_bedrock_train, columns = ['tsne1', 'tsne2', 'tsne3'])
tsne_df_superficial_train = pd.DataFrame(data = my_tsne_superficial_train, columns = ['tsne1', 'tsne2', 'tsne3'])

tsne_df_test = pd.DataFrame(data = my_tsne_test, columns = ['tsne1', 'tsne2', 'tsne3'])

### datasets are stored for the purpose of two-dimensional and three-dimensional visualisations

In [27]:
%store PC_df_train
%store my_pca_train

%store PC_df_bedrock_train
%store my_pca_train_bedrock

%store PC_df_superficial_train
%store my_pca_train_superficial

%store PC_df_test
%store my_pca_test

%store tsne_df_train

%store tsne_df_bedrock_train
%store tsne_df_superficial_train


%store tsne_df_test


Stored 'PC_df_train' (DataFrame)
Stored 'my_pca_train' (PCA)
Stored 'PC_df_bedrock_train' (DataFrame)
Stored 'my_pca_train_bedrock' (PCA)
Stored 'PC_df_superficial_train' (DataFrame)
Stored 'my_pca_train_superficial' (PCA)
Stored 'PC_df_test' (DataFrame)
Stored 'my_pca_test' (PCA)
Stored 'tsne_df_train' (DataFrame)
Stored 'tsne_df_bedrock_train' (DataFrame)
Stored 'tsne_df_superficial_train' (DataFrame)
Stored 'tsne_df_test' (DataFrame)
