# data preproccessing


## import modules and configure notebook

In [1]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

from general_ML_functions import replace_outliers, split_data

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)
%matplotlib inline



### import configurations

In [2]:
from config import *

### read data from csv file

In [3]:
my_data = pd.read_csv('../data/rosa_dummy_data.csv')

In [4]:
my_data.head()

Unnamed: 0,Analysis,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,Sc45,Ti47,V51,Cr52,Mn55,Fe56,Co59,Ni60,Cu63,Zn68,Ga69,Ge72,As75,Rb85,Sr88,Y89,Zr90,Nb93,Mo95,Cd111,In115,Sn118,Cs133,Ba137,La139,Ce140,Pr141,Nd146,Sm147,Eu153,Gd157,Tb159,Dy163,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238,class,Geology
0,10_FH1_1_1,15.63,0.12,48.36,154.63,943.71,464944.18,50.28,538.57,455.94,712.39,0.42,15.58,0.27,3.3,0.69,8.46,0.05,0.8,1.62,10.82,0.25,1.22,0.16,0.43,12.94,0.88,1.51,0.09,0.05,0.02,0.0,0.05,0.01,6.54,0.84,0.95,0.23,0.87,0.16,0.04,0.16,0.02,0.11,0.03,0.06,0.01,0.02,0.0,0.04,0.01,0.24,0.07,0.05,FH,samples
1,11_FH1_1_1,11.5,0.09,44.77,262.87,1077.11,465010.94,70.91,438.2,387.82,515.24,0.44,18.47,0.29,3.45,1.01,11.59,0.11,0.36,0.53,8.93,0.34,0.85,0.1,0.45,13.22,0.95,1.74,0.07,0.01,0.02,0.0,0.04,0.02,8.04,0.92,1.01,0.23,0.98,0.18,0.04,0.18,0.02,0.13,0.03,0.06,0.01,0.04,0.01,0.05,0.0,0.07,0.08,0.04,FH,samples
2,12_FH1_1_1,20.05,0.06,44.88,42.7,620.21,465295.41,104.47,372.66,363.71,957.89,0.76,19.89,0.55,3.25,1.21,87.99,0.21,1.68,1.53,11.98,0.25,1.71,0.13,0.43,8.52,0.87,0.93,0.1,0.02,0.02,0.0,0.05,0.01,3.13,0.9,1.08,0.26,0.84,0.15,0.04,0.19,0.02,0.14,0.02,0.07,0.01,0.06,0.0,0.02,0.01,0.46,0.05,0.05,FH,samples
3,13_FH1_1_2,11.16,0.73,47.06,162.42,1143.19,402596.61,56367.93,1075.89,547.55,2174.3,0.43,42.3,0.67,152.42,4.84,145.34,0.3,2.45,5.02,17.15,0.35,2.13,0.84,0.76,13.16,0.97,2.0,0.1,0.29,0.18,0.01,0.78,0.04,8.74,0.93,0.95,0.21,0.75,0.13,0.04,0.25,0.02,0.09,0.03,0.05,0.0,0.03,0.0,0.08,0.0,0.64,0.05,0.03,FH,samples
4,14_FH1_1_2,17.71,0.32,48.26,33.52,547.22,465027.11,44.44,464.78,278.25,1551.63,0.71,11.18,0.27,2.56,1.73,25.38,0.05,0.8,0.55,9.8,0.41,1.41,0.12,0.28,9.9,0.9,0.9,0.08,0.04,0.1,0.0,0.09,0.01,2.74,0.97,1.09,0.27,1.0,0.17,0.04,0.19,0.02,0.15,0.03,0.05,0.01,0.05,0.01,0.02,0.01,0.59,0.06,0.09,FH,samples


### visualise NA values

sns.set(rc={'figure.figsize':(20.7,50)})
sns.heatmap(my_data.isna(), cbar=False)

### remove rows where there are all element abundances are na values 

In [5]:
my_data = my_data.dropna(subset=my_data.columns.values[features_start:features_end], how = 'all' , axis = 0)

In [6]:
my_data[target].unique()

array(['FH', 'ER', 'WW', 'TC', 'CS', 'BC', 'KQ', 'AR', 'SL', 'FG',
       'WB_BX', 'PF', 'BM', 'WH', 'SQ', 'WN', 'BH', 'PH', 'LB', 'AB',
       'LV', 'SV_SE', 'BA', 'WA', 'MM', nan], dtype=object)

### split data into 'train_data' and 'test_data'

In [7]:
my_data_known, my_data_unknown = split_data(my_data, known_idententifier_col, known_identifier_value)

In [8]:
my_data_known.columns.values[features_start:features_end]

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232', 'U238'], dtype=object)

### Impute na values with feature mean

In [9]:
for column_name in my_data_known.columns.values[features_start:features_end]:
    my_data_known[column_name] = my_data_known[column_name].fillna(my_data_known[column_name].mean()) 

for column_name in my_data_unknown.columns.values[features_start:features_end]:
    my_data_unknown[column_name] = my_data_unknown[column_name].fillna(my_data_unknown[column_name].mean())     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Outliers defined as any values that exceed 2 standard deviations from the mean, such values are changed to the mean for that variable

In [10]:
my_data_known = replace_outliers(my_data_known, features_start = features_start, features_end = features_end, num_stds = 2)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data[col_name]= data.swifter.apply(impute_outliers_geo, axis = 1)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1218, style=ProgressStyle(description_widt…




In [11]:
my_data_unknown = replace_outliers(my_data_unknown, features_start = features_start, features_end = features_end, num_stds = 2)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=363, style=ProgressStyle(description_width…




### be aware code stops running after above cell runs

In [13]:
my_data_known[target], uniques = pd.factorize(my_data_known[target])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
%store my_data_known
%store my_data_unknown
%store uniques
%store my_data

Stored 'my_data_known' (DataFrame)
Stored 'my_data_unknown' (DataFrame)
Stored 'uniques' (Index)
Stored 'my_data' (DataFrame)
