In [2]:
import psycopg2
import sqlalchemy as sq
import pandas as pd
import numpy as np
import mariadb
import json
import os
import shutil
import subprocess
from pathlib import Path
import pyodbc
from sklearn.decomposition import PCA
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler


In [3]:
# Load the cleaned soil-analysis results. The pivot cell below relies on the
# sample_code, chemical_name and result columns of this file.
raw_csv = "input/soil_analysis_cleaned.csv"
soil_df = pd.read_csv(raw_csv)

In [4]:
# Reshape long -> wide: one row per sample_code, one column per chemical_name,
# cells holding `result`. pivot_table aggregates with the mean by default, so
# repeated (sample_code, chemical_name) measurements are averaged.
soil_df = soil_df.pivot_table(
    index="sample_code",
    columns="chemical_name",
    values="result",
)

In [5]:
# Count missing values per chemical and keep only chemicals with fewer than
# 100000 missing results. `chemicals` holds the surviving column labels,
# ordered from fewest to most missing.
null_counts = soil_df.isna().sum(axis=0).sort_values()
nulls = null_counts.to_frame()
# Masking turns the rejected counts into NaN (which also makes the column
# float) so that dropna() can discard those rows.
nulls = nulls.where(nulls < 100000).dropna()
chemicals = nulls.index

In [6]:
# Display the retained chemicals with their missing-value counts.
nulls

Unnamed: 0_level_0,0
chemical_name,Unnamed: 1_level_1
calcium,26767.0
magnesium,26768.0
potassium,26859.0
sodium,27846.0
cec,32649.0
phosphorus,49589.0
sulphur,68512.0
boron,71034.0
copper,72620.0
zinc,72624.0


In [7]:
# Restrict the wide table to the chemicals that passed the missing-value filter.
soil_df = soil_df.loc[:, chemicals]

In [8]:
# Display the wide table after column selection (NaN = chemical not measured).
soil_df

chemical_name,calcium,magnesium,potassium,sodium,cec,phosphorus,sulphur,boron,copper,zinc,manganese,iron,ec_salts,organic_matter
sample_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AGD102-100SA0001,2530.0,465.0,851.0,32.7,23.20,40.20,,,,,,,,3.60
AGD102-101SA0001,607.0,102.0,211.0,32.5,6.87,5.62,,,,,,,,3.81
AGD102-101SA0002,844.0,145.0,241.0,110.0,7.47,26.60,,,,,,,,2.85
AGD102-101SA0003,477.0,89.7,128.0,121.0,6.46,6.04,,,,,,,,2.33
AGD102-101SA0004,677.0,136.0,266.0,124.0,8.93,4.05,,,,,,,,3.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Test01-1SA0001,1430.0,341.0,731.0,34.6,19.10,61.40,,,,,,,229.0,5.13
Test01-1SA0002,2320.0,488.0,702.0,73.0,21.60,155.00,,,,,,,445.0,6.50
Test01-3SA0001,2110.0,267.0,682.0,39.3,20.40,8.42,,,,,,,92.0,
Test01-4SA0001,2110.0,267.0,682.0,39.3,20.40,8.42,,,,,,,92.0,


In [9]:
# Keep only samples that have a value for every retained chemical.
soil_df = soil_df.dropna(axis=0, how="any")

In [10]:
# Scaler that centers each column on its mean and divides by its standard
# deviation (the StandardScaler defaults, spelled out).
sc = StandardScaler(with_mean=True, with_std=True)

In [11]:
# Standardize every chemical column. The result is a plain NumPy array, so the
# sample_code index and the chemical column labels are gone from here on.
# NOTE(review): nothing later restores sample_code, so the exported rows can
# only be matched back to samples by position.
soil_df = sc.fit(soil_df).transform(soil_df)

In [12]:
# Display the standardized values (a NumPy array at this point).
soil_df

array([[ 3.16075532e+00,  2.21488375e+00,  3.89610248e+00, ...,
        -8.16537590e-02,  2.41724392e+00, -1.25345382e-03],
       [-2.34259396e-01, -1.46071590e-01, -4.19709951e-01, ...,
        -4.38872753e-01, -2.33667560e-01,  1.29811490e-01],
       [-3.45283973e-01, -4.13290237e-01, -6.71465677e-01, ...,
        -3.53820611e-01, -3.10021446e-01, -1.20182755e-01],
       ...,
       [-6.70761288e-01, -4.44727725e-01, -8.90852809e-01, ...,
         2.16028736e-01, -3.14793564e-01, -3.94448286e-01],
       [-5.89574314e-02,  4.38665684e-01, -7.07430780e-01, ...,
        -4.13357110e-01, -2.59914208e-01, -4.91533430e-01],
       [-5.03055741e-01, -5.10746449e-01, -8.77223927e-01, ...,
         1.76397771e+00, -2.45597854e-01, -4.59980759e-01]])

In [13]:
# PCA keeping the first 10 principal components.
# random_state pins the result: with the default svd_solver='auto', scikit-learn
# may choose the randomized SVD solver depending on its version and the data
# shape, in which case unseeded runs would not reproduce the same scores.
# The seed has no effect when an exact solver is selected.
pca = PCA(n_components=10, random_state=0)

In [14]:
# Fit the PCA on the standardized matrix and replace soil_df with the
# component scores (one column per principal component).
soil_df = pca.fit_transform(soil_df)

In [15]:
# Wrap the score array in a DataFrame. Rows and columns get default integer
# labels; the original sample_code labels are not available here.
soil_df = pd.DataFrame(data=soil_df)

In [16]:
# Display the principal-component scores.
soil_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9.666593,4.050115,-1.808987,-0.229919,-4.574414,-1.271518,1.343595,-0.833879,-5.106942,2.485444
1,-0.573127,-0.510525,-0.173562,0.647647,0.561327,0.043760,0.302313,0.154533,-0.178708,-0.208248
2,-1.022743,-0.390607,0.036656,0.355385,0.577510,-0.117096,0.252146,0.070002,-0.142226,-0.183525
3,-1.579523,0.054465,0.522349,0.043831,0.398465,0.041689,-0.022065,0.078982,-0.093071,0.094307
4,1.024292,-0.755685,-0.234985,-0.199776,0.506214,-0.460110,0.404997,-0.000487,-0.068036,-0.441838
...,...,...,...,...,...,...,...,...,...,...
118341,-1.248794,0.571489,-0.082631,-1.422423,0.347753,0.085967,-0.819467,-0.752875,0.638201,-0.765287
118342,-1.324354,-0.264857,0.679136,-0.570375,-0.147696,-0.258603,-0.387859,0.110910,0.314588,0.511813
118343,-1.595311,0.037880,0.449511,-0.827979,0.071950,-0.165761,-0.625120,-0.136073,0.442470,0.136611
118344,-0.575778,-0.819147,0.590315,-0.798854,0.200901,-0.424535,-0.925912,0.216686,0.350396,0.424658


In [17]:
# Centroid and covariance of the PCA scores, the inputs to the Mahalanobis
# distance. Only the component columns are used: the next cell appends a
# 'mahalanobis_distance' column to soil_df, and without this guard re-running
# this cell would fold that column into mu and sigma.
pca_score_frame = soil_df.drop(columns='mahalanobis_distance', errors='ignore')
mu = np.mean(pca_score_frame, axis=0)
# rowvar=False: rows are observations, columns are variables.
sigma = np.cov(pca_score_frame.to_numpy(), rowvar=False)

In [18]:
# Mahalanobis distance of every row from the centroid `mu`, using covariance
# `sigma`: sqrt((x - mu) @ inv(sigma) @ (x - mu)).
# The inverse covariance does not depend on the row, so it is computed once
# rather than once per row, and the quadratic form is evaluated for all rows
# at once.
# The distance column itself is excluded from the inputs so this cell can be
# re-run after it has already added the column.
component_values = soil_df.drop(columns='mahalanobis_distance', errors='ignore').to_numpy()
inv_sigma = np.linalg.inv(sigma)
centered = component_values - np.asarray(mu)
soil_df['mahalanobis_distance'] = np.sqrt(np.sum((centered @ inv_sigma) * centered, axis=1))

In [19]:
# Display the scores together with the new mahalanobis_distance column.
soil_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mahalanobis_distance
0,9.666593,4.050115,-1.808987,-0.229919,-4.574414,-1.271518,1.343595,-0.833879,-5.106942,2.485444,10.461321
1,-0.573127,-0.510525,-0.173562,0.647647,0.561327,0.043760,0.302313,0.154533,-0.178708,-0.208248,1.094922
2,-1.022743,-0.390607,0.036656,0.355385,0.577510,-0.117096,0.252146,0.070002,-0.142226,-0.183525,0.992279
3,-1.579523,0.054465,0.522349,0.043831,0.398465,0.041689,-0.022065,0.078982,-0.093071,0.094307,1.018598
4,1.024292,-0.755685,-0.234985,-0.199776,0.506214,-0.460110,0.404997,-0.000487,-0.068036,-0.441838,1.306540
...,...,...,...,...,...,...,...,...,...,...,...
118341,-1.248794,0.571489,-0.082631,-1.422423,0.347753,0.085967,-0.819467,-0.752875,0.638201,-0.765287,2.384365
118342,-1.324354,-0.264857,0.679136,-0.570375,-0.147696,-0.258603,-0.387859,0.110910,0.314588,0.511813,1.410762
118343,-1.595311,0.037880,0.449511,-0.827979,0.071950,-0.165761,-0.625120,-0.136073,0.442470,0.136611,1.496536
118344,-0.575778,-0.819147,0.590315,-0.798854,0.200901,-0.424535,-0.925912,0.216686,0.350396,0.424658,1.756031


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of the distances,
# shown as a one-column table.
soil_df['mahalanobis_distance'].describe().to_frame()

Unnamed: 0,mahalanobis_distance
count,118346.0
mean,1.974059
std,2.470437
min,0.341224
25%,1.170129
50%,1.515778
75%,2.15869
max,320.799454


In [22]:
# 95th percentile of the Mahalanobis distances.
np.quantile(soil_df['mahalanobis_distance'].to_numpy(), q=0.95)

4.393726780733175

In [23]:
# Write every row (PCA scores + distance) to CSV. to_csv does not create
# missing parent directories, so make sure output/ exists first; otherwise a
# fresh checkout fails here.
# The index written is the positional row number, not sample_code.
Path("output").mkdir(parents=True, exist_ok=True)
soil_df.to_csv("output/mahalanobis_distance.csv")

In [28]:
# Export the rows whose distance is strictly above the 95th percentile of all
# distances, i.e. roughly the top 5%.
distances = soil_df['mahalanobis_distance']
cutoff = np.quantile(distances.values, 0.95)
is_outlier = distances > cutoff
soil_df.loc[is_outlier].to_csv('output/five_percent_outliers.csv')

In [27]:
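# Sanity check: 5918 / 118346.000000 ~= 0.05 -- presumably the number of exported
# outlier rows over the total row count (TODO confirm 5918 against the CSV).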