In [1]:
import psycopg2
import pandas as pd
import numpy as np
import mariadb
import json
import os
import shutil
import subprocess
from pathlib import Path
import pyodbc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
import pickle
import matplotlib.pyplot as plt

In [14]:
water_df = pd.read_csv("output/water_single_analysis.csv")

In [15]:
water_df['analysis_name'].nunique()

42

In [16]:
water_df['analysis_name'] = [i.replace(":","") for i in water_df['analysis_name']]

In [7]:
water_df['analysis_name'].value_counts()

Irrigation                                          762404
Standard Drinking Water Analysis (WHO)              181124
Standard Drinking Water Analysis                    121503
Feacal Coliforms (MPN) and E.Coli                   101342
Reverse Osmosis Water Use                            84045
Basic Drinking Water Analysis                        62633
Basic Drinking Water Analysis (WHO)                  47972
Base Titanium Water Analysis                         47653
Total Coliform and Faecal E.Coli                     31738
Standard Drinking Water                              24881
Total Coliforms and Faecal E.Coli                     9387
Taita Water Analysis (Uni of Helsinki)                8256
WRMA Water Analysis                                   5400
Heavy Metal Analysis (Water)                          4575
Feacal Coliform (MPN) and E.Coli                      3219
Heavy Metals in Water                                 2671
Water Borne Diseases                                  24

In [19]:
for analysis in np.unique(water_df['analysis_name']):
    print(analysis)
    df_ = water_df.loc[water_df['analysis_name']==analysis]
    df_ = pd.pivot_table(data=df_, values="result", index="sample_code", columns="chemical_name")
    print(len(df_))

    os.makedirs(f"output/boxplots/{analysis}",exist_ok=True)
    for column in df_.columns:
        plt.boxplot(df_[column])
        plt.savefig(f"output/boxplots/{analysis}/{column}.png")
        plt.clf()
        outlier_threshold = df_[column].quantile(0.99)
        df_ = df_.loc[df_[column] <= outlier_threshold]
    df_.describe().to_csv(f"output/chemical_null_count/{analysis}.csv")
    print(len(df_))
    if(len(df_) == 0):
        continue
    
    df_.to_csv(f"output/analysis/{analysis}.csv")
    break

Alkalinity in Water
120
116


<Figure size 640x480 with 0 Axes>

In [20]:
df_

chemical_name,calcium,magnesium
sample_code,Unnamed: 1_level_1,Unnamed: 2_level_1
CH152WA0001,1.37,0.40
CH152WA0002,2.37,0.46
CH152WA0003,7.98,0.61
CH152WA0004,2.63,0.30
CH152WA0005,7.85,2.25
...,...,...
CT248WA0024,39.30,9.18
CT248WA0026,4.93,2.22
CT248WA0028,17.40,4.90
CT248WA0029,14.60,5.98


In [22]:
for analysis_file in os.listdir("output/analysis"):
    print(analysis_file)
    analysis_df = pd.read_csv(f"output/analysis/{analysis_file}",index_col=0)
    analysis_df = analysis_df.dropna()
    if(len(analysis_df) == 0):
        continue
    sc = StandardScaler()
    pca = PCA()
    analysis_scaled = sc.fit_transform(analysis_df)
    pca.fit(analysis_scaled)
    pca_explained_variance_df = pd.DataFrame(zip(analysis_df.columns,pca.explained_variance_ratio_))
    pca_explained_variance_df.to_csv(f"output/pca_explained_variance_ratio/{analysis_file}")
    pca_explained_variance_df = pca_explained_variance_df.loc[pca_explained_variance_df['1']>0.1]
    print(pca_explained_variance_df)
    print([ i for i in pca.explained_variance_ratio_ if i > 0.1])
    n_components = len([ i for i in pca.explained_variance_ratio_ if i > 0.1])
    print(n_components)
    pca = PCA(n_components=n_components)
    pca_reduced_df = pca.fit_transform(analysis_scaled)
    analysis_scaled = pd.DataFrame(analysis_scaled,index=analysis_df.index)
    pca_reduced_df = pd.DataFrame(pca_reduced_df,index=analysis_df.index)
    analysis_scaled.to_csv(f"output/analysis_scaled/{analysis_file}")
    pca_reduced_df.to_csv(f"output/pca/{analysis_file}")
    pickle.dump(sc, open(f"models/scalers/{analysis_file.replace('csv','pkl')}","wb"))
    pickle.dump(pca, open(f"models/pca/{analysis_file.replace('csv','pkl')}","wb"))

Alkalinity in Water.csv
[0.9010519327172081]
1


In [40]:
for analysis_file in os.listdir("output/pca"):
    print(analysis_file)
    pca_reduced_df = pd.read_csv(f"output/pca/{analysis_file}",index_col=0)
    if len(pca_reduced_df.columns) < 2:
        continue
    mu = np.mean(pca_reduced_df, axis=0)
    sigma = np.cov(pca_reduced_df.T)

    try:
        pca_reduced_df['mahalanobis_distance'] = [distance.mahalanobis(pca_reduced_df.iloc[i], mu, np.linalg.inv(sigma)) for i in range(len(pca_reduced_df)) ]
    except:
        continue
    pca_reduced_df.to_csv(f"output/mahalanobis_distance/{analysis_file}")

Alkalinity in Water.csv
Aquasearch Full Chemical Analysis.csv
0   -3.700743e-17
1   -1.295260e-16
dtype: float64
[[ 3.16622165e+00 -6.53279231e-16]
 [-6.53279231e-16  1.46198355e+00]]
Base Titanium Water Analysis.csv
0    8.940217e-17
1   -1.168656e-16
2   -7.713128e-17
dtype: float64
[[ 6.97856154e+00 -1.70168910e-16 -5.12884788e-16]
 [-1.70168910e-16  3.55725671e+00 -1.39014149e-15]
 [-5.12884788e-16 -1.39014149e-15  3.00077546e+00]]
BT Extended Water Quality Analysis.csv
E coli & Total Coliforms.csv
0    1.930823e-17
1    5.309762e-17
dtype: float64
[[1.66570922e+00 9.92242921e-17]
 [9.92242921e-17 3.39740376e-01]]
EAS 122018 Drinking Water Standard Analysis(SG).csv
0   -1.009294e-17
1    6.308085e-17
dtype: float64
[[1.44333684e+00 1.17314864e-17]
 [1.17314864e-17 6.03174788e-01]]
Faecal Coliforms and Faecal E.Coli.csv
0    1.356939e-16
1    1.480297e-16
2   -1.480297e-16
dtype: float64
[[ 8.85989720e+00 -4.36441666e-16 -2.54988372e-16]
 [-4.36441666e-16  5.01627608e+00 -4.90095092

In [13]:
pca_reduced_df

Unnamed: 0_level_0,0
sample_code,Unnamed: 1_level_1
CH152WA0001,-0.953822
CH152WA0002,-0.934108
CH152WA0003,-0.829962
CH152WA0004,-0.935051
CH152WA0005,-0.775575
...,...
CT248WA0024,0.018716
CT248WA0026,-0.828122
CT248WA0028,-0.515525
CT248WA0029,-0.527592


In [44]:
mahalanobis_threshold_dict = {}
for analysis_file in os.listdir("output/mahalanobis_distance"):
    print(analysis_file)
    analysis = analysis_file.replace(".csv","")
    mahalanobis_df = pd.read_csv(f"output/mahalanobis_distance/{analysis_file}",index_col=0)
    upper_quantile = (mahalanobis_df['mahalanobis_distance'].quantile(0.95))
    mahalanobis_threshold_dict[analysis_file.replace(".csv","")] = upper_quantile
    mahalanobis_df.loc[mahalanobis_df['mahalanobis_distance'] >= upper_quantile].to_csv(f"output/mahanobis_distance_upper_quantile/{analysis_file}")
    plt.boxplot(mahalanobis_df['mahalanobis_distance'])
    plt.savefig(f"output/mahalanobis_boxplots/{analysis}.png")
    plt.clf()
pickle.dump(mahalanobis_threshold_dict, open("mahalanobis_thresholds.dict","wb"))

Aquasearch Full Chemical Analysis.csv
Base Titanium Water Analysis.csv
E coli & Total Coliforms.csv
EAS 122018 Drinking Water Standard Analysis(SG).csv
Faecal Coliforms and Faecal E.Coli.csv
Feacal Coliform (MPN) and E.Coli.csv
Standard Drinking Water + Free Chlorine Analysis.csv
Standard Drinking Water.csv
Taita Water Analysis (Uni of Helsinki).csv
Total Coliforms and Faecal E.Coli.csv
Total Suspended Solids (Calculated).csv
Turbidity.csv
Water Colour Analysis.csv
WRMA Water Analysis.csv


<Figure size 640x480 with 0 Axes>

In [42]:
mahalanobis_threshold_dict

{'Aquasearch Full Chemical Analysis': 1.8113237681715342,
 'Base Titanium Water Analysis': 2.9378947171123944,
 'E coli & Total Coliforms': 2.3326288306672254,
 'EAS 122018 Drinking Water Standard Analysis(SG)': 2.147653594609786,
 'Faecal Coliforms and Faecal E.Coli': 2.450020082834289,
 'Feacal Coliform (MPN) and E.Coli': 3.1801159251369375,
 'Standard Drinking Water + Free Chlorine Analysis': 2.5263932579595076,
 'Standard Drinking Water': 2.548448122482619,
 'Taita Water Analysis (Uni of Helsinki)': 2.3345469867079327,
 'Total Coliforms and Faecal E.Coli': 2.70237718592308,
 'Total Suspended Solids (Calculated)': 3.1438634766539,
 'Turbidity': 3.1438634766539,
 'Water Colour Analysis': 3.1438634766539,
 'WRMA Water Analysis': 3.631719882852156}

In [102]:
water_ = water_df.loc[water_df['analysis_name'] == 'Total Coliforms and Faecal E.Coli']

In [103]:
water_ = water_.sort_values(by="sample_code")

In [105]:
water_ = water_.loc[water_['sample_code'] == 'CA002WA0156']

In [119]:
water_pivot = pd.pivot_table(data = water_, columns = "chemical_name", values = "result", index="sample_code").reset_index()

In [125]:
water_pivot['analysis_id'] = 570

In [128]:
req = water_pivot.to_dict(orient="records")

In [129]:
req

[{'sample_code': 'CA002WA0156',
  'ammonium': 0.01,
  'bicarbonate': 130.0,
  'boron': 0.035,
  'calcium': 1.62,
  'chlorides': 9.12,
  'copper': 0.01,
  'electrical_conductivity': 0.28,
  'iron': 1.39,
  'magnesium': 0.32,
  'manganese': 0.21,
  'molybdenum': 0.025,
  'nitrate_n': 0.01,
  'nitrates': 0.01,
  'phosphorus': 0.028,
  'potassium': 7.19,
  'silica': 72.5,
  'silicon__': 33.9,
  'sodium': 62.7,
  'sulphate': 5.27,
  'sulphur': 1.76,
  'zinc': 1.83,
  'analysis_id': 570}]

In [132]:
_ = pd.DataFrame(req)

In [133]:
_

Unnamed: 0,sample_code,ammonium,bicarbonate,boron,calcium,chlorides,copper,electrical_conductivity,iron,magnesium,...,nitrates,phosphorus,potassium,silica,silicon__,sodium,sulphate,sulphur,zinc,analysis_id
0,CA002WA0156,0.01,130.0,0.035,1.62,9.12,0.01,0.28,1.39,0.32,...,0.01,0.028,7.19,72.5,33.9,62.7,5.27,1.76,1.83,570


In [134]:
analysis_dict = pickle.load(open("analysis.dict","rb"))

In [165]:
mahalanobis_thresholds = pickle.load(open("mahalanobis_thresholds.dict","rb"))

In [166]:
mahalanobis_thresholds

{'Aquasearch Full Chemical Analysis': 1.8113237681715342,
 'Base Titanium Water Analysis': 2.9378947171123944,
 'E coli & Total Coliforms': 2.3326288306672254,
 'EAS 122018 Drinking Water Standard Analysis(SG)': 2.147653594609786,
 'Faecal Coliforms and Faecal E.Coli': 2.450020082834289,
 'Feacal Coliform (MPN) and E.Coli': 3.1801159251369375,
 'Standard Drinking Water + Free Chlorine Analysis': 2.5263932579595076,
 'Standard Drinking Water': 2.548448122482619,
 'Taita Water Analysis (Uni of Helsinki)': 2.3345469867079327,
 'Total Coliforms and Faecal E.Coli': 2.70237718592308,
 'Total Suspended Solids (Calculated)': 3.1438634766539,
 'Turbidity': 3.1438634766539,
 'Water Colour Analysis': 3.1438634766539,
 'WRMA Water Analysis': 3.631719882852156}

In [135]:
analysis_dict

{337: 'Total Suspended Solids',
 459: 'E coli & Total Coliforms',
 580: 'Total Suspended Solids',
 26552: 'E Coli & Total Coliforms',
 26585: 'Total Coliforms',
 26762: 'Total Bacteria Count',
 26773: 'Total Coliforms',
 26845: 'Total coliforms',
 49: 'Total Suspended Solids',
 52: 'Standard Drinking Water Analysis',
 154: 'Turbidity',
 180: 'Total Bacterial Count',
 184: 'Alkalinity in Water',
 187: 'Turbidity Analysis',
 216: 'Reverse Osmosis Water Use',
 263: 'E coli & Total Coliforms',
 269: 'Total Suspended Solids',
 270: 'Total Dissolved Solids',
 276: 'Heavy Metals in Water',
 288: 'Basic Drinking Water Analysis (WHO)',
 289: 'E Coli & Total Coliforms',
 308: 'Water Colour Analysis',
 315: 'Water Borne Diseases',
 324: 'Reverse Osmosis Water Use',
 371: 'Total Suspended Solids',
 379: 'Total Bacteria Count',
 555: 'Aquasearch Full Chemical Analysis',
 570: 'Total Coliforms and Faecal E.Coli',
 660: 'Iodine in Water',
 707: 'Standard Drinking Water Analysis (WHO)',
 719: 'Base Ti

In [136]:
_['analysis_name'] = [ analysis_dict[i] for i in _['analysis_id'] ]

In [137]:
_

Unnamed: 0,sample_code,ammonium,bicarbonate,boron,calcium,chlorides,copper,electrical_conductivity,iron,magnesium,...,phosphorus,potassium,silica,silicon__,sodium,sulphate,sulphur,zinc,analysis_id,analysis_name
0,CA002WA0156,0.01,130.0,0.035,1.62,9.12,0.01,0.28,1.39,0.32,...,0.028,7.19,72.5,33.9,62.7,5.27,1.76,1.83,570,Total Coliforms and Faecal E.Coli


In [169]:
result = []
for index,row in _.iterrows():
    scaler = pickle.load(open(f"models/scalers/{analysis}.pkl","rb"))
    pca = pickle.load(open(f"models/pca/{analysis}.pkl","rb"))

    pca_df = pd.read_csv(f"output/pca_df/{analysis}.csv",index_col=0)
    print(pca_df)

    sample_code = row['sample_code']
    analysis = row['analysis_name']
    analysis_id = row['analysis_id']
    tmp_df = pd.DataFrame(row).T.drop(["sample_code","analysis_id","analysis_name"], axis=1)    

    df_scaled = scaler.transform(tmp_df)
    df_pca = pd.DataFrame(pca.transform(df_scaled))

    mu = np.mean(pca_df, axis=0)
    sigma = np.cov(pca_df.T)

    mahalanobis_distance = distance.mahalanobis(df_pca.iloc[0], mu, np.linalg.inv(sigma))

    print(mahalanobis_distance)

    expected_md = mahalanobis_thresholds[analysis]
    print(expected_md)

    if mahalanobis_distance <= expected_md:
        result.append({"sample_code": sample_code,"is_outlier":False})
    else:
        result.append({"sample_code": sample_code,"is_outlier":True})
    

                        0         1
sample_code                        
CA002WA0156      0.455404 -1.684522
CA019WA0022     -2.678104  0.078408
CA053WA0032     -2.738159  0.564073
CA053WA0033     -2.738219  0.674791
CA053WA0034     -0.550419  0.261831
...                   ...       ...
CX001WA0056      1.959837 -2.315428
CZ017WA0001     -2.170468  0.127508
FA324-251WA0001  0.810630 -1.689447
I48-1-77WA0003  -2.753695  0.052034
I48-1-77WA0004  -2.760188 -0.004667

[348 rows x 2 columns]
1.0636497517330128
2.70237718592308


In [170]:
result

[{'sample_code': 'CA002WA0156', 'is_outlier': False}]