In [1]:
import psycopg2
import pandas as pd
import numpy as np
import mariadb
import json
import os
import shutil
import subprocess
from pathlib import Path
import pyodbc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
import pickle
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import seaborn as sns



In [2]:
water_df = pd.read_csv("output/water_analysis_cleaned.csv")

In [3]:
water_df.analysis_name.nunique()

39

In [4]:
water_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,sample_code,batch_date,analysis_name,chemical_name,result,unit_name
0,230,230,CO006WA0079,2012-07-19 00:00:00.000,Irrigation,sar,1.570,
1,312,312,CP052WA0005,2012-07-13 00:00:00.000,Irrigation,sar,1.090,
2,320,320,CP052WA0006,2012-07-13 00:00:00.000,Irrigation,sar,0.050,
3,335,335,CP052WA0007,2012-07-13 00:00:00.000,Irrigation,sar,1.280,
4,357,357,CP052WA0008,2012-07-13 00:00:00.000,Irrigation,sar,1.730,
...,...,...,...,...,...,...,...,...
1006750,446831,446831,CU014WA0017,2020-04-09 10:34:00.000,KS Drinking Water Standard Analysis,nitrite,0.005,ppm
1006751,447540,447540,CU014WA0019,2020-04-09 11:01:00.000,KS Drinking Water Standard Analysis,nitrite,0.005,ppm
1006752,476832,476832,CB210WA0001,2020-05-12 10:49:00.000,KS Drinking Water Standard Analysis,nitrite,0.003,ppm
1006753,477818,477818,CB210WA0002,2020-05-12 10:49:00.000,KS Drinking Water Standard Analysis,nitrite,0.003,ppm


In [5]:
# water_df = water_df.loc[water_df['analysis_name'] == 'Base Titanium Water Analysis']
water_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,sample_code,batch_date,analysis_name,chemical_name,result,unit_name
0,230,230,CO006WA0079,2012-07-19 00:00:00.000,Irrigation,sar,1.570,
1,312,312,CP052WA0005,2012-07-13 00:00:00.000,Irrigation,sar,1.090,
2,320,320,CP052WA0006,2012-07-13 00:00:00.000,Irrigation,sar,0.050,
3,335,335,CP052WA0007,2012-07-13 00:00:00.000,Irrigation,sar,1.280,
4,357,357,CP052WA0008,2012-07-13 00:00:00.000,Irrigation,sar,1.730,
...,...,...,...,...,...,...,...,...
1006750,446831,446831,CU014WA0017,2020-04-09 10:34:00.000,KS Drinking Water Standard Analysis,nitrite,0.005,ppm
1006751,447540,447540,CU014WA0019,2020-04-09 11:01:00.000,KS Drinking Water Standard Analysis,nitrite,0.005,ppm
1006752,476832,476832,CB210WA0001,2020-05-12 10:49:00.000,KS Drinking Water Standard Analysis,nitrite,0.003,ppm
1006753,477818,477818,CB210WA0002,2020-05-12 10:49:00.000,KS Drinking Water Standard Analysis,nitrite,0.003,ppm


In [6]:
water_df.sort_values("batch_date")

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,sample_code,batch_date,analysis_name,chemical_name,result,unit_name
739321,22111,22111,CM003WA0024,2010-08-25 00:00:00.000,Standard Drinking Water Analysis,fluorides,0.30,ppm
377993,22108,22108,CM003WA0024,2010-08-25 00:00:00.000,Feacal Coliforms (MPN) and E.Coli,boron,0.01,ppm
464223,22125,22125,CM003WA0024,2010-08-25 00:00:00.000,Feacal Coliforms (MPN) and E.Coli,manganese,0.05,ppm
729643,22117,22117,CM003WA0024,2010-08-25 00:00:00.000,Standard Drinking Water Analysis,calcium,9.42,ppm
488615,22116,22116,CM003WA0024,2010-08-25 00:00:00.000,Feacal Coliforms (MPN) and E.Coli,ammonium,0.01,ppm
...,...,...,...,...,...,...,...,...
932687,392769,392769,CS333WA0517,2024-08-07 11:37:22.000,Standard Drinking Water Analysis (WHO),silicon__,33.40,ppm
935714,398670,398670,CS333WA0517,2024-08-07 11:37:22.000,Standard Drinking Water Analysis (WHO),total_coliforms,70.00,mpn/100ml
931118,392966,392966,CS333WA0517,2024-08-07 11:37:22.000,Standard Drinking Water Analysis (WHO),ammonium,0.21,ppm
928797,392970,392970,CS333WA0517,2024-08-07 11:37:22.000,Standard Drinking Water Analysis (WHO),sodium,150.00,ppm


In [7]:
water_df = water_df.sort_values('sample_code')

In [8]:
new_samples_df = water_df.loc[water_df['batch_date'] > '2024-05-21'] 
water_df = water_df.loc[water_df['batch_date'] <= '2024-05-21'] 

In [9]:
water_df.analysis_name.unique()

array(['Irrigation', 'Reverse Osmosis Water Use',
       'Feacal Coliforms (MPN) and E.Coli', 'Standard Drinking Water',
       'Standard Drinking Water Analysis (WHO)',
       'Total Coliform and Faecal E.Coli', 'Heavy Metal Analysis (Water)',
       'Standard Drinking Water Analysis',
       'Basic Drinking Water Analysis', 'Heavy Metals in Water',
       'Total Dissolved Solids', 'Water Colour Analysis', 'Turbidity',
       'Total Suspended Solids', 'Basic Drinking Water Analysis (WHO)',
       'Feacal Coliform (MPN) and E.Coli', 'Post Harvest Water Analysis',
       'Total Bacterial Count', 'Total Coliforms and Faecal E.Coli',
       'Total Bacteria Count', 'Total Suspended Solids (Calculated)',
       'KEBS Drinking Water Microbiology', 'Chemical Oxygen Demand',
       'Chemical & Biochemical Oxygen Demand', 'Water Borne Diseases',
       'E coli & Total Coliforms', 'Water Microbiology (KEBS)',
       'KEBS Drinking Water Standard Analysis',
       'Colloidal Silica in Water', 'Ba

In [10]:
water_df['analysis_name'].nunique()

39

In [11]:
new_samples_df['analysis_name'].nunique()

6

In [12]:
new_samples_df[(new_samples_df['analysis_name'].isin(water_df['analysis_name'].unique()))].analysis_name.nunique()

6

In [13]:
water_df['analysis_name'] = [i.replace(":","") for i in water_df['analysis_name']]

In [14]:
water_df['analysis_name'].value_counts()

Irrigation                                          374744
Feacal Coliforms (MPN) and E.Coli                   136524
Basic Drinking Water Analysis                       121173
Standard Drinking Water Analysis                    111316
Standard Drinking Water                              63749
Reverse Osmosis Water Use                            44505
Base Titanium Water Analysis                         35185
Total Coliform and Faecal E.Coli                     28720
Standard Drinking Water Analysis (WHO)               19280
Heavy Metal Analysis (Water)                         16642
Total Dissolved Solids                               11430
Taita Water Analysis (Uni of Helsinki)                8256
Water Colour Analysis                                 7449
Heavy Metals in Water                                 6250
Feacal Coliform (MPN) and E.Coli                      4776
Total Suspended Solids (Calculated)                   2871
Turbidity                                             28

In [15]:
for analysis in np.unique(water_df['analysis_name']):
    print(analysis)
    df_ = water_df.loc[water_df['analysis_name']==analysis]
    df_ = pd.pivot_table(data=df_, values="result", index="sample_code", columns="chemical_name")
    print(len(df_))

    os.makedirs(f"output/boxplots/{analysis}",exist_ok=True)
    for column in df_.columns:
        plt.boxplot(df_[column])
        plt.savefig(f"output/boxplots/{analysis}/{column}.png")
        plt.clf()
        outlier_threshold = df_[column].quantile(0.99)
        # df_ = df_.loc[df_[column] <= outlier_threshold]
    df_.describe().to_csv(f"output/chemical_null_count/{analysis}.csv")
    print(len(df_))
    if(len(df_) == 0):
        continue
    
    df_.to_csv(f"output/analysis/{analysis}.csv")
    

Aquasearch Full Chemical Analysis
191
191
BT Extended Water Quality Analysis
28
28
Base Titanium Water Analysis
1548
1548
Basic Drinking Water Analysis
2720
2720
Basic Drinking Water Analysis (WHO)
338
338
Chemical & Biochemical Oxygen Demand
160
160
Chemical Oxygen Demand
79
79
Colloidal Silica in Water
30
30
E coli & Total Coliforms
105
105
Feacal Coliform (MPN) and E.Coli
214
214
Feacal Coliforms (MPN) and E.Coli
3344
3344
Free Chlorine
77
77
Heavy Metal Analysis (Water)
624
624
Heavy Metals in Water
654
654
Irrigation
6632
6632
KEBS Drinking Water Microbiology
71
71
KEBS Drinking Water Standard Analysis
94
94
KS Drinking Water Standard Analysis
19
19
Oil & Grease
63
63
Post Harvest Water Analysis
121
121
Reverse Osmosis Water Use
1785
1785
Standard Drinking Water
1420
1420
Standard Drinking Water + Free Chlorine Analysis
43
43
Standard Drinking Water Analysis
2473
2473
Standard Drinking Water Analysis (WHO)
907
907
Taita Water Analysis (Uni of Helsinki)
516
516
Total Bacteria Count

<Figure size 640x480 with 0 Axes>

In [16]:
# for analysis_file in os.listdir("output/analysis"):
#     os.makedirs("./output/pairplots",exist_ok=True)
#     analysis_df = pd.read_csv(f"output/analysis/{analysis_file}",index_col=0)
#     print(analysis_file.replace('.csv',''))
#     sns.pairplot(analysis_df)
#     print('Saving')
#     plt.savefig(f"output/pairplots/{analysis_file.replace('.csv','')}.png")
#     print('Saved')
#     plt.clf()

In [17]:
for analysis_file in os.listdir("output/analysis"):
    print(analysis_file)
    analysis_df = pd.read_csv(f"output/analysis/{analysis_file}",index_col=0)
    print(len(analysis_df.columns))
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    analysis_df = pd.DataFrame(imp_mean.fit_transform(analysis_df), columns=analysis_df.columns, index=analysis_df.index)
    pickle.dump(imp_mean, open(f"models/imputers/{analysis_file.replace('csv','pkl')}","wb"))
    # if(len(analysis_df) == 0):
    #     continue
    sc = StandardScaler()
    pca = PCA()
    analysis_scaled = sc.fit_transform(analysis_df)
    pca.fit(analysis_scaled)
    pca_explained_variance_df = pd.DataFrame(zip(analysis_df.columns,pca.explained_variance_ratio_))
    pca_explained_variance_df.to_csv(f"output/pca_explained_variance_ratio/{analysis_file}")
    pca_explained_variance_df = pca_explained_variance_df.loc[pca_explained_variance_df[1]>0.1]
    n_components = len(pca_explained_variance_df)
    
    pca_chems = pca_explained_variance_df[0]
    
    pca = PCA(n_components=n_components)
    analysis_scaled = sc.fit_transform(analysis_df)
    pca_reduced_df = pca.fit_transform(analysis_scaled)
    analysis_scaled = pd.DataFrame(analysis_scaled,index=analysis_df.index)
    pca_reduced_df = pd.DataFrame(pca_reduced_df,index=analysis_df.index, columns=pca_chems)
    analysis_scaled.to_csv(f"output/analysis_scaled/{analysis_file}")
    pca_reduced_df.to_csv(f"output/pca_df/{analysis_file}")
    pickle.dump(sc, open(f"models/scalers/{analysis_file.replace('csv','pkl')}","wb"))
    pickle.dump(pca, open(f"models/pca/{analysis_file.replace('csv','pkl')}","wb"))

Aquasearch Full Chemical Analysis.csv
4
Base Titanium Water Analysis.csv
23
Basic Drinking Water Analysis (WHO).csv
2
Basic Drinking Water Analysis.csv
43
BT Extended Water Quality Analysis.csv
35
Chemical & Biochemical Oxygen Demand.csv
20
Chemical Oxygen Demand.csv
3
Colloidal Silica in Water.csv
3
E coli & Total Coliforms.csv
2
Feacal Coliform (MPN) and E.Coli.csv
26
Feacal Coliforms (MPN) and E.Coli.csv
37
Free Chlorine.csv
5
Heavy Metal Analysis (Water).csv
35
Heavy Metals in Water.csv
15
Irrigation.csv
41
KEBS Drinking Water Microbiology.csv
6
KEBS Drinking Water Standard Analysis.csv
4
KS Drinking Water Standard Analysis.csv
3
Oil & Grease.csv
8
Post Harvest Water Analysis.csv
3
Reverse Osmosis Water Use.csv
35
Standard Drinking Water + Free Chlorine Analysis.csv
2
Standard Drinking Water Analysis (WHO).csv
25
Standard Drinking Water Analysis.csv
31
Standard Drinking Water.csv
26
Taita Water Analysis (Uni of Helsinki).csv
16
Total Bacteria Count.csv
8
Total Bacterial Count.csv
4

In [18]:
# pd.DataFrame([
#     {
#         'sample_code': 'CB057WA0107',
#         'ammonium': {'result':3.936512250255515, 'units':'ppm'},
#         'bicarbonate': {'result':3.936512250255515, 'units':'ppm'},
#         'boron': {'result':3.936512250255515, 'units':'ppm'}
#     }
# ])

In [19]:
# pca_reduced_df.reset_index().to_dict('records')

In [20]:
for analysis_file in os.listdir("output/pca"):
    print(analysis_file)

Aquasearch Full Chemical Analysis.csv
Base Titanium Water Analysis.csv
Basic Drinking Water Analysis (WHO).csv
Basic Drinking Water Analysis.csv
BT Extended Water Quality Analysis.csv
Chemical & Biochemical Oxygen Demand.csv
Chemical Oxygen Demand.csv
Colloidal Silica in Water.csv
E coli & Total Coliforms.csv
Feacal Coliform (MPN) and E.Coli.csv
Feacal Coliforms (MPN) and E.Coli.csv
Free Chlorine.csv
Heavy Metal Analysis (Water).csv
Heavy Metals in Water.csv
Irrigation.csv
KEBS Drinking Water Microbiology.csv
KEBS Drinking Water Standard Analysis.csv
KS Drinking Water Standard Analysis.csv
Oil & Grease.csv
Post Harvest Water Analysis.csv
Reverse Osmosis Water Use.csv
Standard Drinking Water + Free Chlorine Analysis.csv
Standard Drinking Water Analysis (WHO).csv
Standard Drinking Water Analysis.csv
Standard Drinking Water.csv
Taita Water Analysis (Uni of Helsinki).csv
Total Bacteria Count.csv
Total Bacterial Count.csv
Total Chlorine.csv
Total Coliform and Faecal E.Coli.csv
Total Colifor

In [21]:
for analysis_file in os.listdir("output/pca_df"):
    print(analysis_file)
    pca_reduced_df = pd.read_csv(f"output/pca_df/{analysis_file}",index_col=0)
    # if len(pca_reduced_df.columns) < 2:
        # continue
    mu = np.mean(pca_reduced_df, axis=0)
    sigma = np.cov(pca_reduced_df.T)

    try:
        pca_reduced_df['mahalanobis_distance'] = [distance.mahalanobis(pca_reduced_df.iloc[i], mu, np.linalg.inv(sigma)) for i in range(len(pca_reduced_df)) ]
    except:
        continue
    pca_reduced_df.to_csv(f"output/mahalanobis_distance/{analysis_file}")

Aquasearch Full Chemical Analysis.csv
Base Titanium Water Analysis.csv
Basic Drinking Water Analysis (WHO).csv
Basic Drinking Water Analysis.csv
BT Extended Water Quality Analysis.csv
Chemical & Biochemical Oxygen Demand.csv
Chemical Oxygen Demand.csv
Colloidal Silica in Water.csv
E coli & Total Coliforms.csv
Feacal Coliform (MPN) and E.Coli.csv
Feacal Coliforms (MPN) and E.Coli.csv
Free Chlorine.csv
Heavy Metal Analysis (Water).csv
Heavy Metals in Water.csv
Irrigation.csv
KEBS Drinking Water Microbiology.csv
KEBS Drinking Water Standard Analysis.csv
KS Drinking Water Standard Analysis.csv
Oil & Grease.csv
Post Harvest Water Analysis.csv
Reverse Osmosis Water Use.csv
Standard Drinking Water + Free Chlorine Analysis.csv
Standard Drinking Water Analysis (WHO).csv
Standard Drinking Water Analysis.csv
Standard Drinking Water.csv
Taita Water Analysis (Uni of Helsinki).csv
Total Bacteria Count.csv
Total Bacterial Count.csv
Total Chlorine.csv
Total Coliform and Faecal E.Coli.csv
Total Colifor

In [22]:
pca_reduced_df

Unnamed: 0_level_0,total_viable_count_@37
sample_code,Unnamed: 1_level_1
CA361WA0008,-0.814164
CA361WA0009,-0.778229
CA361WA0010,0.736775
CA361WA0011,2.05917
CA361WA0012,0.248064
CA361WA0013,-0.726483
CB050WA0037,-0.805252
CB050WA0038,-0.694861
CB050WA0039,-0.594244
CR211WA0005,1.369225


In [23]:
mahalanobis_threshold_dict = {}
for analysis_file in os.listdir("output/mahalanobis_distance"):
    print(analysis_file)
    analysis = analysis_file.replace(".csv","")
    mahalanobis_df = pd.read_csv(f"output/mahalanobis_distance/{analysis_file}",index_col=0)
    upper_quantile = (mahalanobis_df['mahalanobis_distance'].quantile(0.95))
    mahalanobis_threshold_dict[analysis_file.replace(".csv","")] = upper_quantile
    mahalanobis_df.loc[mahalanobis_df['mahalanobis_distance'] >= upper_quantile].to_csv(f"output/mahanobis_distance_upper_quantile/{analysis_file}")
    plt.boxplot(mahalanobis_df['mahalanobis_distance'])
    plt.savefig(f"output/mahalanobis_boxplots/{analysis}.png")
    plt.clf()
pickle.dump(mahalanobis_threshold_dict, open("mahalanobis_thresholds.dict","wb"))

Aquasearch Full Chemical Analysis.csv
Base Titanium Water Analysis.csv
Basic Drinking Water Analysis (WHO).csv
BT Extended Water Quality Analysis.csv
Chemical & Biochemical Oxygen Demand.csv
Chemical Oxygen Demand.csv
Colloidal Silica in Water.csv
E coli & Total Coliforms.csv
Feacal Coliform (MPN) and E.Coli.csv
Free Chlorine.csv
Heavy Metal Analysis (Water).csv
Heavy Metals in Water.csv
Irrigation.csv
KEBS Drinking Water Microbiology.csv
KEBS Drinking Water Standard Analysis.csv
KS Drinking Water Standard Analysis.csv
Oil & Grease.csv
Reverse Osmosis Water Use.csv
Standard Drinking Water + Free Chlorine Analysis.csv
Standard Drinking Water Analysis.csv
Standard Drinking Water.csv
Taita Water Analysis (Uni of Helsinki).csv
Total Bacteria Count.csv
Total Bacterial Count.csv
Total Chlorine.csv
Total Coliform and Faecal E.Coli.csv
Total Coliforms and Faecal E.Coli.csv
Total Suspended Solids (Calculated).csv
Total Suspended Solids.csv
Turbidity Analysis.csv
Turbidity.csv
Water Borne Diseas

<Figure size 640x480 with 0 Axes>

In [None]:
len(mahalanobis_threshold_dict.keys())

In [25]:
new_samples_df.analysis_name.unique()

array(['Standard Drinking Water Analysis (WHO)',
       'KEBS Drinking Water Microbiology',
       'Basic Drinking Water Analysis (WHO)', 'Irrigation',
       'Standard Drinking Water + Free Chlorine Analysis',
       'Reverse Osmosis Water Use'], dtype=object)

In [26]:
conn_lims = pyodbc.connect("Driver={SQL Server};"
                            "Server=192.168.5.18\CROPNUT;"
                            "Database=cropnuts;"
                            "uid=thomasTsuma;pwd=GR^KX$uRe9#JwLc6")

In [27]:
reverse_analysis_dict = pd.read_sql(f"""SELECT analysis_id, LTRIM(RTRIM(analysis_name)) as analysis_name FROM Analysis ORDER BY analysis_name""",con=conn_lims).set_index("analysis_name").to_dict()['analysis_id']
reverse_analysis_dict



{'% Gypsum': 540,
 '% Nitrogen (RSSP 2)': 647,
 '% Organic Matter': 258,
 '% Organic Matter (RSSP 2)': 646,
 '% Soil Nitrogen': 18,
 '%Assay': 27235,
 '%N (RSSP 2)': 652,
 '%OM (RSSP 2)': 651,
 '%P,%S Analysis(Super Calcium)': 383,
 '1:2 Soil Volume Extract': 294,
 '1:2 soil volume extract': 25,
 '1:2 Soil Volume Extract (Data Only)': 27339,
 '1:2 Vol Extract for BLGG': 317,
 '20:12:12 (N,P205,K20) Analysis': 840,
 'Absorbance at 254nm': 26936,
 'Acid Detergent Fibre': 27063,
 'Acid Insoluble Ash': 27417,
 'Acid Insoluble Matter': 27108,
 'Acid titration': 26647,
 'Acid Value': 26971,
 'Advanced Biological Farming Soil Audit': 26521,
 'Advanced Soil Health Analysis': 27181,
 'Aerobic Mesophilic Count': 26649,
 'Aflatoxin': 26794,
 'Aflatoxin AFB1': 26938,
 'Aflatoxin B1': 26939,
 'Aflatoxin in feed': 767,
 'Aflatoxin Total': 26753,
 'AfSIS Standard Leaf Analysis': 633,
 'AFSIS Std Wet Chemistry Soil Analysis': 589,
 'Aggregate Stability': 26727,
 'Agrifi Soil Microbiome Analysis': 2729

In [28]:
len(reverse_analysis_dict.keys())

1323

In [29]:
test = []

In [30]:
for sample in new_samples_df.sample_code.unique():
    res = {}
    tmp_ = new_samples_df.loc[new_samples_df.sample_code == sample]
    res['sample_code'] = sample
    print(tmp_.analysis_name.unique())
    res['analysis_id'] = [ reverse_analysis_dict[i] for i in tmp_.analysis_name.unique() if i in reverse_analysis_dict.keys() ]
    for index,row in tmp_.iterrows():
        if row['result'] >= 0 :
            res[row['chemical_name']] = {'result': row['result'], 'units': str(row['unit_name'])}
        else:
            res[row['chemical_name']] = {'result': 0, 'units': row['unit_name']}
    test.append(res)    


['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['KEBS Drinking Water Microbiology']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water Analysis (WHO)']
['Basic Drinking Water Analysis (WHO)' 'KEBS Drinking Water Microbiology']
['Standard Drinking Water Analysis (WHO)']
['Irrigation']
['Standard Drinking Water Analysis (WHO)']
['Standard Drinking Water + Free Chlorine Analysis']
['Standard Drinking Water Analysis (WHO)']
['Irrigation']
['Irrigation']
['Reverse Osmosis Water Use']
['Standard Drinking Water Ana

In [31]:
test

[{'sample_code': 'CA028WA0129',
  'analysis_id': [707],
  'potassium': {'result': 11.4, 'units': 'ppm'},
  'magnesium': {'result': 8.36, 'units': 'ppm'},
  'calcium': {'result': 14.8, 'units': 'ppm'},
  'sodium': {'result': 40.2, 'units': 'ppm'},
  'electrical_conductivity': {'result': 0.34, 'units': 'mS cm -1'},
  'sulphur': {'result': 4.61, 'units': 'ppm'},
  'phosphorus': {'result': 0.1, 'units': 'ppm'},
  'ammonium': {'result': 0.01, 'units': 'ppm'},
  'silicon__': {'result': 28.7, 'units': 'ppm'},
  'nitrate_n': {'result': 6.03, 'units': 'ppm'},
  'silica': {'result': 61.4, 'units': 'ppm'}},
 {'sample_code': 'CA028WA0130',
  'analysis_id': [707],
  'hardness': {'result': 68.6, 'units': 'ppm'},
  'silica': {'result': 58.8, 'units': 'ppm'},
  'phosphorus': {'result': 0.093, 'units': 'ppm'},
  'zinc': {'result': 0.01, 'units': 'ppm'},
  'calcium': {'result': 14.4, 'units': 'ppm'},
  'sulphate': {'result': 12.1, 'units': 'ppm'},
  'bicarbonate': {'result': 132.0, 'units': 'ppm'},
  's

In [32]:
str(test).replace("'", '"')

'[{"sample_code": "CA028WA0129", "analysis_id": [707], "potassium": {"result": 11.4, "units": "ppm"}, "magnesium": {"result": 8.36, "units": "ppm"}, "calcium": {"result": 14.8, "units": "ppm"}, "sodium": {"result": 40.2, "units": "ppm"}, "electrical_conductivity": {"result": 0.34, "units": "mS cm -1"}, "sulphur": {"result": 4.61, "units": "ppm"}, "phosphorus": {"result": 0.1, "units": "ppm"}, "ammonium": {"result": 0.01, "units": "ppm"}, "silicon__": {"result": 28.7, "units": "ppm"}, "nitrate_n": {"result": 6.03, "units": "ppm"}, "silica": {"result": 61.4, "units": "ppm"}}, {"sample_code": "CA028WA0130", "analysis_id": [707], "hardness": {"result": 68.6, "units": "ppm"}, "silica": {"result": 58.8, "units": "ppm"}, "phosphorus": {"result": 0.093, "units": "ppm"}, "zinc": {"result": 0.01, "units": "ppm"}, "calcium": {"result": 14.4, "units": "ppm"}, "sulphate": {"result": 12.1, "units": "ppm"}, "bicarbonate": {"result": 132.0, "units": "ppm"}, "sulphur": {"result": 4.03, "units": "ppm"},

In [None]:
reverse_analysis_dict[ 'KEBS Drinking Water Microbiology']

In [None]:
new_samples_df.analysis_name.unique()

In [None]:
mahalanobis_threshold_dict['Standard Drinking Water Analysis (WHO)']

In [None]:
mahalanobis_threshold_dict.keys()

In [None]:
water_ = new_samples_df.loc[new_samples_df['analysis_name'].isin(mahalanobis_thresholds.keys())]

In [None]:
water_

In [None]:
water_ = water_.sort_values(by="sample_code")

In [None]:
water_ = water_.loc[(water_['sample_code'] == 'CD048WA0110')]

In [None]:
water_

In [None]:
water_pivot = pd.pivot_table(data = water_, columns = "chemical_name", values = "result", index="sample_code").reset_index()

In [None]:
water_

In [None]:
water_pivot['analysis_id'] = 570

In [None]:
req = water_pivot.to_dict(orient="records")

In [None]:
req

In [None]:
import psycopg2

conn_lims = pyodbc.connect("Driver={SQL Server};"
                            "Server=192.168.5.18\CROPNUT;"
                            "Database=cropnuts;"
                            "uid=thomasTsuma;pwd=GR^KX$uRe9#JwLc6")

In [None]:
analysis_dict = pd.read_sql(f"""SELECT analysis_id, LTRIM(RTRIM(analysis_name)) as analysis_name FROM Analysis  ORDER BY analysis_name""",con=conn_lims).set_index("analysis_id").to_dict()['analysis_name']


In [None]:
analysis_dict

In [None]:
pickle.dump(analysis_dict, open("analysis.dict","wb"))

In [None]:
req = [
    {
      'sample_code': 'CD048WA0110',
      'calcium': {'result':13.4, "units":"ppm"},
      'electrical_conductivity': {"result":0.28, "units":"mS cm -1"},
      'magnesium': {'result':3.9, "units":"ppm"},
      'analysis_id': [288,27032]
    }]


In [None]:
req

In [None]:
_ = pd.DataFrame(req)

In [None]:
_

In [None]:
analysis_dict = pickle.load(open("analysis.dict","rb"))

In [None]:
analysis_dict

In [None]:
mahalanobis_thresholds = pickle.load(open("mahalanobis_thresholds.dict","rb"))

In [None]:
(mahalanobis_thresholds.keys())

In [None]:
unit_decision = pd.read_csv("output/water_unit_per_chemical_decision.csv")

In [None]:
unit_decision

In [33]:
req_body = test
_ = pd.DataFrame(req_body)
unit_decision = pd.read_csv("water_unit_per_chemical_decision.csv")
analysis_dict = pickle.load(open("analysis.dict","rb"))
mahalanobis_thresholds = pickle.load(open("mahalanobis_thresholds.dict","rb"))

result = {}
_df = pd.DataFrame()
for index, row in _.iterrows():
    analyses = row['analysis_id']
    for analysis_ in analyses:
        df_analysis_ = pd.DataFrame(row).T
        df_analysis_['analysis_name'] = analysis_dict[analysis_]
        _df = pd.concat([_df, df_analysis_])
for index,row in _df.iterrows():
    sample_code = row['sample_code']
    analysis = row['analysis_name']
    analysis_id = row['analysis_id']
    if sample_code not in result.keys():
        result[sample_code] = []
    if analysis not in mahalanobis_thresholds.keys():
        result[sample_code].append({"sample_code": sample_code,"status":"warning", "message": f"Analysis not in models", "details": f"Analysis: {analysis} is not in the list of defined models" })   
        continue     
    scaler = pickle.load(open(f"scalers/{analysis}.pkl","rb"))
    pca = pickle.load(open(f"pca/{analysis}.pkl","rb"))

    pca_df = pd.read_csv(f"pca_df/{analysis}.csv",index_col=0)

    

    try :
        tmp_df = pd.DataFrame(row).T[pca_df.columns]   
        print(tmp_df)
    except:
        result[sample_code].append({"sample_code": sample_code,"status":"warning", "message": f"Missing parameters for analysis_id: {analysis}", "details": f"Expected parameters are {','.join(pca_df.columns)} for analysis: {analysis}" })
        continue
    failed_units_comparison = {}    
    for col in pca_df.columns:
        expected_units = unit_decision.loc[(unit_decision['crop'] == analysis) & (unit_decision['chemical_name'] == col)]
        print(expected_units[['crop','chemical_name','unit_name']].to_dict())
        if row[col]['units'] !=   expected_units['unit_name'].values[0] :
            failed_units_comparison[col] = expected_units[['crop','chemical_name','unit_name']].to_dict()
        else:
            row[col] = row[col]['result']
    if len(failed_units_comparison.keys()) > 0:
        result[sample_code].append({"sample_code": sample_code,"status":"warning", "message": f"Wrong units provided", "details": f"Expected units are {str(failed_units_comparison)} for analysis: {analysis}" })
        continue

    tmp_df = pd.DataFrame(row).T[pca_df.columns]    
    
    # tmp_df = 
    df_scaled = scaler.transform(tmp_df)
    df_pca = pd.DataFrame(pca.transform(df_scaled))

    mu = np.mean(pca_df, axis=0)
    sigma = np.cov(pca_df.T)

    mahalanobis_distance = distance.mahalanobis(df_pca.iloc[0], mu, np.linalg.inv(sigma))

    print(mahalanobis_distance)

    expected_md = mahalanobis_thresholds[analysis]
    print(expected_md)

    if mahalanobis_distance > expected_md:
        result[sample_code].append({"sample_code": sample_code,"status":"fail", "message": "Mahalanobis distance exceeds threshold", "description":f"Mahalanobis distance of {mahalanobis_distance} exceeds threshold of {expected_md} for analysis: {analysis}" })
    else:
        result[sample_code].append({"sample_code": sample_code,"status":"pass","message": "Mahalanobis distance within threshold", "description":f"Mahalanobis distance of {mahalanobis_distance} is within threshold of {expected_md} for analysis: {analysis}" })
            


                             calcium                        magnesium
15  {'result': 13.4, 'units': 'ppm'}  {'result': 3.9, 'units': 'ppm'}
{'crop': {549: 'Basic Drinking Water Analysis (WHO)'}, 'chemical_name': {549: 'calcium'}, 'unit_name': {549: 'ppm'}}
{'crop': {550: 'Basic Drinking Water Analysis (WHO)'}, 'chemical_name': {550: 'magnesium'}, 'unit_name': {550: 'ppm'}}


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- ammonium
- bicarbonate
- boron
- chloride
- copper
- ...


In [None]:
result

In [None]:
str([1,2,3,4])