In [1]:
import os

import json
import joblib
import glob
import shutil
import numpy as np
import pandas as pd
from pathlib import Path
import tensorflow as tf
print(f'tensorflow version : {tf.version.VERSION}')
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.models import Sequential, Model
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.metrics import *

tensorflow version : 2.9.1


In [2]:
# script should be in the same directory as the notebook
!pip install utils
import utils




In [3]:
def mean_dupes_drop_singles_filter_by_std( df, std_thres=0.03, filt_zeros=True):
    """ Within the SPC samples,  duplicates should be averaged, not dropped.
    This function:

    - averages the duplicates
    - calculates std over column
    -  filters the samples out with high std (high std could mean collection error)
    -  filters the samples out with 0 std (not a true duplicate)

    Parameters
    ----------
    :param: ``df`` : ``pd.DataFrame``
        Dataframe of SPC data.
    :param: ``std_thres`` : ``float``
        standard deviation threshold for acceptable data. Default = 0.01.
    :param: ``filt_zeros`` : ``bool``
        If True, filter out sample duplicates with a standard deviation of 0 --> unrealistic!

    Returns
    -------
    :return: ```pd.DataFrame``
        Dataframe in better shape.

    """
    # copy dataframe for std calculation
    df_std = df.copy(deep=True)
#     cls.__log.info("Shape of initial dataframe: %s", np.shape(df_std))

    # average duplicates
    df = df.groupby(df.index).mean()

    # calculate average std over column, maintain index
    df_std = df_std.groupby(df_std.index).std()
    # failed_indices = [(2138, 61433) ]
    # print(df_std.loc[failed_indices])
    # df_std =df_std.loc[failed_indices]
    # print(df.std(axis=0).sort_values(ascending =False).head(30))
    # calculate avg std series
    df_std = df_std.mean(axis=1)

    # create dataframe from avg std series
    df_std = pd.DataFrame(
        index=df_std.index.values, columns=["std"], data=df_std.values
    )
    print(df_std.sort_values(by='std',ascending =False).head())
    # print(df_std)
    df_std = df_std.dropna()
    # failed_indices = [(2138, 61433) ]  
    # df_std.loc[failed_indices]

#     cls.__log.info("Shape of dataframe w/o NaNs: %s", np.shape(df_std))

    # filter df by std, removing below thresh
    df_std = df_std[df_std["std"] < std_thres]
#     cls.__log.info("Shape of thresholded dataframe: %s", np.shape(df_std))

    if filt_zeros:
        df_std = df_std[df_std["std"] > 0]
#         cls.__log.info(
#             "Shape of thresholded dataframe with filtered zeros: %s",
#             np.shape(df_std),
#         )

    # merge the dataframes by index
    df_mrgd = pd.merge(df, df_std, left_index=True, right_index=True, how="outer")
#     cls.__log.info("Shape of merged dataframe by index: %s", np.shape(df_mrgd))

    # drop na's by row
    df_mrgd = df_mrgd.dropna()
#     cls.__log.info("Shape of merged dataframe w/o NaNs: %s", np.shape(df_mrgd))

    # drop std column
    df_mrgd = df_mrgd.drop(["std"], axis=1)

    return df_mrgd


In [23]:
# path to the output of opus to csv tool AB_quantized_spectra.csv 
path_to_ab_spectra = Path('../DS-ML69 product1 spectra/20221206-103534')

# path_to_ab_spectra = Path('DS-AD_6_Bruker/20221122-140926')

df_spectra_1 = pd.read_csv(path_to_ab_spectra / 'AB_quantized_spectra.csv',index_col=0)

In [24]:
df_spectra_1

Unnamed: 0_level_0,520,522,524,526,528,530,532,534,536,538,...,3956,3958,3960,3962,3964,3966,3968,3970,3972,3974
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGD901-4MN0003_Mineral Lick_ATR_20221121_150713_001,0.067645,0.066111,0.065489,0.065464,0.065696,0.065960,0.066090,0.066015,0.065722,0.065310,...,0.009480,0.009520,0.009531,0.009510,0.009473,0.009452,0.009458,0.009472,0.009476,0.009452
AGD901-4MN0003_Mineral Lick_ATR_20221121_150742_002,0.067439,0.065602,0.064870,0.064948,0.065441,0.065957,0.066147,0.065910,0.065275,0.064506,...,0.009734,0.009773,0.009788,0.009777,0.009754,0.009744,0.009757,0.009784,0.009808,0.009810
AGD901-4MN0004_Screen_ATR_20221121_155612_001,0.047662,0.046840,0.045623,0.044372,0.043575,0.043318,0.043508,0.043811,0.043836,0.043458,...,0.014057,0.014034,0.013991,0.013948,0.013926,0.013942,0.013991,0.014040,0.014064,0.014046
AGD901-4MN0004_Screen_ATR_20221121_155647_002,0.043126,0.042197,0.041736,0.041672,0.041898,0.042213,0.042400,0.042342,0.041981,0.041378,...,0.014024,0.013994,0.013957,0.013912,0.013870,0.013853,0.013868,0.013910,0.013965,0.014015
AGD901-4MN0005_Screen_ATR_20221121_160028_001,0.021856,0.023238,0.024949,0.026894,0.028964,0.030894,0.032384,0.033270,0.033499,0.033393,...,0.011783,0.011809,0.011824,0.011819,0.011804,0.011806,0.011831,0.011863,0.011880,0.011862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FA807-592MN0004_Screen_ATR_20221122_091411_002,0.131186,0.134238,0.137313,0.140703,0.144641,0.148986,0.153456,0.157710,0.161406,0.164375,...,0.015314,0.015264,0.015224,0.015224,0.015273,0.015356,0.015451,0.015527,0.015571,0.015592
FA807-596MN0001_Mineral Lick_ATR_20221121_145656_001,0.384920,0.389439,0.395161,0.402252,0.410303,0.417779,0.422826,0.424059,0.420413,0.411792,...,0.012868,0.012873,0.012861,0.012827,0.012782,0.012754,0.012757,0.012782,0.012812,0.012824
FA807-596MN0001_Mineral Lick_ATR_20221121_145724_002,0.392781,0.395536,0.399638,0.405222,0.411957,0.418707,0.424012,0.426147,0.423457,0.415502,...,0.013207,0.013192,0.013155,0.013102,0.013051,0.013028,0.013042,0.013080,0.013125,0.013153
MN0001_Mineral lick_ATR_20221122_090056_001,0.070285,0.072497,0.074055,0.075772,0.078584,0.082950,0.089085,0.096710,0.105333,0.114294,...,0.009449,0.009423,0.009424,0.009444,0.009475,0.009514,0.009558,0.009599,0.009631,0.009640


In [25]:
# extract sample code
df_spectra_1.index = df_spectra_1.index.str.extract('([C]\w{3,5}SA\w{2,5}|[F]\w{4,5}[-]{1}\w{6,9}|A\w{5,7}[-]{1}\w{7,10}|\w{3}[-]{1}\w{1,2}[-]{1}\w{7,10}|\w{3}[-]{1}\w{2}[-]{1}\w{4,6})', expand=False)

In [26]:
df_spectra_1 = df_spectra_1.reset_index()
df_spectra_1.dropna(axis=0, inplace=True)
df_spectra_1 = df_spectra_1.set_index("sample_id")

In [27]:
df_spectra_1

Unnamed: 0_level_0,520,522,524,526,528,530,532,534,536,538,...,3956,3958,3960,3962,3964,3966,3968,3970,3972,3974
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGD901-4MN0003_Mi,0.067645,0.066111,0.065489,0.065464,0.065696,0.06596,0.06609,0.066015,0.065722,0.06531,...,0.00948,0.00952,0.009531,0.00951,0.009473,0.009452,0.009458,0.009472,0.009476,0.009452
AGD901-4MN0003_Mi,0.067439,0.065602,0.06487,0.064948,0.065441,0.065957,0.066147,0.06591,0.065275,0.064506,...,0.009734,0.009773,0.009788,0.009777,0.009754,0.009744,0.009757,0.009784,0.009808,0.00981
AGD901-4MN0004_Sc,0.047662,0.04684,0.045623,0.044372,0.043575,0.043318,0.043508,0.043811,0.043836,0.043458,...,0.014057,0.014034,0.013991,0.013948,0.013926,0.013942,0.013991,0.01404,0.014064,0.014046
AGD901-4MN0004_Sc,0.043126,0.042197,0.041736,0.041672,0.041898,0.042213,0.0424,0.042342,0.041981,0.041378,...,0.014024,0.013994,0.013957,0.013912,0.01387,0.013853,0.013868,0.01391,0.013965,0.014015
AGD901-4MN0005_Sc,0.021856,0.023238,0.024949,0.026894,0.028964,0.030894,0.032384,0.03327,0.033499,0.033393,...,0.011783,0.011809,0.011824,0.011819,0.011804,0.011806,0.011831,0.011863,0.01188,0.011862
AGD901-4MN0005_Sc,0.026756,0.025656,0.024823,0.024695,0.025666,0.027459,0.029572,0.031588,0.033179,0.034386,...,0.011887,0.01185,0.011827,0.011813,0.011807,0.011814,0.011835,0.011861,0.011881,0.011876
AGD901-4MN0007_Mi,0.069525,0.069314,0.069095,0.068997,0.069088,0.06913,0.06882,0.068141,0.067204,0.066244,...,0.011287,0.011295,0.011279,0.011261,0.011261,0.011294,0.01135,0.011396,0.011413,0.011397
AGD901-4MN0007_Mi,0.065621,0.066883,0.068469,0.069894,0.070734,0.070952,0.070648,0.069956,0.069024,0.068065,...,0.011461,0.011468,0.011465,0.011449,0.011427,0.011419,0.011431,0.011456,0.011484,0.011501
AGD901-4MN0008_Mi,0.080406,0.079872,0.079467,0.079305,0.079469,0.079844,0.080252,0.080557,0.080655,0.080531,...,0.010104,0.010146,0.010148,0.010122,0.010089,0.010078,0.010095,0.010124,0.01015,0.010164
AGD901-4MN0008_Mi,0.07572,0.076416,0.077165,0.077645,0.077696,0.077549,0.077535,0.077757,0.078213,0.078786,...,0.009983,0.010007,0.010047,0.010095,0.010147,0.010197,0.010245,0.010291,0.010327,0.01034


In [28]:
# df_spectra_1.index = df_spectra_1.index.astype("str")

In [29]:
df_spectra_1.index = [df_spectra_1.index[x].split('_')[0] for x in range(df_spectra_1.shape[0])]

In [30]:
# df_avg_1 = mean_dupes_drop_singles_filter_by_std(df_spectra_1)
df = mean_dupes_drop_singles_filter_by_std(df_spectra_1)


                      std
FA807-578MN0002  0.000426
FA807-592MN0004  0.000358
FA807-596MN0001  0.000338
AGD901-4MN0004   0.000253
AGD901-4MN0003   0.000235


In [31]:
# df.to_csv("C:/Users/Tsuma Thomas/Documents/CropNutsDocuments/DS-ML69 product1 spectra/20221206-103534.csv")

In [54]:
# save to path of notebook
df.to_csv('spc.csv')

In [15]:
# path to file
filename = 'spc.csv'

In [16]:
# chemicals = ['psi', 'aluminium', 
#             'phosphorus', 'ph', 'exchangeable_acidity', 'calcium', 'magnesium',
#               'sulphur', 'sodium', 'iron', 'manganese', 'boron', 'copper', 'zinc', 'total_nitrogen', 'potassium',
#              'ec_salts', 'organic_carbon', 'cec', 'sand', 'silt', 'clay']

chemicals = ['clay','sand','silt']


In [17]:
from math import log10, floor
def round_sig(x, sig=2):
    return round(x, sig-int(floor(log10(abs(x))))-1)

In [18]:
# read data from file for predictions
data = pd.read_csv(filename, index_col=0, engine='c')

In [19]:
# make sure we have 1728 wave numbers
data = data.T.head(1728).T

In [20]:
# path to models and preprocessing steps
base_path = Path('./dl_models_all_chems_20210414/saved_models')

In [21]:

for chemical in chemicals:
    print(chemical)
    preds_comb = pd.DataFrame()
    models_folder = base_path / chemical / 'std'
    all_models = [x for x in models_folder.glob('**/*.hdf5')]

    
    new_indices = data.index



    for model_path in all_models:

        json_path = model_path.parent.parent / 'model.json'

        with open(json_path) as f:
            json_ = json.load(f)

        inputs = []

        for i in range(len(json_['Inputs'])):
            input_name = json_['Inputs'][i]['Name']
            train = data.copy(deep=True)
            print(len(train.shape))

            for j in range(len(json_['Inputs'][i]['Pre-processing'])):
                key_ = json_['Inputs'][i]['Pre-processing'][j]['Name']
                if input_name == 'nir2':
                    input_name = 'nir.2'
                pickle_path = model_path.parent / 'preprocess' / f'input.{input_name}.{j}.{key_}.pickle'
                pickle_ = joblib.load(pickle_path)
                train = pickle_.fit_transform(train)

            inputs.append(train.values)

        tf.keras.backend.clear_session()
        model = tf.keras.models.load_model(model_path)
        preds = pd.DataFrame(model(inputs).numpy())
        preds_comb = pd.concat([preds_comb, preds], axis=1)

    preds_comb = preds_comb.median(axis=1)
    preds_comb.index = new_indices
    
    
    # save output
    preds_comb.to_csv(f'preds/{chemical}.csv')

clay
2
2
2
2
2
2
2
2
2
2
2
2
2
2
sand
2
2
2
2
2
2
2
2
2
2
2
2
2
2
silt
2
2
2
2
2
2
2
2
2
2
2
2
2
2


In [22]:
print(json_['Inputs'][i])

{'Name': 'nir2', 'Regex': '^amp.*$', 'Pre-processing': [{'Name': 'Downsample', 'Params': {'step': 3}}, {'Name': 'SavitzkyGolay', 'Params': {'polynomial_order': 2, 'derivative_order': 1, 'num_points': 15}}]}
