In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn import preprocessing

import seaborn as sns
import matplotlib.pyplot as plt


#local
from ML_models.DecisionTree import DecisionTreeModel
from ML_models.RandomForest import RandomForestModel
from ML_models.k_NN import KnnModel
from ML_models.NN import NNModel

In [None]:
# `os` is also needed by the helper functions defined in the following cells.
import os
# Show the working directory: every data path used below ('data/<crop>/...')
# is relative, so it is resolved against this directory.
cwd = os.getcwd()
print(cwd)

In [None]:
def parse_datasets(crops_list):
    """Build the result-table row names from the dataset definition files.

    For every crop, the regular files found directly inside
    ``data/<crop>/datasets_vars/`` (relative to the current working
    directory) each contribute one name of the form
    ``<crop>_<file name without extension>``. Sub-directories are ignored.

    Args:
        crops_list: iterable of crop names; each must have a matching
            ``data/<crop>/datasets_vars/`` directory.

    Returns:
        list[str]: dataset names, grouped in the order of ``crops_list`` and
        sorted by file name within each crop.

    Raises:
        FileNotFoundError: propagated from ``os.listdir`` when a crop's
            directory does not exist.
    """
    datasets = []

    for crop in crops_list:
        directory = f'data/{crop}/datasets_vars/'

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the result table reproducible between runs/machines.
        for filename in sorted(os.listdir(directory)):
            # keep regular files only (skip sub-directories)
            if os.path.isfile(os.path.join(directory, filename)):
                dataset_name = os.path.splitext(filename)[0]
                datasets.append(f'{crop}_{dataset_name}')

    return datasets

In [None]:
def create_result_df(col_names, row_names):
    """Return an empty results table.

    Args:
        col_names: labels used as the DataFrame columns.
        row_names: labels used as the DataFrame index.

    Returns:
        pandas.DataFrame: a frame with the given index and columns and no
        data, so every cell is missing (NaN).
    """
    return pd.DataFrame(index=row_names, columns=col_names)

In [None]:
def import_df(obs_data_path, meteo_data_path):
    """Load the observation and weather tables from CSV.

    Both files are read with their first column used as the index.

    Args:
        obs_data_path: path of the observations CSV file.
        meteo_data_path: path of the meteo CSV file.

    Returns:
        tuple: ``(obs_df, meteo_df)`` as pandas DataFrames.
    """
    obs_df, meteo_df = (
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (obs_data_path, meteo_data_path)
    )
    return obs_df, meteo_df

In [None]:
def get_vars(directory_path, filename):
    """Read the variable (column) names listed in a dataset definition file.

    The file is read line by line; each kept line, stripped of surrounding
    whitespace, is one variable name. Blank lines are skipped, and so are
    lines whose first non-blank characters are ``->`` (the original comment
    describes these as "df code").

    Args:
        directory_path: directory containing the definition file.
        filename: name of the file inside ``directory_path``.

    Returns:
        list[str]: variable names in file order. The list is empty when the
        path is not an existing regular file.
    """
    file_path = os.path.join(directory_path, filename)
    var_names = []

    # Not a regular file (missing path or a directory): nothing to read.
    if not os.path.isfile(file_path):
        return var_names

    with open(file_path, 'r') as f:
        for line in f:
            name = line.strip()
            # Test the stripped text, not the raw line, so that an indented
            # '->' marker is filtered out instead of being returned as a
            # variable name.
            if name and not name.startswith('->'):
                var_names.append(name)

    return var_names

In [None]:
def extract_sub_df(combined_df, vars):
    """Select the columns named in ``vars`` from ``combined_df``.

    Args:
        combined_df: source pandas DataFrame.
        vars: column labels to keep, in the desired order.

    Returns:
        pandas.DataFrame: all rows of ``combined_df`` restricted to ``vars``.
    """
    selected_columns = combined_df.loc[:, vars]
    return selected_columns

In [None]:
def normalize(sub_df):
    """Rescale every column of ``sub_df`` with scikit-learn's MinMaxScaler.

    The scaler is created with its default settings and fitted on the whole
    of ``sub_df`` before transforming it.

    Args:
        sub_df: pandas DataFrame of feature columns.

    Returns:
        pandas.DataFrame: the scaled values, carrying the same index and
        column labels as ``sub_df``.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(sub_df)
    normalized_df = pd.DataFrame(
        scaled_values,
        index=sub_df.index,
        columns=sub_df.columns,
    )
    return normalized_df

In [None]:
# Per-crop settings: the disease-score column used as the label, and the other
# crop-specific columns that are removed from the feature table.
_CROP_SETTINGS = {
    'onion': {
        'label': 'cote_b_squamosa',
        'extra_drop': ['cote_p_destructor', 'cote_s_vesicarium', 'Bulb_onions_date'],
    },
    'laitue': {
        'label': 'cote_b_lactucae',
        'extra_drop': ['incidence_sclerotinia', 'incidence_b_cinerea', 'Pommaison_lettuce_date'],
    },
    'carrot': {
        'label': 'cote_c_carotae',
        'extra_drop': ['incidence_s_sclerotiorum', 'incidence_a_dauci'],
    },
}


def preprocess_data(crop, obs_df, meteo_df):
    """Merge observations with weather data and split features from the label.

    Steps:
      1. rename the observation column ``SampleDate`` to ``Date``;
      2. for ``'carrot'`` only, discard observations whose ``FarmID`` is 0;
      3. binarise the crop's label column (every value ``>= 1`` becomes 1,
         other values are left as they are);
      4. inner-merge observations and weather on ``['FarmID', 'Date']``;
      5. take the label column and drop it, the identifier columns and the
         crop's other score/date columns from the merged table.

    The input frames are not modified; the function works on copies.

    Args:
        crop: one of ``'onion'``, ``'laitue'`` or ``'carrot'``.
        obs_df: observations DataFrame; must contain ``SampleDate`` (or
            ``Date``), ``FarmID``, ``Plant_ID`` and the crop's score columns.
        meteo_df: weather DataFrame; must contain ``FarmID`` and ``Date``.

    Returns:
        tuple: ``(combined_df, label_df)`` where ``combined_df`` holds the
        remaining feature columns and ``label_df`` is the binarised label
        Series, both aligned on the merged rows.

    Raises:
        ValueError: if ``crop`` is not a supported crop name.
        KeyError: if an expected column is missing.
    """
    # Validate first so an unsupported crop fails before any work is done.
    if crop not in _CROP_SETTINGS:
        raise ValueError(f'Crop <{crop}> should be "onion", "laitue", or "carrot".')

    settings = _CROP_SETTINGS[crop]
    label_col = settings['label']

    # rename() returns a new frame, so the caller's obs_df is left untouched.
    obs_df = obs_df.rename(columns={'SampleDate': 'Date'})

    if crop == 'carrot':
        # Filter with a boolean mask rather than dropping by index label:
        # dropping labels would also remove valid rows that happen to share an
        # index label with a FarmID == 0 row.
        obs_df = obs_df.loc[obs_df['FarmID'] != 0].copy()

    # Binarise the label: any score >= 1 counts as a positive observation.
    obs_df.loc[obs_df[label_col] >= 1, label_col] = 1

    combined_df = obs_df.merge(meteo_df, on=['FarmID', 'Date'])
    label_df = combined_df[label_col]
    combined_df = combined_df.drop(
        ['FarmID', 'Plant_ID', 'Date', label_col, *settings['extra_drop']],
        axis=1,
    )

    return combined_df, label_df

In [None]:
def main():
    """Train and score every configured model on every configured dataset.

    For each crop in ``crops_list`` the observation and weather CSV files are
    loaded and preprocessed once. Then, for each dataset definition file in
    ``data/<crop>/datasets_vars/`` whose name (without extension) is listed in
    ``datasets_to_run``, the listed variables are extracted and normalised and
    each model in ``models_to_run`` is evaluated. A model is only run when its
    cell in the result table is still empty, which lets a previous result file
    (``restart``) be resumed. Every model function is called as
    ``model(features, labels)`` and is expected to return six values: mean and
    standard deviation of accuracy, ROC AUC and F1.

    The result table is written to ``test_result_verification.csv`` after each
    processed dataset.
    """
    # Use ['onion', 'laitue', 'carrot'] to process every crop.
    crops_list = ['onion']
    models_to_run = ['Decision tree', 'Random Forest', 'K-nn', 'Neural Network']
    datasets_to_run = ['base', 'Botcast']
    restart = None

    # Result column name -> function that evaluates the corresponding model.
    model_runners = {
        'Decision tree': DecisionTreeModel,
        'Random Forest': RandomForestModel,
        'K-nn': KnnModel,
        'Neural Network': NNModel,
    }

    datasets = parse_datasets(crops_list)

    # find previous or create new result dataframe
    if restart is not None:
        # Columns that are entirely empty in the CSV are read back as float;
        # cast to object so that result strings can be stored in them.
        result_df = pd.read_csv(restart, header=0, index_col=0).astype(object)
    else:
        result_df = create_result_df(models_to_run, datasets)

    for crop in crops_list:
        obs_data_path = f'data/{crop}/crop_no_sensitive_data.csv'
        meteo_data_path = f'data/{crop}/combined_daily_meteo.csv'
        directory_path = f'data/{crop}/datasets_vars/'
        obs_df, meteo_df = import_df(obs_data_path, meteo_data_path)

        combined_df, label_df = preprocess_data(crop, obs_df, meteo_df)

        for filename in os.listdir(directory_path):
            dataset_name = os.path.splitext(os.path.basename(filename))[0]

            # Skip unselected datasets before doing any work on them.
            if dataset_name not in datasets_to_run:
                continue

            row_name = f'{crop}_{dataset_name}'

            var_names = get_vars(directory_path, filename)
            sub_df = extract_sub_df(combined_df, var_names)
            sub_df_norm = normalize(sub_df)

            for model_name in models_to_run:
                # A filled cell means this model already ran for this dataset.
                if not pd.isna(result_df.loc[row_name, model_name]):
                    continue

                mean_acc, std_acc, mean_roc_auc, std_roc_auc, mean_f1, std_f1 = model_runners[model_name](sub_df_norm, label_df)
                result_df.loc[row_name, model_name] = f'acc: {mean_acc:.2f} (\u00B1 {std_acc:.2f}) roc_auc: {mean_roc_auc:.2f} (\u00B1 {std_roc_auc:.2f}) f1: {mean_f1:.2f} (\u00B1 {std_f1:.2f})'

            # Save after every dataset so partial progress survives a crash.
            result_df.to_csv('test_result_verification.csv')
                

In [None]:
# Run the whole pipeline defined in main().
main()