In [1]:
import pandas as pd 
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.utils import resample
import imblearn
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm
import joblib
import glob
from sklearn import svm
import time 

def extract_month(month_str):
    return int(month_str.split("-")[1])

def scaling(arr, type="standard", perc_inf = 25, perc_sup = 75):
    if type == "standard":
        scaler = preprocessing.StandardScaler()
    elif type == "robust":
        scaler = preprocessing.RobustScaler()
        f.write("Scaling type not recognized")
        scaler= preprocessing.StandardScaler()
    scaled_features = scaler.fit_transform(np.array(arr).reshape(-1,1))
    return scaled_features


#ALL POSSIBILE OPTION IMPLEMENTED

# nan_managing_options = ["erase","fill_mean"]
# normlaization_options = ["standard","robust"]
# training_balance_options = ["minority","smotenc"]
# type_of_arch_options = ["ml","dl"]

# OPTIONS USED DURINF TESTS FOR REDUCE TIME

# nan_managing_options = ["erase","fill_mean"]
# normlaization_options = ["robust"]
# training_balance_options = ["smotenc"]
# type_of_arch_options = ["ml","dl"]

nan_managing_options = ["erase","fill_mean"]
normlaization_options = ["robust"]
training_balance_options = ["smotenc"]



DIFFERENT DATASET CREATION

In [None]:
#OPTION SELECTION
for i in tqdm(range(len(nan_managing_options))):
    nan_managing=nan_managing_options[i]
    for j in range(len(normlaization_options)):
        normalization=normlaization_options[j]
        for k in range(len(training_balance_options)):
            training_balance=training_balance_options[k]
            
            #create a dataframe with the options selected
            options = pd.DataFrame({"nan_managing":nan_managing, "normalization":normalization, "training_balance":training_balance}, index=[0])

            TEST_PATH = os.sep + os.path.join("workspace", "dataset_varius_test",f"{nan_managing}_{normalization}_{training_balance}") 
            if not os.path.exists(TEST_PATH):
                os.makedirs(TEST_PATH)

            IMAGE_PATH = os.sep + os.path.join(TEST_PATH, "Images")
            if not os.path.exists(IMAGE_PATH):
                os.makedirs(IMAGE_PATH)

            DATASET_PATH_TESTS = os.sep + os.path.join(TEST_PATH, "dataset")
            if not os.path.exists(DATASET_PATH_TESTS):
                os.makedirs(DATASET_PATH_TESTS)

            options.to_csv(os.path.join(TEST_PATH, "options.csv"), index=False)

            report_file_name = os.path.join(TEST_PATH,f"report_dataset.txt")
            f = open(report_file_name, "a")

            f.write("###########################################################\n")
            f.write("############# STARTING NEW EXPERIMENT #####################\n")

            f.write("\n ---------- Options ---------- \n")

            f.write(f"Nan managing option: {nan_managing}\n")
            f.write(f"Normalization option:  {normalization}\n")
            f.write(f"Training balance option: {training_balance}\n")
            

            f.write("\n ---------- READ DATA ---------- \n")

            all_dataset_path = os.sep + os.path.join("workspace", "Dataset", "weather.csv")
            all_dataset_df = pd.read_csv(all_dataset_path)
            f.write("Dataset has been read\n")


            f.write("\n ---------- MANAGE NAN ---------- \n")
            f.write("Some info about the dataset:\n")
            f.write(f"| Len of all_dataset_df: {len(all_dataset_df)}\n")
            f.write(f"| Number of features: {len(all_dataset_df.columns)}\n")
            not_nan_dataset_df = all_dataset_df.dropna(axis=0, how='any')
            f.write(f"| Len of not_nan_dataset_df:{ len(not_nan_dataset_df)}\n")
            f.write(f"| Only the {np.round(len(not_nan_dataset_df)/len(all_dataset_df),4)*100}% of the dataset is free of NaN values")
            f.write("-----------------------------------------------------\n")

            nan_count_df = np.round(all_dataset_df.isnull().sum() * 100 / len(all_dataset_df),4).to_frame()
            nan_count_df.columns = ['Percentage of NaN']

            if nan_managing == "erase":
                f.write("-->  Managing NaN values with erase\n")
                #list of columns with NaN values > 30%
                column_to_erase = nan_count_df[nan_count_df['Percentage of NaN'] > 30].index.to_list()
                f.write(f"The following columns have NaN values > 30%: {column_to_erase} and will be erased\n")
                wo_somecolumun_dataset_df = all_dataset_df.drop(column_to_erase, axis=1)
                not_nan_wo_somecolumn_dataset_df = wo_somecolumun_dataset_df.dropna(axis=0, how='any')
                f.write("Any row of the remain dataset that contains at least one NaN values will be erased\n")
                f.write("Final dataset lenght is {} and {}% of the initial dataset in terms of rows\n".format(len(not_nan_wo_somecolumn_dataset_df), np.round(len(not_nan_wo_somecolumn_dataset_df)/len(wo_somecolumun_dataset_df)*100,2)))
                f.write(f"Number of features of final dataset : {len(not_nan_wo_somecolumn_dataset_df.columns)}\n")
                dataset_to_use = not_nan_wo_somecolumn_dataset_df

                categorical_column = []
                non_categorical_column=[]

                for i in dataset_to_use.columns:
                    
                    if dataset_to_use[i].dtype == 'object':
                        categorical_column.append(i)
                    else:
                        non_categorical_column.append(i)

                f.write(f"The final dataset contains the following categorical columns: {categorical_column}\n")
                f.write(f"The final dataset contains the following non-categorical columns: {non_categorical_column}\n")
                        
            elif nan_managing == "fill_mean":
                f.write("--> Managing NaN values with fill_mean\n")
                f.write("Categorical colum will be fill with the mode of the column, while, numerical columns will be fill with the mean\n")


                categorical_column = []
                non_categorical_column=[]

                for i in all_dataset_df.columns:
                    
                    if all_dataset_df[i].dtype == 'object':
                        categorical_column.append(i)
                    else:
                        non_categorical_column.append(i)

                dataset_to_use = all_dataset_df

                for i in categorical_column:
                    dataset_to_use[i] = dataset_to_use[i].fillna(dataset_to_use[i].mode()[0])
                for i in non_categorical_column:
                    dataset_to_use[i] = dataset_to_use[i].fillna(dataset_to_use[i].mean())

                f.write("The final dataset lenght is {} and {}% of the initial dataset in terms of rows\n".format(len(dataset_to_use), np.round(len(dataset_to_use)/len(all_dataset_df)*100,2)))
                f.write(f"Number of features of final dataset : {len(dataset_to_use.columns)}\n")

                f.write(f"The final dataset contains the following categorical columns: {categorical_column}\n")
                f.write(f"The final dataset contains the following non-categorical columns: {non_categorical_column}\n")

            f.write("\n---------- ENCODING DATA ---------- \n")

            categorical_column = []
            non_categorical_column=[]

            for i in dataset_to_use.columns:
                
                if dataset_to_use[i].dtype == 'object':
                    categorical_column.append(i)
                else:
                    non_categorical_column.append(i)

            dataset_to_use = dataset_to_use.replace(['No', 'Yes'], [0, 1])
            f.write("The Yes and No values have been replaced by 1 and 0\n")
            f.write("The Date columns have been dropped and an additional column with month has been added\n")
            f.write("Other categorical columns have been encoded with LabelEncoder: the mapping is saved in a csv file\n")

            le = preprocessing.LabelEncoder()
            for i in categorical_column:
                if i != 'Date':
                    dataset_to_use[i] = le.fit_transform(dataset_to_use[i])
                    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
                    mapping_df = pd.DataFrame(mapping.items(), columns=['Original', 'Encoded'])
                    mapping_df.to_csv(os.path.join(DATASET_PATH_TESTS, "mapping_" + i + "_OPTIONS_"+ str(nan_managing) + ".csv"))
                    f.write(f"{i} has been encoded, mapping saved in {DATASET_PATH_TESTS}/mapping_{i}_OPTIONS_{nan_managing}.csv\n")
                else:
                    dataset_to_use["month"] = dataset_to_use["Date"].apply(extract_month)
                    dataset_to_use.drop(["Date"], axis=1, inplace=True)

            dataset_to_use.drop(["Unnamed: 0"], axis=1, inplace=True)
            f.write("The Unnamed: 0 column has been dropped\n")

            f.write("\n ---------- CHECK CLASS DISTIRBUTION ---------- \n")

            tw_rain = np.sum(np.array(dataset_to_use["RainTomorrow"]))
            tw_not_rain = len(dataset_to_use) - tw_rain
            tw_rain_perc = np.round(tw_rain/len(dataset_to_use)*100,2)
            tw_not_rain_perc = np.round(tw_not_rain/len(dataset_to_use)*100,2)

            f.write(f"RainTomorrow: {tw_rain}, that is the {tw_rain_perc}% of the dataset\n")
            f.write(f"NotRainTomorrow: {tw_not_rain}, that is the {tw_not_rain_perc}% of the dataset\n")

            fig, ax = plt.subplots(figsize=(15,10))
            plt.rcParams.update({'font.size': 15})
            ax.bar(['1', '0'], [tw_rain, tw_not_rain],color=[(0.95,0.11,0.41), (0.12,0.07,0.3)])
            #add second y-axis
            ax2 = ax.twinx()
            ax2.bar(['1', '0'], [tw_rain_perc, tw_not_rain_perc], color=[(0.95,0.11,0.41), (0.12,0.07,0.3)])
            ax2.set_ylabel('Percentage of all dataset')
            ax.set_title("RainTomorrow Columns")
            ax.set_ylabel("Number of Samples")
            plt.grid(True)
            # add capiton under th plot 
            plt.savefig(os.path.join(IMAGE_PATH,f"BarPlot_Distirbution_RainTomorrow_OPTIONS_{nan_managing}.png"), dpi=300)
            plt.close()
            f.write(f"Bar plot of RainTomorrow has been saved in {IMAGE_PATH}/BarPlot_Distirbution_RainTomorrow_OPTIONS_{nan_managing}.png\n")

            f.write("\n ---------- NORMALZIE DATA ---------- \n")

            featuers = dataset_to_use.drop(['RainTomorrow'], axis=1)
            target = dataset_to_use['RainTomorrow']

            plt.figure(figsize=(20,10))
            sns.boxplot(data = featuers)
            plt.xticks(rotation=90)
            plt.savefig(os.path.join(IMAGE_PATH,f"BoxPlot_features_distribution_OPTIONS_{nan_managing}.png"), dpi=300)
            plt.close()
            f.write(f"Box plot of features distribution has been saved in {IMAGE_PATH}/BoxPlot_features_distribution_OPTIONS_{nan_managing}.png\n")


            if normalization == "standard":
                f.write("--> Normalizing data with StandardScaler\n")
                scaled_features = featuers
                for i in featuers.columns:
                    scaled_features[i] = scaling(scaled_features[i])

                plt.figure(figsize=(20,10))
                sns.boxplot(data = scaled_features)
                plt.xticks(rotation=90)
                plt.savefig(os.path.join(IMAGE_PATH,f"BoxPlot_features_distribution_StandardScaler_OPTIONS_{nan_managing}_{normalization}.png"), dpi=300)
                plt.close()
                f.write(f"Box plot of features distribution has been saved in {IMAGE_PATH}/BoxPlot_features_distribution_StandardScaler_OPTIONS_{nan_managing}_{normalization}.png\n")

                featuers = scaled_features

            elif normalization == "robust":
                f.write("--> Normalizing data with Scaler and quantile remove\n")
                inf_quantile = 0.05
                sup_quantile = 0.95

                f.write(f"Inf qunatile: {inf_quantile}\n")
                f.write(f"Sup qunatile: {sup_quantile}\n")

                scaled_features = featuers
                for i in featuers.columns:
                    scaled_features[i] = scaling(scaled_features[i])

                scaled_features_quantile = scaled_features
                for i in scaled_features.columns:
                    if i not in categorical_column:
                        quantilie_inf = np.quantile(scaled_features[i], inf_quantile)
                        quantilie_sup = np.quantile(scaled_features[i], sup_quantile)
                        scaled_features_quantile[i] = scaled_features[i].apply(lambda x: np.clip(x, quantilie_inf, quantilie_sup))

                plt.figure(figsize=(20,10))
                sns.boxplot(data = scaled_features_quantile)
                plt.xticks(rotation=90)
                plt.savefig(os.path.join(IMAGE_PATH,f"BoxPlot_features_distribution_RobustScaler_OPTIONS_{nan_managing}_{normalization}.png"), dpi=300)
                plt.close()

                featuers = scaled_features_quantile

            f.write("\n --------SPLITTING------------ \n")

            f.write("--> Splitting data in order to create a balanced TEST dataset\n")
            test_size = 0.3 # 30% of the smallest class in the dataset
            f.write(f"Test size is the {test_size*100}% of the smallest class in the dataset multply bt the number of classes\n")
            yes_num = np.sum(target)
            no_num = len(target) - yes_num
            test_size_samples_for_each_class = int(np.round(test_size*yes_num))
            f.write(f"Number of samples for each class: {test_size_samples_for_each_class}\n")

            #join the target and the features
            features_target = pd.concat([featuers, target], axis=1)
            #shuffle the data
            features_target = features_target.sample(frac=1).reset_index(drop=True)

            test_yes = features_target[features_target["RainTomorrow"] == 1].iloc[0:test_size_samples_for_each_class,:]
            train_yes = features_target[features_target["RainTomorrow"] == 1].iloc[test_size_samples_for_each_class:,:]
            test_no = features_target[features_target["RainTomorrow"] == 0].iloc[0:test_size_samples_for_each_class,:]
            train_no = features_target[features_target["RainTomorrow"] == 0].iloc[test_size_samples_for_each_class:,:]

            train = pd.concat([train_yes, train_no], axis=0)
            test = pd.concat([test_yes, test_no], axis=0)

            f.write("\nFrom the whole dataset we will extract {test_size_samples_for_each_class} samples for each class in order to have a balanced TEST dataset")
            f.write("\nThe train dataset will contain {len(train)} samples and the dataset isn't balanced")

            f.write("\nNumber of samples in test: {}".format(len(test)))
            f.write("\nTest set is {}% of the dataset".format(np.round(len(test)/len(features_target)*100,2)))

            fig, ax = plt.subplots(1,2, figsize=(15,10))
            plt.rcParams.update({'font.size': 15})
            #set y axes to the same scale for ax[0] and ax[1]
            ax[0].bar(['Yes', 'No'], [len(train_yes), len(train_no)], color=[(0.95,0.11,0.41), (0.12,0.07,0.3)])
            ax[0].set_title("Train set")
            ax[0].set_xlabel("RainTomorrow")
            ax[0].set_ylabel("Number of Samples")
            ax[0].grid(True)

            ax[1].bar(['Yes', 'No'], [len(test_yes), len(test_no)],color=[(0.95,0.11,0.41), (0.12,0.07,0.3)])
            ax[1].set_title("Test set")
            ax[1].set_xlabel("RainTomorrow")
            ax[1].set_ylabel("Number of Samples")
            ax[1].set_ylim(ax[0].get_ylim())
            ax[1].grid(True)
            plt.suptitle("Train and Test set before oversampling")
            plt.savefig(os.path.join(IMAGE_PATH,f"Train_Test_dataset_barplot_distirbution_OPTIONS_{nan_managing}_{normalization}.png"), dpi=300)
            plt.close()

            f.write("\n ---------- BALANCE TRAINING DATA ---------- \n")

            train_featuers = train.drop(['RainTomorrow'], axis=1)
            train_target = train['RainTomorrow']

            categorical = categorical_column
            categorical.remove('Date')
            categorical.remove('RainTomorrow')
            categorical_mask = []

            for i in train_featuers.columns:
                if i in categorical:
                    categorical_mask.append(True)
                else:
                    categorical_mask.append(False)

            if training_balance == "minority":
                f.write("\n--> Balancing training data with the minority class")
                oversample = imblearn.over_sampling.RandomOverSampler(sampling_strategy='minority')
            elif training_balance == "smotenc":
                f.write("\n--> Balancing training data with SMOTE")
                oversample = imblearn.over_sampling.SMOTENC(categorical_features=categorical_mask,sampling_strategy='minority',random_state=42)

            train_featuers_resampled, train_target_resampled = oversample.fit_resample(train_featuers, train_target)

            #create dataframe from resampled data
            train_resampled = train_featuers_resampled
            train_resampled['RainTomorrow'] = train_target_resampled

            tw_rain = np.sum(np.array(train_resampled["RainTomorrow"]))
            tw_not_rain = len(train_resampled) - tw_rain
            tw_rain_perc = np.round(tw_rain/len(train_resampled)*100,2)
            tw_not_rain_perc = np.round(tw_not_rain/len(train_resampled)*100,2)

            f.write(f"\nRainTomorrow in training: {tw_rain}, that is the {tw_rain_perc}% of the dataset oversampled")
            f.write(f"\nNotRainTomorrow in training: {tw_not_rain}, that is the {tw_not_rain_perc}% of the dataset oversampled")


            fig, ax = plt.subplots(figsize=(15,10))
            plt.rcParams.update({'font.size': 15})
            ax.bar(['1', '0'], [tw_rain, tw_not_rain], color=[(0.95,0.11,0.41), (0.12,0.07,0.3)])
            #add second y-axis
            ax2 = ax.twinx()
            ax2.bar(['1', '0'], [tw_rain_perc, tw_not_rain_perc], color=[(0.95,0.11,0.41), (0.12,0.07,0.3)])
            ax2.set_ylabel('Percentage of all train dataset')
            ax.set_title("RainTomorrow Columns Oversampled DF")
            ax.set_ylabel("Number of Samples")
            plt.grid(True)
            # add capiton under th plot 
            plt.savefig(os.path.join(IMAGE_PATH,F"BarPlot_Distirbution_RainTomorrow_train_oversampled_OPTIONS_{nan_managing}_{normalization}_{training_balance}.png"), dpi=300)
            plt.close()

            f.write(f"\nDataset dimension increased by: {len(train_resampled)-len(train)}")

            x_train  = train_resampled.drop(['RainTomorrow'], axis=1)
            y_train = train_resampled['RainTomorrow']
            x_test = test.drop(['RainTomorrow'], axis=1)
            y_test = test['RainTomorrow']

            #save the train and test dataset
            x_train.to_csv(os.path.join(DATASET_PATH_TESTS,"x_train.csv"), index=False)
            y_train.to_csv(os.path.join(DATASET_PATH_TESTS,"y_train.csv"), index=False)
            x_test.to_csv(os.path.join(DATASET_PATH_TESTS,"x_test.csv"), index=False)
            y_test.to_csv(os.path.join(DATASET_PATH_TESTS,"y_test.csv"), index=False)
            f.write("\nDataset saved at {}".format(DATASET_PATH_TESTS))

            fig, ax = plt.subplots(figsize=(25,10))
            plt.rcParams.update({'font.size': 15})
            sns.heatmap(featuers.corr(), annot=True, ax=ax, cmap='RdBu_r')
            ax.set_title("Correlation Matrix")
            ax.set_xlabel("Features")
            ax.set_ylabel("Features")
            plt.savefig(os.path.join(IMAGE_PATH,F"Correlation_Matrix_OPTIONS_{nan_managing}_{normalization}_{training_balance}.png"), dpi=300)
            plt.close()


            f.close()


# EXPERIMENT ON DIFFERENT DATASET

## Random forest

In [2]:
DATASETS_PATH = os.path.sep + os.path.join("workspace", "dataset_varius_test")
for dataset_folder in os.listdir(DATASETS_PATH):
    folder = os.path.join(DATASETS_PATH, dataset_folder)
    RESULTS_PATH = os.path.sep + os.path.join("workspace", "results", dataset_folder + "_RF")

    if not os.path.exists(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)

    x_train_path = glob.glob(os.path.join(folder, "dataset", "x_train.csv"))
    y_train_path = glob.glob(os.path.join(folder, "dataset", "y_train.csv"))
    x_test_path  = glob.glob(os.path.join(folder, "dataset", "x_test.csv"))
    y_test_path  = glob.glob(os.path.join(folder, "dataset", "y_test.csv"))

    x_train = pd.read_csv(x_train_path[0])
    y_train = pd.read_csv(y_train_path[0])
    x_test  = pd.read_csv(x_test_path[0])
    y_test  = pd.read_csv(y_test_path[0])

    ml = RandomForestClassifier(n_estimators = 100, random_state = 0)
    ml.fit(x_train, y_train)
    joblib.dump(ml, os.path.join(RESULTS_PATH, "model.joblib"))

    st = time.time()
    y_pred = ml.predict(x_test)
    ed = time.time()
    ex_time = ed - st
    sps = len(y_pred)/ex_time

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=["Not Rain", "Rain"])
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    df_res = pd.DataFrame({"Precision": [precision], "Recall": [recall], "F1": [f1], "Accuracy": [acc], "Execution Time": [ex_time], "Samples per second": [sps], "Classification Report": [report]})
    df_res.to_csv(os.path.join(RESULTS_PATH, "results.csv"), index=False)

## LOGISTIC REGRESSION

In [4]:
DATASETS_PATH = os.path.sep + os.path.join("workspace", "dataset_varius_test")
for dataset_folder in os.listdir(DATASETS_PATH):
    folder = os.path.join(DATASETS_PATH, dataset_folder)
    RESULTS_PATH = os.path.sep + os.path.join("workspace", "results", dataset_folder + "_LR")

    if not os.path.exists(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)

    x_train_path = glob.glob(os.path.join(folder, "dataset", "x_train.csv"))
    y_train_path = glob.glob(os.path.join(folder, "dataset", "y_train.csv"))
    x_test_path  = glob.glob(os.path.join(folder, "dataset", "x_test.csv"))
    y_test_path  = glob.glob(os.path.join(folder, "dataset", "y_test.csv"))

    x_train = pd.read_csv(x_train_path[0])
    y_train = pd.read_csv(y_train_path[0])
    x_test  = pd.read_csv(x_test_path[0])
    y_test  = pd.read_csv(y_test_path[0])

    ml = LogisticRegression()
    ml.fit(x_train, y_train)
    joblib.dump(ml, os.path.join(RESULTS_PATH, "model.joblib"))

    st = time.time()
    y_pred = ml.predict(x_test)
    ed = time.time()
    ex_time = ed - st
    sps = len(y_pred)/ex_time

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=["Not Rain", "Rain"])
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    df_res = pd.DataFrame({"Precision": [precision], "Recall": [recall], "F1": [f1], "Accuracy": [acc], "Execution Time": [ex_time], "Samples per second": [sps], "Classification Report": [report]})
    df_res.to_csv(os.path.join(RESULTS_PATH, "results.csv"), index=False)

## SVM

In [2]:
DATASETS_PATH = os.path.sep + os.path.join("workspace", "dataset_varius_test")
for dataset_folder in os.listdir(DATASETS_PATH):
    folder = os.path.join(DATASETS_PATH, dataset_folder)
    RESULTS_PATH = os.path.sep + os.path.join("workspace", "results", dataset_folder + "_svm1_1")

    if not os.path.exists(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)

    x_train_path = glob.glob(os.path.join(folder, "dataset", "x_train.csv"))
    y_train_path = glob.glob(os.path.join(folder, "dataset", "y_train.csv"))
    x_test_path  = glob.glob(os.path.join(folder, "dataset", "x_test.csv"))
    y_test_path  = glob.glob(os.path.join(folder, "dataset", "y_test.csv"))

    x_train = pd.read_csv(x_train_path[0])
    y_train = pd.read_csv(y_train_path[0])
    x_test  = pd.read_csv(x_test_path[0])
    y_test  = pd.read_csv(y_test_path[0])

    weights_per_class = [1,1]
    class_weight_d = {0: weights_per_class[0], 1: weights_per_class[1]}
    ml = svm.SVC(class_weight=class_weight_d)

    ml.fit(x_train, y_train)
    joblib.dump(ml, os.path.join(RESULTS_PATH, "model.joblib"))

    st = time.time()
    y_pred = ml.predict(x_test)
    ed = time.time()
    ex_time = ed - st
    sps = len(y_pred)/ex_time

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=["Not Rain", "Rain"])
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    df_res = pd.DataFrame({"Precision": [precision], "Recall": [recall], "F1": [f1], "Accuracy": [acc], "Execution Time": [ex_time], "Samples per second": [sps], "Classification Report": [report]})
    df_res.to_csv(os.path.join(RESULTS_PATH, "results.csv"), index=False)

In [3]:
DATASETS_PATH = os.path.sep + os.path.join("workspace", "dataset_varius_test")
for dataset_folder in os.listdir(DATASETS_PATH):
    folder = os.path.join(DATASETS_PATH, dataset_folder)
    RESULTS_PATH = os.path.sep + os.path.join("workspace", "results", dataset_folder + "_svm1_1.2")

    if not os.path.exists(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)

    x_train_path = glob.glob(os.path.join(folder, "dataset", "x_train.csv"))
    y_train_path = glob.glob(os.path.join(folder, "dataset", "y_train.csv"))
    x_test_path  = glob.glob(os.path.join(folder, "dataset", "x_test.csv"))
    y_test_path  = glob.glob(os.path.join(folder, "dataset", "y_test.csv"))

    x_train = pd.read_csv(x_train_path[0])
    y_train = pd.read_csv(y_train_path[0])
    x_test  = pd.read_csv(x_test_path[0])
    y_test  = pd.read_csv(y_test_path[0])

    weights_per_class = [1,1.2]
    class_weight_d = {0: weights_per_class[0], 1: weights_per_class[1]}
    ml = svm.SVC(class_weight=class_weight_d)

    ml.fit(x_train, y_train)
    joblib.dump(ml, os.path.join(RESULTS_PATH, "model.joblib"))

    st = time.time()
    y_pred = ml.predict(x_test)
    ed = time.time()
    ex_time = ed - st
    sps = len(y_pred)/ex_time

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=["Not Rain", "Rain"])
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    df_res = pd.DataFrame({"Precision": [precision], "Recall": [recall], "F1": [f1], "Accuracy": [acc], "Execution Time": [ex_time], "Samples per second": [sps], "Classification Report": [report]})
    df_res.to_csv(os.path.join(RESULTS_PATH, "results.csv"), index=False)

## ANN

In [4]:
from torch.utils.data import DataLoader
import torch
import torch
from torch.utils.data import Dataset
import logging
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import pandas as pd
import glob 
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim.lr_scheduler import ReduceLROnPlateau


from torch.utils.tensorboard import SummaryWriter

In [5]:
class BinaryClassification(nn.Module):
    def __init__(self,input_shape, n_classes=1, hidden_1 = 64, hidden_2 = 64, dropout = 0.5,batch_size=64):
            super(BinaryClassification, self).__init__()
            self.layer_1 = nn.Linear(input_shape, hidden_1) 
            self.layer_2 = nn.Linear(hidden_1, hidden_2)
            self.layer_out = nn.Linear(hidden_2, n_classes) 
            self.layer_out_put = nn.Sigmoid()
            
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=dropout)
            self.batchnorm1 = nn.BatchNorm1d(hidden_2)
            self.batchnorm2 = nn.BatchNorm1d(hidden_2)
            
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        #------------------------------------------------------
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        #------------------------------------------------------
        x = self.layer_out(x)
        out = self.layer_out_put(x)
        
        return out   

def binary_acc(y_pred, y_test):
    y_pred_tag = y_pred.squeeze()
    correct_results_sum = (y_pred_tag == y_test.squeeze()).sum().float()
    acc = correct_results_sum/len(y_test.squeeze())    
    return acc

In [6]:
class myDataset(Dataset):
    def __init__(self, x_train_path, y_train_path):
        self.x_train_path = x_train_path
        self.y_train_path = y_train_path
        self.x_train = pd.read_csv(self.x_train_path)
        self.y_train = pd.read_csv(self.y_train_path)

    def __len__(self):
        return len(self.x_train)

    def __getitem__(self, idx):
        return {'sample': self.x_train.iloc[idx].values, 
                'label': self.y_train.iloc[idx].values}

In [7]:
def eval_net(net, loader, device, criterion):
    n_val = len(loader)
    tot_loss = 0
    tot_acc = 0

    with tqdm(total=n_val, desc='Validation round', unit='batch',disable = True, leave=True) as pbar:
        for batch in loader:
            sample = batch['sample'].to(device).float()
            label = batch['label'].to(device).float()

            with torch.no_grad():
                y_pred = net(sample)
                y_pred_tag = torch.round(y_pred)
            
            loss = criterion(y_pred, label)
            acc = binary_acc(y_pred_tag, label.unsqueeze(1))

            tot_loss += loss.item()
            tot_acc += acc.item()

            pbar.set_postfix(**{'Val Loss/batch': loss, 'Val Acc/batch': acc})
            pbar.update()
    net.train()
    return tot_loss/len(loader), tot_acc/len(loader)


def train_net (net,device,x_train_path,y_train_path, dir_checkpoint, epochs=5,batch_size = 1, lr = 0.001 ):
    dataset = myDataset(x_train_path,y_train_path)
    #split dataset into train and test
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    writer = SummaryWriter(comment=f'LR_{lr}_BS_{batch_size}_EP_{epochs}')
    global_step = 0

    optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=1e-5)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = ReduceLROnPlateau(optimizer, 'min')

    net.to(device)


    for epoch in range(epochs):
        net.train()
        epoch_loss = 0
        epoch_acc = 0
        pseudo_batch_loss = 0
        pseudo_batch_acc = 0
        
        with tqdm(total=len(train_loader), desc=f'Epoch {epoch}/{epochs}') as pbar:
            for batch in train_loader:
                sample = batch['sample'].to(device).float()
                label = batch['label'].to(device).float()

                y_pred = net(sample)
                
                loss = criterion(y_pred, label)
                acc = binary_acc(torch.round(y_pred), label.unsqueeze(1))

                
                optimizer.zero_grad()
                
                loss.backward()
                
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()
                global_step += 1
                pseudo_batch_loss += loss.item()
                pseudo_batch_acc += acc.item()

                if (global_step) % 1 == 0:
                    writer.add_scalar('Loss/train each batch', pseudo_batch_loss, global_step)
                    writer.add_scalar('Acc/train each batch', pseudo_batch_acc, global_step)
                    pbar.set_postfix(**{'loss': pseudo_batch_loss, 'acc': pseudo_batch_acc})
                    pseudo_batch_acc = 0
                    pseudo_batch_loss = 0
                    
                pbar.update()

        loss_val, acc_val = eval_net(net, val_loader, device, criterion)
        scheduler.step(acc_val)

        writer.add_scalar('Loss/val', loss_val, epoch)
        writer.add_scalar('Acc/val', acc_val, epoch)
        writer.add_scalar('Acc/train epoches', epoch_acc/len(train_loader), epoch)
        writer.add_scalar('Loss/train epoches', epoch_loss/len(train_loader), epoch)

        if(epoch % 2 == 0):
            try:
                os.mkdir(dir_checkpoint)
                logging.info('Created checkpoint directory')
            except OSError:
                pass
            torch.save(net.state_dict(), os.path.join(dir_checkpoint,f"net_epoch_{epoch}.pth"))
    
        
            
    torch.save(net.state_dict(), os.path.join(dir_checkpoint,f"net_epoch_{epochs}.pth"))

def prediction(net,
                device,
                x_test_path,
                y_test_path):
    net.to(device)
    net.eval()
    test_dataset =myDataset(x_test_path,y_test_path)
    test_loader = DataLoader(test_dataset, batch_size=1, pin_memory=True)

    label_t = []
    pred_t = []

    with tqdm(total=len(test_loader), desc=f'Batch' ) as pbar:
        for batch in test_loader:
            sample = batch['sample'].to(device).float()
            label = batch['label'].to(device).float()

            with torch.no_grad():
                y_pred = net(sample)
                y_pred_tag = torch.round(y_pred)

        
            label_t.append(label.cpu().numpy().squeeze().tolist())
            pred_t.append(np.round(y_pred_tag.cpu().numpy()).squeeze().tolist())
            pbar.update()
            
    return label_t, pred_t

In [21]:
DATASETS_PATH = os.path.sep + os.path.join("workspace", "dataset_varius_test")
for dataset_folder in os.listdir(DATASETS_PATH):
    folder = os.path.join(DATASETS_PATH, dataset_folder)
    RESULTS_PATH = os.path.sep + os.path.join("workspace", "results", dataset_folder + "_ANN")

    if not os.path.exists(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)

    x_train_path = glob.glob(os.path.join(folder, "dataset", "x_train.csv"))
    y_train_path = glob.glob(os.path.join(folder, "dataset", "y_train.csv"))
    x_test_path  = glob.glob(os.path.join(folder, "dataset", "x_test.csv"))
    y_test_path  = glob.glob(os.path.join(folder, "dataset", "y_test.csv"))

    x_train = pd.read_csv(x_train_path[0])
    y_train = pd.read_csv(y_train_path[0])
    x_test  = pd.read_csv(x_test_path[0])
    y_test  = pd.read_csv(y_test_path[0])


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    df_xtrain = pd.read_csv(x_train_path[0])
    dir_checkpoint = os.path.join(RESULTS_PATH,"checkpoints")
    if os.path.exists(dir_checkpoint)==False:
        os.mkdir(dir_checkpoint)

    batch_size = 128
    lr = 0.001
    epochs = 100
    net = BinaryClassification(df_xtrain.shape[1], n_classes=1, hidden_1=64, hidden_2 = 64, dropout = 0.3, batch_size=batch_size)
    net.to(device)

    if os.path.exists(os.path.join(RESULTS_PATH,"checkpoints",f"net_epoch_{epochs}.pth")) == False:
        train_net(net,device,x_train_path[0],y_train_path[0], dir_checkpoint, epochs=epochs,batch_size = batch_size, lr = lr)

    net.load_state_dict(torch.load(os.path.join(RESULTS_PATH,"checkpoints",f"net_epoch_{epochs}.pth"), map_location=device))


    st = time.time()
    y_test, y_pred = prediction(net, device, x_test_path[0], y_test_path[0])
    ed = time.time()
    ex_time = ed - st
    sps = len(y_pred)/ex_time

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=["Not Rain", "Rain"])
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    df_res = pd.DataFrame({"Precision": [precision], "Recall": [recall], "F1": [f1], "Accuracy": [acc], "Execution Time": [ex_time], "Samples per second": [sps], "Classification Report": [report]})
    df_res.to_csv(os.path.join(RESULTS_PATH, "results.csv"), index=False)

Batch: 100%|██████████| 19126/19126 [00:14<00:00, 1363.22it/s]
Epoch 0/100: 100%|██████████| 1131/1131 [00:24<00:00, 46.87it/s, acc=0.85, loss=0.542] 
Epoch 1/100: 100%|██████████| 1131/1131 [00:24<00:00, 46.75it/s, acc=0.812, loss=0.617]
Epoch 2/100: 100%|██████████| 1131/1131 [00:23<00:00, 47.91it/s, acc=0.8, loss=0.536]  
Epoch 3/100: 100%|██████████| 1131/1131 [00:23<00:00, 48.11it/s, acc=0.825, loss=0.623]
Epoch 4/100: 100%|██████████| 1131/1131 [00:23<00:00, 47.81it/s, acc=0.762, loss=0.648]
Epoch 5/100: 100%|██████████| 1131/1131 [00:23<00:00, 47.80it/s, acc=0.825, loss=0.574]
Epoch 6/100: 100%|██████████| 1131/1131 [00:23<00:00, 48.14it/s, acc=0.663, loss=0.645]
Epoch 7/100: 100%|██████████| 1131/1131 [00:23<00:00, 47.69it/s, acc=0.8, loss=0.616]  
Epoch 8/100: 100%|██████████| 1131/1131 [00:23<00:00, 47.38it/s, acc=0.863, loss=0.587]
Epoch 9/100: 100%|██████████| 1131/1131 [00:23<00:00, 47.14it/s, acc=0.762, loss=0.592]
Epoch 10/100: 100%|██████████| 1131/1131 [00:23<00:00, 47