In [103]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

if str(os.getcwdb()[-3:]).split("'")[1] != 'src':
    os.chdir(os.path.dirname(os.getcwdb()))


In [104]:
import pandas as pd
import numpy as np
from copy import copy
import time
import pickle

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_validate

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score

from xgboost import XGBRegressor



class Model:


    def __init__(self, df, target_name, index=None):
        self.target_name = target_name
        self.index = index
        self.df = df


    @property
    def dataframe(self):
        if self.index:
            return self.df.set_index(self.index)
        else:
            return self.df


    @staticmethod
    def send_pickle():
        pass


    def split_dataframe(self, train_num=0.7, random_num=43, scaler=None):
        self.random_num = random_num
        X = self.dataframe.drop(columns=self.target_name)
        y = self.dataframe[self.target_name]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, train_size=train_num, random_state=self.random_num)
        if scaler:
            self.scaler = eval(scaler + '()')
            self.scaler_name = ' (' + scaler + ')'
            self.X_train = self.scaler.fit_transform(self.X_train)
            self.X_test = self.scaler.transform(self.X_test)        
        else:
            self.scaler_name = ''
        return (self.X_train, self.X_test, self.y_train, self.y_test)


    def prepare_models(self, selected_list=None, excluded_list=None, params_list=None):
        self.models = self.chosen_models.copy()
        if not excluded_list:
            excluded_list = []
        if not selected_list:
            selected_list = []
        self.models_previous = self.models.copy()
        for element in self.models_previous.keys():
            if (len(selected_list) >= 1 and element not in selected_list) or element in excluded_list:
                self.models.pop(element)
        for model_name in self.models.keys():
            self.models[model_name] = eval(model_name + '()')
        if params_list:
            for params in params_list:
                self.models[params[0] + ': ' + params[1]] = eval(params[0] + '(' + params[1] + ')')
            for params in params_list:
                    if params[0] in self.models:
                        try:
                            self.models.pop(params[0])
                        except Exception:
                            continue
        return 'Models prepared. Apply them or use kfold (apply + evaluate)'


    def apply_models(self):
        print(f'-- {self.type.capitalize()} --')
        current_time = time.time()
        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            start_time = time.time()
            print(f'Starting {model_name}:')
            model.fit(self.X_train, self.y_train)
            self.y_pred = model.predict(self.X_test)
            self.models[model_name] = {'test': np.array(self.y_test), 'prediction': self.y_pred, 'model': model}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models


    def create_dataframe(self, best_values_list, worst_values_list, models_metrics):
        self.df = pd.DataFrame(data=models_metrics)
        if best_values_list:
            best_values_list = [element[0] for element in best_values_list]
            worst_values_list = [element[0] for element in worst_values_list]
            self.df['BEST'] = best_values_list
            self.df['WORST'] = worst_values_list



class Regression(Model):
    chosen_models = dict()


    def __init__(self, dataframe, target_name, index=None):
        super().__init__(dataframe, target_name, index)
        self.type = 'regression'


    @classmethod
    def add_models(cls, regression_list):
        if regression_list:
            for element in regression_list:
                cls.chosen_models[element] = ''


    @classmethod
    def remove_models(cls, regression_list):
        if regression_list:
            for element in regression_list:
                cls.chosen_models.pop(element)

  
    def apply_and_evaluate_kfolds(self, kfolds_num=5):
        self.kfolds_num = kfolds_num
        self.kfolds = KFold(n_splits=kfolds_num, shuffle=True, random_state=self.random_num)
        self.kfold = 'fold'
        metrics = ['neg_root_mean_squared_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_absolute_percentage_error']
        self.models_evaluated = dict()
        print(f'-- {self.type.capitalize()}{self.scaler_name}: using mean of {self.kfolds_num} {self.kfold}s --')
        current_time = time.time()
        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            print(f'Starting {model_name}:')
            start_time = time.time()
            cross_val = cross_validate(model, self.X_train, self.y_train, cv=self.kfolds, return_estimator=True, scoring=metrics)
            list_of_metrics = list(cross_val.keys())[3:]
            self.models_evaluated[model_name] = dict()
            self.models_evaluated[model_name]['models'] = cross_val['estimator']
            self.models_evaluated[model_name]['metrics'] = {'rmse': abs(np.mean(list(cross_val.values())[3:][0])), 
                                                            'mse': abs(np.mean(list(cross_val.values())[3:][1])), 
                                                            'mae': abs(np.mean(list(cross_val.values())[3:][2])), 
                                                            'r2_score': np.mean(list(cross_val.values())[3:][3]), 
                                                            'mape': abs(np.mean(list(cross_val.values())[3:][4]))}
            self.models_evaluated[model_name]['all_metrics'] = {'rmse': list(map(abs, list(cross_val.values())[3:][0])), 
                                                            'mse': list(map(abs, list(cross_val.values())[3:][1])), 
                                                            'mae': list(map(abs, list(cross_val.values())[3:][2])), 
                                                            'r2_score': list(map(abs, list(cross_val.values())[3:][3])), 
                                                            'mape': list(map(abs, list(cross_val.values())[3:][4]))}
            self.models_evaluated[model_name]['variances'] = {'rmse': np.var(list(cross_val.values())[3:][0]), 
                                                            'mse': np.var(list(cross_val.values())[3:][1]), 
                                                            'mae': np.var(list(cross_val.values())[3:][2]), 
                                                            'r2_score': np.var(list(cross_val.values())[3:][3]), 
                                                            'mape': np.var(list(cross_val.values())[3:][4])}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models_evaluated


    def evaluate_metrics(self):
        self.models_evaluated = self.models.copy()
        for model_name, model_results in self.models_evaluated.items():
            rmse = mean_squared_error(model_results['test'], model_results['prediction'], squared=False)
            mse = mean_squared_error(model_results['test'], model_results['prediction'])
            mae = mean_absolute_error(model_results['test'], model_results['prediction'])
            r2 = r2_score(model_results['test'], model_results['prediction'])
            mape = mean_absolute_percentage_error(model_results['test'], model_results['prediction'])
            self.models_evaluated[model_name]['metrics'] = {'rmse': rmse, 'mse': mse, 'mae': mae, 'r2_score': r2, 'mape': mape}
        return self.models_evaluated


    def create_dataframe(self):
        models_metrics = self.models_evaluated.copy()
        best_values_list = []
        worst_values_list = []
        for model_name, model_results in self.models_evaluated.items():
            models_metrics[model_name] = models_metrics[model_name]['metrics']
            if len(models_metrics) > 1:
                model_values = [value if type(value) is not list else sum([row[index] for index, row in enumerate(value)]) for value in self.models_evaluated[model_name]['metrics'].values()]
                if not best_values_list:
                    best_values_list = [[model_name, value] for value in model_values]
                    worst_values_list = [[model_name, value] for value in model_values]
                else:
                    for index, value in enumerate(model_values):
                        if value < best_values_list[index][1]:
                            if index != 3:
                                best_values_list[index][1] = value
                                best_values_list[index][0] = model_name
                            else:
                                worst_values_list[index][1] = value
                                worst_values_list[index][0] = model_name
                        if value > worst_values_list[index][1]:
                            if index != 3:
                                worst_values_list[index][1] = value
                                worst_values_list[index][0] = model_name
                            else:
                                best_values_list[index][1] = value
                                best_values_list[index][0] = model_name
        super().create_dataframe(best_values_list, worst_values_list, models_metrics)
        return self.df


class Classification(Model):
    chosen_models = dict()


    def __init__(self, dataframe, target_name, index=None):
        super().__init__(dataframe, target_name, index)
        self.type = 'classification'


    @classmethod
    def add_models(cls, classification_list):
        if classification_list:
            for element in classification_list:
                cls.chosen_models[element] = ''


    @classmethod
    def remove_models(cls, classification_list):
        if classification_list:
            for element in classification_list:
                cls.chosen_models.pop(element)


    def apply_and_evaluate_kfolds(self, kfolds_num=5, multiclass_average=None):
        self.kfolds = StratifiedKFold(n_splits=kfolds_num, shuffle=True, random_state=self.random_num)
        self.kfolds_num = kfolds_num
        self.kfold = 'stratified fold'
        metrics = ['accuracy', 'recall', 'precision', 'f1']
        if multiclass_average == 'micro':
            metrics = ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro'] 
        elif multiclass_average == 'macro':
            metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'] 
        elif multiclass_average == 'samples':
            metrics = ['accuracy', 'precision_samples', 'recall_samples', 'f1_samples'] 
        elif multiclass_average == 'weighted':
            metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'] 
        self.models_evaluated = dict()
        print(f'-- {self.type.capitalize()}{self.scaler_name}: using mean of {self.kfolds_num} {self.kfold}s --')
        current_time = time.time()
        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            print(f'Starting {model_name}:')
            start_time = time.time()
            cross_val = cross_validate(model, self.X_train, self.y_train, cv=self.kfolds, return_estimator=True, scoring=metrics)
            self.models_evaluated[model_name] = dict()
            self.models_evaluated[model_name]['models'] = cross_val['estimator']
            self.models_evaluated[model_name]['metrics'] = {'accuracy': abs(np.mean(list(cross_val.values())[3:][0])), 
                                                            'recall': abs(np.mean(list(cross_val.values())[3:][1])), 
                                                            'precision': abs(np.mean(list(cross_val.values())[3:][2])), 
                                                            'f1_score': np.mean(list(cross_val.values())[3:][3])}
            self.models_evaluated[model_name]['all_metrics'] = {'accuracy': list(map(abs, list(cross_val.values())[3:][0])), 
                                                            'recall': list(map(abs, list(cross_val.values())[3:][1])), 
                                                            'precision': list(map(abs, list(cross_val.values())[3:][2])), 
                                                            'f1_score': list(map(abs, list(cross_val.values())[3:][3]))}
            self.models_evaluated[model_name]['variances'] = {'accuracy': np.var(list(cross_val.values())[3:][0]), 
                                                            'recall': np.var(list(cross_val.values())[3:][1]), 
                                                            'precision': np.var(list(cross_val.values())[3:][2]), 
                                                            'f1_score': np.var(list(cross_val.values())[3:][3])}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models_evaluated


    def evaluate_metrics(self, params_list=None):
        self.models_evaluated = self.models.copy()
        for model_name, model_results in self.models_evaluated.items():
            accuracy = "accuracy_score (model_results['test'], model_results['prediction']"
            recall = "recall_score (model_results['test'], model_results['prediction']"
            precision = "precision_score (model_results['test'], model_results['prediction']"
            f1 = "f1_score (model_results['test'], model_results['prediction']"
            matrix = "confusion_matrix (model_results['test'], model_results['prediction']"
            list_of_metrics = []
            for index, element in enumerate([accuracy, recall, precision, f1, matrix], 1):
                if params_list:
                    for params in params_list:
                        if params[0] == element.split()[0]:
                            element += ', ' + params[1] + ')'
                if element[-1] == ']':
                    element += ')'
                list_of_metrics.append(eval(element))
            print(list_of_metrics)
            confusion = [element for element in list_of_metrics[-1]]
            self.models_evaluated[model_name]['metrics'] = {'accuracy': list_of_metrics[0], 'recall': list_of_metrics[1], 'precision': list_of_metrics[2], 'f1_score': list_of_metrics[3], 'confusion_matrix': confusion}
        return self.models_evaluated


    def create_dataframe(self):
        models_metrics = self.models_evaluated.copy()
        best_values_list = []
        worst_values_list = []
        for model_name, model_results in self.models_evaluated.items():
            models_metrics[model_name] = models_metrics[model_name]['metrics']
            if len(models_metrics) > 1:
                model_values = [value if type(value) is not list else sum([row[index] for index, row in enumerate(value)]) for value in self.models_evaluated[model_name]['metrics'].values()]
                if not best_values_list:
                    best_values_list = [[model_name, value] for value in model_values]
                    worst_values_list = [[model_name, value] for value in model_values]
                else:
                    for index, value in enumerate(model_values):
                        if value > best_values_list[index][1]:
                            best_values_list[index][1] = value
                            best_values_list[index][0] = model_name
                        if value < worst_values_list[index][1]:
                            worst_values_list[index][1] = value
                            worst_values_list[index][0] = model_name
        super().create_dataframe(best_values_list, worst_values_list, models_metrics)
        return self.df


In [105]:
df_diamonds = pd.read_csv(r'data\processed\diamonds_training.csv', index_col='id')
df_predict = pd.read_csv(r'data\processed\diamonds_testing.csv', index_col='id')


In [106]:
Regression.add_models(['LinearRegression',
                        'XGBRegressor'
                        ]
                        )


In [107]:
round_1 = Regression(df_diamonds, 'price')
round_1.split_dataframe()
round_1.prepare_models()
dict_round_1 = round_1.apply_and_evaluate_kfolds()
round_1.create_dataframe()


-- Regression: using mean of 5 folds --
Starting LinearRegression:
- LinearRegression done in 0.15 sec(s). Total time: 0.15
Starting XGBRegressor:
- XGBRegressor done in 5.62 sec(s). Total time: 5.76


Unnamed: 0,LinearRegression,XGBRegressor,BEST,WORST
rmse,0.182664,0.090762,XGBRegressor,LinearRegression
mse,0.033669,0.008247,XGBRegressor,LinearRegression
mae,0.118476,0.065596,XGBRegressor,LinearRegression
r2_score,0.967188,0.991976,XGBRegressor,LinearRegression
mape,0.015317,0.008468,XGBRegressor,LinearRegression


In [108]:
dict_round_1


{'LinearRegression': {'models': [LinearRegression(),
   LinearRegression(),
   LinearRegression(),
   LinearRegression(),
   LinearRegression()],
  'metrics': {'rmse': 0.18266434126360717,
   'mse': 0.033668866217605245,
   'mae': 0.11847602510487627,
   'r2_score': 0.9671883402028569,
   'mape': 0.015316754357796062},
  'all_metrics': {'rmse': [0.18278436404836218,
    0.16134131610876856,
    0.21213340614001955,
    0.1873559277102721,
    0.16970669231061353],
   'mse': [0.0334101237405642,
    0.026031020283709582,
    0.04500058200056649,
    0.035102243648176705,
    0.02880036141500925],
   'mae': [0.1198527764880509,
    0.12009375582158771,
    0.11814603052977636,
    0.1176840754805362,
    0.1166034872044301],
   'r2_score': [0.9675277226414865,
    0.9749723070956142,
    0.9554574304863221,
    0.9657327540523,
    0.9722514867385628],
   'mape': [0.015509386119075434,
    0.015492198456540348,
    0.015267110508885734,
    0.015208411444627408,
    0.015106665259851387]

In [109]:
from sklearn.datasets import load_wine


X = load_wine(as_frame=True).data
y = load_wine(as_frame=True).target

df = pd.concat([X, y], axis=1)


In [110]:
Classification.add_models(['LogisticRegression',
                            ]
                            )

In [111]:
round_2 = Classification(df, 'target')

In [112]:
round_2.split_dataframe()


(     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
 97     12.29        1.41  1.98               16.0       85.0           2.55   
 53     13.77        1.90  2.68               17.1      115.0           3.00   
 20     14.06        1.63  2.28               16.0      126.0           3.00   
 30     13.73        1.50  2.70               22.5      101.0           3.00   
 160    12.36        3.83  2.38               21.0       88.0           2.30   
 ..       ...         ...   ...                ...        ...            ...   
 58     13.72        1.43  2.50               16.7      108.0           3.40   
 21     12.93        3.80  2.65               18.6      102.0           2.41   
 49     13.94        1.73  2.27               17.4      108.0           2.88   
 64     12.17        1.45  2.53               19.0      104.0           1.89   
 68     13.34        0.94  2.36               17.0      110.0           2.53   
 
      flavanoids  nonflavanoid_phenols

In [113]:
round_2.prepare_models()

'Models prepared. Apply them or use kfold (apply + evaluate)'

In [114]:
dict_round_2 = round_2.apply_and_evaluate_kfolds(multiclass_average='macro')

-- Classification: using mean of 5 stratified folds --
Starting LogisticRegression:
- LogisticRegression done in 0.18 sec(s). Total time: 0.18


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [115]:
round_2.create_dataframe()

Unnamed: 0,LogisticRegression
accuracy,0.926333
f1_score,0.931961
precision,0.932374
recall,0.937374


In [116]:
dict_round_2


{'LogisticRegression': {'models': [LogisticRegression(),
   LogisticRegression(),
   LogisticRegression(),
   LogisticRegression(),
   LogisticRegression()],
  'metrics': {'accuracy': 0.9263333333333333,
   'recall': 0.9373737373737374,
   'precision': 0.9323737373737375,
   'f1_score': 0.931960544428575},
  'all_metrics': {'accuracy': [1.0, 0.96, 0.96, 0.92, 0.7916666666666666],
   'recall': [1.0,
    0.9722222222222222,
    0.9629629629629629,
    0.9259259259259259,
    0.8257575757575758],
   'precision': [1.0,
    0.9583333333333334,
    0.9696969696969697,
    0.9393939393939394,
    0.7944444444444444],
   'f1_score': [1.0,
    0.9632850241545894,
    0.9645191409897292,
    0.9249999999999999,
    0.806998556998557]},
  'variances': {'accuracy': 0.005173777777777779,
   'recall': 0.0036760988107789423,
   'precision': 0.005142801754922971,
   'f1_score': 0.0044669200471591074}}}