In [203]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

# Move up one directory unless the working directory already ends in "src",
# so the relative paths used further down resolve from there.
# NOTE(review): this is not idempotent outside "src" -- every re-run of the cell
# climbs one more level; restart the kernel before re-running.
if not os.getcwdb().endswith(b'src'):
    os.chdir(os.path.dirname(os.getcwdb()))


In [204]:
import pandas as pd
import numpy as np
from copy import copy
import time
import pickle

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_validate

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score

from xgboost import XGBRegressor


class Model:
    """Shared workflow for splitting data, fitting several estimators and collecting predictions.

    Subclasses are expected to set ``self.type`` (read by ``apply_models`` for the
    header and the scoring choice) and, when cross-validation is wanted,
    ``self.kfolds_num``, ``self.kfolds`` and ``self.kfold``.
    """

    # Registry of estimator class names to evaluate, stored as ``{name: ''}``.
    chosen_models = dict()

    def __init_subclass__(cls, **kwargs):
        """Give each subclass its own registry.

        Subclasses register names with ``cls.chosen_models[name] = ''``; without a
        per-class dict that statement mutates the single dict defined on ``Model``,
        so models added for one subclass would show up in every other one.
        """
        super().__init_subclass__(**kwargs)
        if 'chosen_models' not in cls.__dict__:
            # Start from whatever was inherited so earlier registrations are kept.
            cls.chosen_models = dict(cls.chosen_models)

    def __init__(self, df, target_name, index=None):
        """Store the raw frame, the target column name and an optional index column."""
        self.target_name = target_name
        self.index = index
        self.df = df

    @property
    def dataframe(self):
        """The stored frame, re-indexed by ``self.index`` when one was given."""
        if self.index:
            return self.df.set_index(self.index)
        return self.df

    @staticmethod
    def send_pickle():
        """Placeholder; not implemented."""
        pass

    def split_dataframe(self, train_num=0.7, random_num=43, scaler=None):
        """Split the frame into train/test parts and optionally scale the features.

        Args:
            train_num: value passed to ``train_test_split`` as ``train_size``.
            random_num: random state; also stored on the instance as ``random_num``.
            scaler: name of a scaler class visible in this module (for example
                ``'StandardScaler'``), or ``None`` to skip scaling.

        Returns:
            ``(X_train, X_test, y_train, y_test)``. When a scaler is used the two
            feature sets are whatever ``fit_transform`` / ``transform`` return.
        """
        self.random_num = random_num
        frame = self.dataframe
        X = frame.drop(columns=self.target_name)
        y = frame[self.target_name]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, train_size=train_num, random_state=self.random_num
        )
        if scaler:
            # NOTE: eval() executes the text it is given; only pass trusted class names.
            self.scaler = eval(scaler + '()')
            self.scaler_name = ' (' + scaler + ')'
            # Fit on the training part only; the test part reuses those parameters.
            self.X_train = self.scaler.fit_transform(self.X_train)
            self.X_test = self.scaler.transform(self.X_test)
        else:
            self.scaler_name = ''
        return (self.X_train, self.X_test, self.y_train, self.y_test)

    def apply_models(self, selected_list=None, excluded_list=None, params_list=None):
        """Instantiate, fit and predict with every selected model.

        Args:
            selected_list: if non-empty, only these registered names are used.
            excluded_list: registered names to skip.
            params_list: iterable of ``[class_name, 'kwarg=value, ...']`` entries.
                Each one adds a model keyed ``'class_name: kwargs'`` and removes
                the default-constructed model of the same class name.

        Returns:
            ``self.models``: ``{model_key: {'test', 'prediction', 'model'}}``.
        """
        selected_list = selected_list or []
        excluded_list = excluded_list or []
        # Subclasses set kfolds_num; treat a missing attribute as "no cross-validation".
        kfolds_num = getattr(self, 'kfolds_num', None)
        current_time = time.time()

        self.models_previous = self.chosen_models.copy()
        self.models = {}
        for model_name in self.models_previous:
            if selected_list and model_name not in selected_list:
                continue
            if model_name in excluded_list:
                continue
            # NOTE: eval() executes the text it is given; names must come from trusted code.
            self.models[model_name] = eval(model_name + '()')

        if params_list:
            for params in params_list:
                # NOTE: eval() on caller-supplied constructor arguments; trusted input only.
                self.models[params[0] + ': ' + params[1]] = eval(params[0] + '(' + params[1] + ')')
            for params in params_list:
                # The parametrised variant replaces the default-constructed one.
                self.models.pop(params[0], None)

        if kfolds_num:
            print(f'-- {self.type.capitalize()}{self.scaler_name}: using best of {kfolds_num} {self.kfold}s --')
        else:
            print(f'-- {self.type.capitalize()} --')

        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            start_time = time.time()
            print(f'Starting {model_name}:')
            if kfolds_num:
                score_string = 'neg_mean_absolute_error' if self.type == 'regression' else 'accuracy'
                cross_val = cross_validate(
                    model, self.X_train, self.y_train,
                    cv=self.kfolds, return_estimator=True, scoring=score_string,
                )
                # Keep the estimator of the best-scoring fold (first one on ties).
                # The index is kept separate from the score so later scores are
                # never compared against an index.
                best_index = int(np.argmax(cross_val['test_score']))
                model = cross_val['estimator'][best_index]
            else:
                model.fit(self.X_train, self.y_train)
            self.y_pred = model.predict(self.X_test)
            self.models[model_name] = {'test': np.array(self.y_test), 'prediction': self.y_pred, 'model': model}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models

    def evaluate_metrics(self, selection_list=None):
        """Prepare ``self.models_evaluated`` for the subclass metric computation.

        Args:
            selection_list: if given, only these model keys are kept.

        The copy is shallow: the per-model result dicts are shared with ``self.models``.
        """
        self.models_evaluated_previous = self.models
        self.models_evaluated = copy(self.models_evaluated_previous)
        if selection_list:
            for element in self.models_evaluated_previous.keys():
                if element not in selection_list:
                    self.models_evaluated.pop(element)


class Regression(Model):
    """Regression flavour of ``Model``: K-fold cross-validation and error metrics."""

    # Own registry, so names added here are not shared through the dict on ``Model``.
    chosen_models = dict()

    # Metrics where a larger value means a better model; every other metric is
    # an error measure where smaller is better.
    _HIGHER_IS_BETTER = frozenset({'r2_score'})

    def __init__(self, dataframe, target_name, index=None):
        """Store the data through ``Model.__init__`` and mark the task type."""
        super().__init__(dataframe, target_name, index)
        self.type = 'regression'

    @classmethod
    def add_models(cls, regression_list):
        """Register estimator class names (strings) for later ``apply_models`` calls."""
        if regression_list:
            for element in regression_list:
                cls.chosen_models[element] = ''

    @classmethod
    def remove_models(cls, regression_list):
        """Unregister estimator class names; names that are not registered are ignored."""
        if regression_list:
            for element in regression_list:
                cls.chosen_models.pop(element, None)

    def apply_models(self, selected_list=None, excluded_list=None, params_list=None, kfolds_num=None):
        """Fit the registered models, optionally keeping the best of ``kfolds_num`` folds.

        Args:
            selected_list, excluded_list, params_list: forwarded to ``Model.apply_models``.
            kfolds_num: number of shuffled ``KFold`` splits, or ``None`` for a plain fit.
                Uses ``self.random_num``, which ``split_dataframe`` sets.

        Returns:
            The ``{model_key: results}`` dict produced by ``Model.apply_models``.
        """
        self.kfolds_num = kfolds_num
        if kfolds_num:
            self.kfolds = KFold(n_splits=kfolds_num, shuffle=True, random_state=self.random_num)
            self.kfold = 'fold'
        return super().apply_models(selected_list, excluded_list, params_list)

    def evaluate_metrics(self):
        """Attach RMSE, MSE, MAE, R2 and MAPE to every fitted model.

        Returns:
            ``self.models_evaluated`` with a ``'metrics'`` dict added per model.
        """
        super().evaluate_metrics(selection_list=None)
        for model_name, model_results in self.models_evaluated.items():
            actual = model_results['test']
            predicted = model_results['prediction']
            # The ``squared=False`` argument was deprecated in scikit-learn 1.4 and
            # later removed, so take the root explicitly: per-output MSE -> sqrt ->
            # mean. For a single target this is simply sqrt(mse).
            per_output_mse = mean_squared_error(actual, predicted, multioutput='raw_values')
            rmse = float(np.mean(np.sqrt(per_output_mse)))
            mse = mean_squared_error(actual, predicted)
            mae = mean_absolute_error(actual, predicted)
            r2 = r2_score(actual, predicted)
            mape = mean_absolute_percentage_error(actual, predicted)
            self.models_evaluated[model_name]['metrics'] = {'rmse': rmse, 'mse': mse, 'mae': mae, 'r2_score': r2, 'mape': mape}
        return self.models_evaluated

    def create_dataframe(self):
        """Tabulate the metrics (rows) per model (columns) plus BEST/WORST columns.

        For the error metrics the smallest value is best; for ``r2_score`` the
        largest value is best. Ties go to the model that was evaluated first.

        Returns:
            A DataFrame indexed by metric name.
        """
        self.models_metrics = {
            model_name: model_results['metrics']
            for model_name, model_results in self.models_evaluated.items()
        }
        df = pd.DataFrame(data=self.models_metrics)
        best_values_list = []
        worst_values_list = []
        for metric_name in df.index:
            ranked = [(model_name, metrics[metric_name]) for model_name, metrics in self.models_metrics.items()]
            lowest = min(ranked, key=lambda pair: pair[1])[0]
            highest = max(ranked, key=lambda pair: pair[1])[0]
            if metric_name in self._HIGHER_IS_BETTER:
                best_values_list.append(highest)
                worst_values_list.append(lowest)
            else:
                best_values_list.append(lowest)
                worst_values_list.append(highest)
        df['BEST'] = best_values_list
        df['WORST'] = worst_values_list
        return df


class Classification(Model):
    """Classification flavour of ``Model``: stratified K-fold and classification metrics."""

    # Own registry, so names added here are not shared through the dict on ``Model``.
    chosen_models = dict()

    def __init__(self, dataframe, target_name, index=None):
        """Store the data through ``Model.__init__`` and mark the task type."""
        super().__init__(dataframe, target_name, index)
        self.type = 'classification'

    @classmethod
    def add_models(cls, classification_list):
        """Register estimator class names (strings) for later ``apply_models`` calls."""
        if classification_list:
            for element in classification_list:
                cls.chosen_models[element] = ''

    @classmethod
    def remove_models(cls, classification_list):
        """Unregister estimator class names; names that are not registered are ignored."""
        if classification_list:
            for element in classification_list:
                cls.chosen_models.pop(element, None)

    def apply_models(self, selected_list=None, excluded_list=None, params_list=None, kfolds_num=None):
        """Fit the registered models, optionally keeping the best of ``kfolds_num`` folds.

        Args:
            selected_list, excluded_list, params_list: forwarded to ``Model.apply_models``.
            kfolds_num: number of shuffled ``StratifiedKFold`` splits, or ``None`` for a
                plain fit. Uses ``self.random_num``, which ``split_dataframe`` sets.

        Returns:
            The ``{model_key: results}`` dict produced by ``Model.apply_models``.
        """
        # Always record the value: the parent reads ``self.kfolds_num``, so leaving it
        # unset raised AttributeError, and a stale value from a previous call would
        # silently re-enable cross-validation.
        self.kfolds_num = kfolds_num
        if kfolds_num:
            self.kfolds = StratifiedKFold(n_splits=kfolds_num, shuffle=True, random_state=self.random_num)
            self.kfold = 'stratified fold'
        return super().apply_models(selected_list, excluded_list, params_list)

    def evaluate_metrics(self, params_list=None):
        """Attach accuracy, recall, precision, F1 and the confusion matrix to every model.

        Args:
            params_list: optional iterable of ``[metric_function_name, 'kwarg=value, ...']``
                entries, e.g. ``['recall_score', "average='macro'"]``. The keyword
                arguments are passed to the named metric function only.

        Returns:
            ``self.models_evaluated`` with a ``'metrics'`` dict added per model.
        """
        super().evaluate_metrics(selection_list=None)
        # (result key, metric function name as used in params_list, metric function)
        scorers = (
            ('accuracy', 'accuracy_score', accuracy_score),
            ('recall', 'recall_score', recall_score),
            ('precision', 'precision_score', precision_score),
            ('f1_score', 'f1_score', f1_score),
            ('confusion_matrix', 'confusion_matrix', confusion_matrix),
        )
        for model_name, model_results in self.models_evaluated.items():
            metric_kwargs = {}
            for params in (params_list or []):
                # NOTE: eval() executes the caller-supplied text; trusted input only.
                # It runs here so the text can still refer to ``model_results``.
                metric_kwargs.setdefault(params[0], {}).update(eval('dict(' + params[1] + ')'))
            # Each metric is computed exactly once, whatever the length of params_list.
            metrics = {}
            for key, function_name, scorer in scorers:
                metrics[key] = scorer(
                    model_results['test'],
                    model_results['prediction'],
                    **metric_kwargs.get(function_name, {}),
                )
            # Stored as a list of rows; create_dataframe relies on the list type.
            metrics['confusion_matrix'] = [row for row in metrics['confusion_matrix']]
            self.models_evaluated[model_name]['metrics'] = metrics
        return self.models_evaluated

    @staticmethod
    def _comparable(value):
        """Reduce a metric to one number: a list of rows becomes the sum of its diagonal."""
        if type(value) is list:
            return sum(row[position] for position, row in enumerate(value))
        return value

    def create_dataframe(self):
        """Tabulate the metrics (rows) per model (columns) plus BEST/WORST columns.

        Larger is better for every metric; the confusion matrix is compared by the
        sum of its diagonal. Ties go to the model that was evaluated first.

        Returns:
            A DataFrame indexed by metric name.
        """
        self.models_metrics = {
            model_name: model_results['metrics']
            for model_name, model_results in self.models_evaluated.items()
        }
        best = {}
        worst = {}
        for model_name, metrics in self.models_metrics.items():
            for metric_name, value in metrics.items():
                score = self._comparable(value)
                if metric_name not in best or score > best[metric_name][1]:
                    best[metric_name] = (model_name, score)
                if metric_name not in worst or score < worst[metric_name][1]:
                    worst[metric_name] = (model_name, score)
        df = pd.DataFrame(data=self.models_metrics)
        df['BEST'] = [best[metric_name][0] for metric_name in df.index]
        df['WORST'] = [worst[metric_name][0] for metric_name in df.index]
        return df


In [205]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler


class Cleansing:
    """Outlier capping, row filtering and scaling for a (train, test) pair of frames."""

    # Maps the ``condition`` keyword of ``remove_elements`` to the name of the
    # pandas Series comparison method that implements it.
    _CONDITIONS = {
        'equal': 'eq',
        'bigger': 'gt',
        'bigger_or_equal': 'ge',
        'smaller': 'lt',
        'smaller_or_equal': 'le',
    }

    def __init__(self, dataframes, target_name, index=None):
        """Store the frames.

        Args:
            dataframes: sequence whose first item is the train frame and second the test frame.
            target_name: name of the target column in the train frame.
            index: optional column to use as index for both frames.
        """
        if index:
            self.train = dataframes[0].set_index(index)
            self.test = dataframes[1].set_index(index)
        else:
            # The methods below modify ``self.train``; work on a copy so the
            # caller's frame is left alone, as it already is when ``index`` is given.
            self.train = dataframes[0].copy()
            self.test = dataframes[1]
        self.target_name = target_name
        # Taken after the optional re-indexing so it shares the train frame's index.
        self.target = self.train[target_name]

    def impute_boxplot_min_max(self, list_of_columns, min=True, max=True):
        '''Caps boxplot outliers of the chosen train columns at the whisker limits.

        Values below ``q1 - 1.5*iqr`` (when ``min``) and above ``q3 + 1.5*iqr``
        (when ``max``) are replaced by that limit. Returns the train frame.
        '''
        for column in list_of_columns:
            q1, q3 = np.percentile(self.train[column], [25, 75])
            iqr = q3 - q1
            lower_limit = q1 - 1.5*iqr
            upper_limit = q3 + 1.5*iqr
            if min:
                self.train.loc[self.train[column] < lower_limit, column] = lower_limit
            if max:
                self.train.loc[self.train[column] > upper_limit, column] = upper_limit
        return self.train

    def remove_elements(self, conditioned_columns_list, condition, number):
        '''Removes the train rows where any listed column meets the condition.

        Args:
            conditioned_columns_list: columns to check, one after another.
            condition: 'equal', 'bigger', 'bigger_or_equal', 'smaller' or 'smaller_or_equal'.
            number: value each column is compared against.

        Returns:
            The filtered train frame (also when the column list is empty).

        Raises:
            ValueError: if ``condition`` is not one of the supported keywords.
        '''
        if condition not in self._CONDITIONS:
            raise ValueError(
                f'Unknown condition {condition!r}; expected one of {sorted(self._CONDITIONS)}'
            )
        comparison = self._CONDITIONS[condition]
        for column in conditioned_columns_list:
            matches = getattr(self.train[column], comparison)(number)
            self.train = self.train.drop(index=self.train.index[matches.to_numpy()])
        # Returned after the loop so every listed column is processed.
        return self.train

    def apply_scalar(self, method, list_of_columns=None):
        '''Returns scaled copies ``(train, test)``; the stored frames are not changed.

        Args:
            method: 'log' (natural log of ``list_of_columns`` in both frames),
                'standard' (StandardScaler) or 'minmax' (MinMaxScaler). The two
                scalers are fitted on every train column, target included. Any
                other value, or 'log' without columns, returns unscaled copies.
            list_of_columns: columns to transform; only used by 'log'.
        '''
        scaled_train = self.train.copy()
        scaled_test = self.test.copy()
        if method == 'log' and list_of_columns:
            for df in (scaled_train, scaled_test):
                for column in list_of_columns:
                    df[column] = np.log(df[column])
        elif method in ('standard', 'minmax'):
            scaler_class = StandardScaler if method == 'standard' else MinMaxScaler
            scaler = scaler_class().fit(scaled_train.values)
            scaled_train.loc[:, :] = scaler.transform(scaled_train.values)
            feature_columns = [column for column in scaled_test.columns if column != self.target_name]
            # Line the test columns up with the train columns the scaler was fitted
            # on; the absent target becomes a NaN placeholder that is dropped again.
            # This keeps each column paired with its own scaling parameters wherever
            # the target sits in the train frame.
            padded_test = scaled_test.reindex(columns=scaled_train.columns)
            padded_test.loc[:, :] = scaler.transform(padded_test.values)
            scaled_test = padded_test[feature_columns]
        return (scaled_train, scaled_test)


In [206]:
# Build the paths with os.path.join: a hard-coded backslash is only a path
# separator on Windows, so the literal paths failed everywhere else.
df_diamonds = pd.read_csv(os.path.join('data', 'processed', 'diamonds_training.csv'), index_col='id')
df_predict = pd.read_csv(os.path.join('data', 'processed', 'diamonds_testing.csv'), index_col='id')


In [207]:
# Register the estimator class names that the regression rounds will evaluate.
Regression.add_models(['LinearRegression', 'XGBRegressor'])


In [208]:
# First experiment: predict the 'price' column of the training frame.
round_1 = Regression(dataframe=df_diamonds, target_name='price')


In [209]:
# Same call as the bare default, with the split settings spelled out:
# 70 % of the rows for training, random state 43, no feature scaling.
round_1.split_dataframe(train_num=0.7, random_num=43, scaler=None)



(       weight (carat)  cut quality  color quality  clarity quality  \
 id                                                                   
 26690            1.38          1.0            0.0              1.0   
 15750            0.70          4.0            4.0              2.0   
 37107            1.51          4.0            0.0              5.0   
 34550            0.23          2.0            4.0              7.0   
 31691            1.50          3.0            3.0              2.0   
 ...               ...          ...            ...              ...   
 5307             1.56          2.0            4.0              2.0   
 25233            1.53          4.0            2.0              2.0   
 18448            0.52          2.0            4.0              2.0   
 19776            1.20          1.0            3.0              3.0   
 14148            1.73          3.0            2.0              3.0   
 
        depth (percentage)  table (percentage)  lenght (millimeters)  \
 id

In [210]:
# Fit every registered model with 10-fold cross-validation; XGBRegressor is
# built with a fixed random_state instead of its default constructor.
round_1.apply_models(
    params_list=[['XGBRegressor', 'random_state=43']],
    kfolds_num=10,
)


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.42 sec(s). Total time: 0.42
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 12.74 sec(s). Total time: 13.16


In [211]:
# Compute rmse, mse, mae, r2_score and mape for each fitted model; the returned
# dict also carries the test values, the predictions and the fitted estimator.
round_1.evaluate_metrics()


{'LinearRegression': {'test': array([8.069, 9.093, 8.297, ..., 9.234, 8.818, 8.368]),
  'prediction': array([8.1751789 , 8.94534424, 7.94609508, ..., 9.13303572, 8.78005708,
         8.22465851]),
  'model': LinearRegression(),
  'metrics': {'rmse': 0.222037823248969,
   'mse': 0.0493007949531404,
   'mae': 0.12289980638635319,
   'r2_score': 0.9529090797126748,
   'mape': 0.01587291534328809}},
 'XGBRegressor: random_state=43': {'test': array([8.069, 9.093, 8.297, ..., 9.234, 8.818, 8.368]),
  'prediction': array([8.279677, 9.100804, 8.133977, ..., 9.290244, 8.891319, 8.354892],
        dtype=float32),
  'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
               interaction_cons

In [212]:
# Summary table: one row per metric, one column per model, plus the names of
# the best and worst model for that metric.
round_1.create_dataframe()

Unnamed: 0,LinearRegression,XGBRegressor: random_state=43,BEST,WORST
rmse,0.222038,0.093408,XGBRegressor: random_state=43,LinearRegression
mse,0.049301,0.008725,XGBRegressor: random_state=43,LinearRegression
mae,0.1229,0.066317,XGBRegressor: random_state=43,LinearRegression
r2_score,0.952909,0.991666,XGBRegressor: random_state=43,LinearRegression
mape,0.015873,0.008558,XGBRegressor: random_state=43,LinearRegression


In [213]:
# chosen_models is stored on the class, not on the instance, so a new instance
# already sees the models registered earlier through Regression.add_models.
round_2 = Regression(df_diamonds, 'price')

round_2.chosen_models

{'LinearRegression': '', 'XGBRegressor': ''}