In [1]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

if str(os.getcwdb()[-3:]).split("'")[1] != 'src':
    os.chdir(os.path.dirname(os.getcwdb()))


In [2]:
import pandas as pd
import numpy as np
from copy import copy
import time
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_validate

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score

from xgboost import XGBRegressor



class Model:


    def __init__(self, df, target_name, index=None):
        self.target_name = target_name
        self.index = index
        self.df = df


    @property
    def dataframe(self):
        if self.index:
            return self.df.set_index(self.index)
        else:
            return self.df


    @staticmethod
    def send_pickle():
        pass


    def split_dataframe(self, train_num=0.7, random_num=43, scaler=None, return_entire_Xy=False):
        self.random_num = random_num
        X = self.dataframe.drop(columns=self.target_name)
        y = self.dataframe[self.target_name]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, train_size=train_num, random_state=self.random_num)
        if scaler:
            self.scaler = eval(scaler + '()')
            self.scaler_name = ' (' + scaler + ')'
            self.X_train = self.scaler.fit_transform(self.X_train)
            self.X_test = self.scaler.transform(self.X_test)        
            if return_entire_Xy:
                self.scaler = eval(scaler + '()')
                X = self.scaler.fit_transform(X)
        else:
            self.scaler_name = ''
        if return_entire_Xy:
            return (X, y)
        else:
            return (self.X_train, self.X_test, self.y_train, self.y_test)


    def prepare_models(self, selected_list=None, excluded_list=None, params_list=None):
        self.models = self.chosen_models.copy()
        if not excluded_list:
            excluded_list = []
        if not selected_list:
            selected_list = []
        self.models_previous = self.models.copy()
        for element in self.models_previous.keys():
            if (len(selected_list) >= 1 and element not in selected_list) or element in excluded_list:
                self.models.pop(element)
        for model_name in self.models.keys():
            self.models[model_name] = eval(model_name + '()')
        if params_list:
            for params in params_list:
                self.models[params[0] + ': ' + params[1]] = eval(params[0] + '(' + params[1] + ')')
            for params in params_list:
                    if params[0] in self.models:
                        try:
                            self.models.pop(params[0])
                        except Exception:
                            continue
        return 'Models prepared. Apply them or use kfold (apply + evaluate)'


    def apply_models(self):
        print(f'-- {self.type.capitalize()} --')
        current_time = time.time()
        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            start_time = time.time()
            print(f'Starting {model_name}:')
            model.fit(self.X_train, self.y_train)
            self.y_pred = model.predict(self.X_test)
            self.models[model_name] = {'test': np.array(self.y_test), 'prediction': self.y_pred, 'model': model}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models


    def create_dataframe(self, best_values_list, worst_values_list):
        self.df = pd.DataFrame(data=self.models_metrics)
        if best_values_list:
            best_values_list = [element[0] for element in best_values_list]
            worst_values_list = [element[0] for element in worst_values_list]
            self.df['BEST'] = best_values_list
            self.df['WORST'] = worst_values_list


    def visualize(self, metrics_selection=None):
        visualization_dict = {'models': [model_name for model_name in self.models_metrics.keys() for metric in self.models_metrics[model_name] if (not metrics_selection or metric in metrics_selection)],
                              'metrics': [metric for model_name in self.models_metrics.keys() for metric in self.models_metrics[model_name] if (not metrics_selection or metric in metrics_selection)],
                              'values': [self.models_metrics[model_name][metric] for model_name in self.models_metrics.keys() for metric in self.models_metrics[model_name] if (not metrics_selection or metric in metrics_selection)]
                              }
        sns.lineplot(data=visualization_dict, x='models', y='values', hue='metrics')
        plt.tick_params(axis='x', labelrotation = 30)
        plt.title('Medicine price by date')
        plt.show()



class Regression(Model):
    chosen_models = dict()


    def __init__(self, dataframe, target_name, index=None):
        super().__init__(dataframe, target_name, index)
        self.type = 'regression'


    @classmethod
    def add_models(cls, regression_list):
        if regression_list:
            for element in regression_list:
                cls.chosen_models[element] = ''


    @classmethod
    def remove_models(cls, regression_list):
        if regression_list:
            for element in regression_list:
                cls.chosen_models.pop(element)

  
    def apply_and_evaluate_kfolds(self, kfolds_num=5):
        self.kfolds_num = kfolds_num
        self.kfolds = KFold(n_splits=kfolds_num, shuffle=True, random_state=self.random_num)
        self.kfold = 'fold'
        metrics = ['neg_root_mean_squared_error', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_absolute_percentage_error']
        self.models_evaluated = dict()
        print(f'-- {self.type.capitalize()}{self.scaler_name}: using mean of {self.kfolds_num} {self.kfold}s --')
        current_time = time.time()
        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            print(f'Starting {model_name}:')
            start_time = time.time()
            cross_val = cross_validate(model, self.X_train, self.y_train, cv=self.kfolds, return_estimator=True, scoring=metrics)
            list_of_metrics = list(cross_val.keys())[3:]
            self.models_evaluated[model_name] = dict()
            self.models_evaluated[model_name]['models'] = cross_val['estimator']
            self.models_evaluated[model_name]['metrics'] = {'rmse': abs(np.mean(list(cross_val.values())[3:][0])), 
                                                            'mse': abs(np.mean(list(cross_val.values())[3:][1])), 
                                                            'mae': abs(np.mean(list(cross_val.values())[3:][2])), 
                                                            'r2_score': np.mean(list(cross_val.values())[3:][3]), 
                                                            'mape': abs(np.mean(list(cross_val.values())[3:][4]))}
            self.models_evaluated[model_name]['all_metrics'] = {'rmse': list(map(abs, list(cross_val.values())[3:][0])), 
                                                            'mse': list(map(abs, list(cross_val.values())[3:][1])), 
                                                            'mae': list(map(abs, list(cross_val.values())[3:][2])), 
                                                            'r2_score': list(map(abs, list(cross_val.values())[3:][3])), 
                                                            'mape': list(map(abs, list(cross_val.values())[3:][4]))}
            self.models_evaluated[model_name]['variances'] = {'rmse': np.var(list(cross_val.values())[3:][0]), 
                                                            'mse': np.var(list(cross_val.values())[3:][1]), 
                                                            'mae': np.var(list(cross_val.values())[3:][2]), 
                                                            'r2_score': np.var(list(cross_val.values())[3:][3]), 
                                                            'mape': np.var(list(cross_val.values())[3:][4])}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models_evaluated


    def evaluate_metrics(self):
        self.models_evaluated = self.models.copy()
        for model_name, model_results in self.models_evaluated.items():
            rmse = mean_squared_error(model_results['test'], model_results['prediction'], squared=False)
            mse = mean_squared_error(model_results['test'], model_results['prediction'])
            mae = mean_absolute_error(model_results['test'], model_results['prediction'])
            r2 = r2_score(model_results['test'], model_results['prediction'])
            mape = mean_absolute_percentage_error(model_results['test'], model_results['prediction'])
            self.models_evaluated[model_name]['metrics'] = {'rmse': rmse, 'mse': mse, 'mae': mae, 'r2_score': r2, 'mape': mape}
        return self.models_evaluated


    def create_dataframe(self, chosen_metric='mean'):
        self.models_metrics = self.models_evaluated.copy()
        best_values_list = []
        worst_values_list = []
        if chosen_metric == 'mean':
            chosen_metric = 'metrics'
        for model_name, model_results in self.models_evaluated.items():
            self.models_metrics[model_name] = self.models_metrics[model_name][chosen_metric]
            if len(self.models_metrics) > 1:
                model_values = [value if type(value) is not list else sum([row[index] for index, row in enumerate(value)]) for value in self.models_evaluated[model_name][chosen_metric].values()]
                if not best_values_list:
                    best_values_list = [[model_name, value] for value in model_values]
                    worst_values_list = [[model_name, value] for value in model_values]
                else:
                    for index, value in enumerate(model_values):
                        if value < best_values_list[index][1]:
                            if index != 3:
                                best_values_list[index][1] = value
                                best_values_list[index][0] = model_name
                            else:
                                worst_values_list[index][1] = value
                                worst_values_list[index][0] = model_name
                        if value > worst_values_list[index][1]:
                            if index != 3:
                                worst_values_list[index][1] = value
                                worst_values_list[index][0] = model_name
                            else:
                                best_values_list[index][1] = value
                                best_values_list[index][0] = model_name
        super().create_dataframe(best_values_list, worst_values_list)
        return self.df



class Classification(Model):
    chosen_models = dict()


    def __init__(self, dataframe, target_name, index=None):
        super().__init__(dataframe, target_name, index)
        self.type = 'classification'


    @classmethod
    def add_models(cls, classification_list):
        if classification_list:
            for element in classification_list:
                cls.chosen_models[element] = ''


    @classmethod
    def remove_models(cls, classification_list):
        if classification_list:
            for element in classification_list:
                cls.chosen_models.pop(element)


    def apply_and_evaluate_kfolds(self, kfolds_num=5, multiclass_average=None):
        self.kfolds = StratifiedKFold(n_splits=kfolds_num, shuffle=True, random_state=self.random_num)
        self.kfolds_num = kfolds_num
        self.kfold = 'stratified fold'
        metrics = ['accuracy', 'recall', 'precision', 'f1']
        if multiclass_average == 'micro':
            metrics = ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro'] 
        elif multiclass_average == 'macro':
            metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'] 
        elif multiclass_average == 'samples':
            metrics = ['accuracy', 'precision_samples', 'recall_samples', 'f1_samples'] 
        elif multiclass_average == 'weighted':
            metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'] 
        self.models_evaluated = dict()
        print(f'-- {self.type.capitalize()}{self.scaler_name}: using mean of {self.kfolds_num} {self.kfold}s --')
        current_time = time.time()
        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            print(f'Starting {model_name}:')
            start_time = time.time()
            cross_val = cross_validate(model, self.X_train, self.y_train, cv=self.kfolds, return_estimator=True, scoring=metrics)
            self.models_evaluated[model_name] = dict()
            self.models_evaluated[model_name]['models'] = cross_val['estimator']
            self.models_evaluated[model_name]['metrics'] = {'accuracy': abs(np.mean(list(cross_val.values())[3:][0])), 
                                                            'recall': abs(np.mean(list(cross_val.values())[3:][1])), 
                                                            'precision': abs(np.mean(list(cross_val.values())[3:][2])), 
                                                            'f1_score': np.mean(list(cross_val.values())[3:][3])}
            self.models_evaluated[model_name]['all_metrics'] = {'accuracy': list(map(abs, list(cross_val.values())[3:][0])), 
                                                            'recall': list(map(abs, list(cross_val.values())[3:][1])), 
                                                            'precision': list(map(abs, list(cross_val.values())[3:][2])), 
                                                            'f1_score': list(map(abs, list(cross_val.values())[3:][3]))}
            self.models_evaluated[model_name]['variances'] = {'accuracy': np.var(list(cross_val.values())[3:][0]), 
                                                            'recall': np.var(list(cross_val.values())[3:][1]), 
                                                            'precision': np.var(list(cross_val.values())[3:][2]), 
                                                            'f1_score': np.var(list(cross_val.values())[3:][3])}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models_evaluated


    def evaluate_metrics(self, params_list=None):
        self.models_evaluated = self.models.copy()
        for model_name, model_results in self.models_evaluated.items():
            accuracy = "accuracy_score (model_results['test'], model_results['prediction']"
            recall = "recall_score (model_results['test'], model_results['prediction']"
            precision = "precision_score (model_results['test'], model_results['prediction']"
            f1 = "f1_score (model_results['test'], model_results['prediction']"
            matrix = "confusion_matrix (model_results['test'], model_results['prediction']"
            list_of_metrics = []
            for index, element in enumerate([accuracy, recall, precision, f1, matrix], 1):
                if params_list:
                    for params in params_list:
                        if params[0] == element.split()[0]:
                            element += ', ' + params[1] + ')'
                if element[-1] == ']':
                    element += ')'
                list_of_metrics.append(eval(element))
            print(list_of_metrics)
            confusion = [element for element in list_of_metrics[-1]]
            self.models_evaluated[model_name]['metrics'] = {'accuracy': list_of_metrics[0], 'recall': list_of_metrics[1], 'precision': list_of_metrics[2], 'f1_score': list_of_metrics[3], 'confusion_matrix': confusion}
        return self.models_evaluated


    def create_dataframe(self, chosen_metric='mean'):
        self.models_metrics = self.models_evaluated.copy()
        best_values_list = []
        worst_values_list = []
        if chosen_metric == 'mean':
            chosen_metric = 'metrics'
        for model_name, model_results in self.models_evaluated.items():
            self.models_metrics[model_name] = self.models_metrics[model_name][chosen_metric]
            if len(self.models_metrics) > 1:
                model_values = [value if type(value) is not list else sum([row[index] for index, row in enumerate(value)]) for value in self.models_evaluated[model_name][chosen_metric].values()]
                if not best_values_list:
                    best_values_list = [[model_name, value] for value in model_values]
                    worst_values_list = [[model_name, value] for value in model_values]
                else:
                    for index, value in enumerate(model_values):
                        if value > best_values_list[index][1]:
                            best_values_list[index][1] = value
                            best_values_list[index][0] = model_name
                        if value < worst_values_list[index][1]:
                            worst_values_list[index][1] = value
                            worst_values_list[index][0] = model_name
        super().create_dataframe(best_values_list, worst_values_list)
        return self.df


In [28]:
df_diamonds = pd.read_csv(r'data\processed\diamonds_training.csv', index_col='id')
df_predict = pd.read_csv(r'data\processed\diamonds_testing.csv', index_col='id')


In [None]:
def apply_ridge(df):
    '''Uses ridge to impute a few outliers from the depth (millimeters) column of the diamonds dataframe'''
    q3, q1 = np.percentile(df['depth (millimeters)'], [75, 25])
    iqr = q3 - q1
    y_test = df[(df['depth (millimeters)'] > q3 + 1.5*iqr) | (df['depth (millimeters)'] < q1 - 1.5*iqr)]['depth (millimeters)']
    y_train = df.drop(y_test.index)['depth (millimeters)']
    X_train = df.drop(y_test.index)[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)']]
    X_test = df[(df['depth (millimeters)'] > q3 + 1.5*iqr) | (df['depth (millimeters)'] < q1 - 1.5*iqr)][['weight (carat)', 'lenght (millimeters)', 'width (millimeters)']]
    ridge = Ridge()
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    df_depth = pd.DataFrame(data={'Original depth': y_test, 'Predicted depth': y_pred})
    for index in df_depth.index:
        df.loc[index, 'depth (millimeters)'] = df_depth.loc[index, 'Predicted depth']
    return df


In [82]:
cols = ['width (millimeters)', 'lenght (millimeters)', 'depth (millimeters)']
df = df_diamonds.copy()
for index, col in enumerate(cols):
    cols.pop(index)
    df_train = df.drop(df[(df[cols[0]] == 0) | (df[cols[1]] == 0) | (df[col] == 0)].index)
    df_test = df[df[col] == 0]
    X_train = df_train.drop(columns=col)
    y_train = df_train[col]
    x_test = df_test.drop(columns=col)
    y_test = df_test[col]
    cols.append(col)
    ridge = Ridge()
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(x_test)
    df[col + ' ridge'] = 0
    df.loc[df[col] == 0, col + ' ridge'] = y_pred



In [83]:
df

Unnamed: 0_level_0,weight (carat),cut quality,color quality,clarity quality,depth (percentage),table (percentage),lenght (millimeters),width (millimeters),depth (millimeters),price,width (millimeters) ridge,depth (millimeters) ridge
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.30,3.0,6.0,1.0,62.4,58.0,4.31,4.28,2.68,6.353,0.0,0.0
1,1.01,4.0,5.0,5.0,62.7,56.0,6.42,6.46,4.04,9.183,0.0,0.0
2,0.72,4.0,4.0,3.0,61.8,59.0,5.71,5.74,3.54,7.983,0.0,0.0
3,1.08,2.0,3.0,1.0,63.2,57.0,6.54,6.50,4.12,8.371,0.0,0.0
4,0.36,3.0,3.0,4.0,62.3,59.0,4.50,4.55,2.82,6.588,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0.42,3.0,6.0,1.0,62.1,59.0,4.78,4.82,2.98,6.551,0.0,0.0
40451,0.53,3.0,3.0,3.0,62.0,58.0,5.21,5.18,3.22,7.382,0.0,0.0
40452,0.80,1.0,3.0,1.0,62.8,58.0,5.86,5.90,3.69,7.768,0.0,0.0
40453,1.01,2.0,4.0,3.0,61.5,57.0,6.40,6.48,3.96,8.726,0.0,0.0
