In [1]:
import pandas as pd
import numpy as np
from copy import copy
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time


predictions_submit = regression.predict(X_pred)
predictions_submit

submission = pd.DataFrame({'id': X_pred.index, 'price': predictions_submit})

submission.to_csv(r'.\data\processed\submission_1.csv', index=False)

submission.head()


In [2]:
class Cleansing:

    def __init__(self, dataframes, target_name, index=None):
        self.train = dataframes[0]
        self.test = dataframes[1]
        self.target_name = target_name
        self.target = self.train[target_name]
        if index:
            self.train = self.train.set_index(index)
            self.test = self.test.set_index(index)

    def impute_boxplot_min_max(self, list_of_columns, min=True, max=True):
        '''Imputes the outliers of a boxplot for the chosen columns to its min and max values'''
        for column in list_of_columns:
            q3, q1 = np.percentile(self.train[column], [75, 25])
            iqr = q3 - q1
            if min:
                self.train.loc[self.train[column] < q1 - 1.5*iqr, column] = q1 - 1.5*iqr
            if max:
                self.train.loc[self.train[column] > q3 + 1.5*iqr, column] = q3 + 1.5*iqr
        return self.train

    def remove_elements(self, conditioned_columns_list, condition, number):
        '''Removes the rows of a dataframe based on a condition'''
        for column in conditioned_columns_list:
            if condition == 'equal':
                self.train.drop(self.train[(self.train[column] == number)].index, inplace=True)
            elif condition == 'bigger':
                self.train.drop(self.train[(self.train[column] > number)].index, inplace=True)
            elif condition == 'bigger_or_equal':
                self.train.drop(self.train[(self.train[column] >= number)].index, inplace=True)   
            elif condition == 'smaller':
                self.train.drop(self.train[(self.train[column] < number)].index, inplace=True)   
            elif condition == 'smaller_or_equal':
                self.train.drop(self.train[(self.train[column] <= number)].index, inplace=True)
            return self.train

    def apply_scalar(self, method, list_of_columns=None):
        '''Applies the selected scalar method to a list of train and test dataframes for the chosen columns'''
        if method == 'log' and list_of_columns:
            for df in (self.train, self.test):
                for column in list_of_columns:
                    df[column] = np.log(df[column])
        elif method == 'standard':
            scaler = StandardScaler().fit(self.train.values)
            self.test = self.test.join(self.target)
            for df in (self.train, self.test):
                df.loc[:, :] = scaler.transform(df.values)
            self.test = self.test.drop(columns=self.target_name)
        return (self.train, self.test)


In [3]:
class Model:
    
    def __init__(self, dataframe, target_name, type='regression', index=None):
        self.target_name = target_name
        if index:
            self.dataframe = dataframe.set_index(index)
        else:
            self.dataframe = dataframe
        self.type = type

    def split_dataframe(self, train_num=0.7, random_num=43):
        '''Splits the dataframe, required to apply the models'''
        X = self.dataframe.drop(columns=self.target_name)
        y = self.dataframe[self.target_name]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, train_size=train_num, random_state=random_num)
        return (self.X_train, self.X_test, self.y_train, self.y_test)
    
    def apply_models(self, selected_list=None, excluded_list=None, params_list=None):
        '''Applies every selected model, all of them if none is selected'''
        if not excluded_list:
            excluded_list = []
        if not selected_list:
            selected_list = []
        current_time = time.time()
        self.models_regression = {'LinearRegression': '', # Write your regression models names as keys of the dict. Must be imported
                                    'Ridge': '', 
                                    'DecisionTreeRegressor': '', 
                                    'KNeighborsRegressor': '',
                                    'RandomForestRegressor': '',
                                    'SVR': ''
                                    }
                                    
        self.models_classification = {'LogisticRegression': '',  # Write your classification models names as keys of the dict. Must be imported
                                        'KNeighborsClassifier': ''
                                        }
        if self.type == 'regression':
            self.models = copy(self.models_regression)
        elif self.type == 'classification':
            self.models = copy(self.models_classification)
        self.models_previous = self.models.copy()
        for element in self.models_previous.keys():
            if (len(selected_list) >= 1 and element not in selected_list) or element in excluded_list:
                self.models.pop(element)
        for model_name in self.models.keys():
            self.models[model_name] = eval(model_name + '()')
        if params_list:
            for params in params_list:
                self.models[params[0]] = eval(params[0] + '(' + params[1] + ')')
                self.models[params[0] + ': ' + params[1]] = self.models.pop(params[0])
        total_time = time.time() - current_time
        for model_name, model in self.models.items():
            start_time = time.time()
            print(f'Starting {model_name}')
            model.fit(self.X_train, self.y_train)
            self.y_pred = model.predict(self.X_test)
            self.models[model_name] = {'test': np.array(self.y_test), 'prediction': self.y_pred, 'model': model}
            execution_time = time.time() - start_time
            total_time += execution_time
            print(f'- {model_name} done in {round(execution_time, 2)} sec(s). Total time: {round(total_time, 2)}')
        return self.models

    def evaluate_metrics(self, selection=None, params_list=False):
        '''Anotates regression metrics based on the real values and the predicion'''
        self.models_evaluated_previous = self.models
        self.models_evaluated = copy(self.models_evaluated_previous)
        if selection:
            for element in self.models_evaluated_previous.keys():
                if element not in selection:
                    self.models_evaluated.pop(element)
        if self.type == 'regression':
            for model_name, model_results in self.models_evaluated.items():
                rmse = mean_squared_error(model_results['test'], model_results['prediction'], squared=False)
                mse = mean_squared_error(model_results['test'], model_results['prediction'])
                mae = mean_absolute_error(model_results['test'], model_results['prediction'])
                r2 = r2_score(model_results['test'], model_results['prediction'])
                mape = mean_absolute_percentage_error(model_results['test'], model_results['prediction'])
                self.models_evaluated[model_name]['metrics'] = {'rmse': rmse, 'mse': mse, 'mae': mae, 'r2_score': r2, 'mape': mape}
        elif self.type == 'classification':
            for model_name, model_results in self.models_evaluated.items():
                accuracy = "accuracy_score (model_results['test'], model_results['prediction']"
                recall = "recall_score (model_results['test'], model_results['prediction']"
                precision = "precision_score (model_results['test'], model_results['prediction']"
                f1 = "f1_score (model_results['test'], model_results['prediction']"
                matrix = "confusion_matrix (model_results['test'], model_results['prediction']"
                list_of_metrics = []
                for element in (accuracy, recall, precision, f1, matrix):
                    if params_list:
                        for params in params_list:
                            if params[0] == element.split()[0]:
                                list_of_metrics.append(eval(element + "," + params[1] + ")"))
                            else:
                                list_of_metrics.append(eval(element + ")"))
                                continue
                    else:
                        list_of_metrics.append(eval(element + ")"))
                confusion = [element for element in list_of_metrics[-1]]
                self.models_evaluated[model_name]['metrics'] = {'accuracy': list_of_metrics[0], 'recall': list_of_metrics[1], 'precision': list_of_metrics[2], 'f1_score': list_of_metrics[3], 'confusion_matrix': confusion}
        return self.models_evaluated

    def create_dataframe(self):
        '''Returns a dataframe with the metrics of each model'''
        self.models_metrics = self.models_evaluated.copy()
        metrics_list = []
        best_values_list = []
        worst_values_list = []
        for model_name, model_results in self.models_evaluated.items():
            self.models_metrics[model_name] = self.models_metrics[model_name]['metrics']
            model_values = [value if type(value) is not list else sum([row[index] for index, row in enumerate(value)]) for value in self.models_evaluated[model_name]['metrics'].values()]
            if not metrics_list:
                metrics_list += [key for key in self.models_evaluated[model_name]['metrics'].keys()]
            if not best_values_list:
                best_values_list = [[model_name, value] for value in model_values]
                worst_values_list = [[model_name, value] for value in model_values]
            else:
                for index, value in enumerate(model_values):
                    if value > best_values_list[index][1] and self.type == 'classification':
                        best_values_list[index][1] = value
                        best_values_list[index][0] = model_name
                    if value < best_values_list[index][1] and self.type == 'regression':
                        best_values_list[index][1] = value
                        best_values_list[index][0] = model_name
                    if value < worst_values_list[index][1] and self.type == 'classification':
                        worst_values_list[index][1] = value
                        worst_values_list[index][0] = model_name
                    if value > worst_values_list[index][1] and self.type == 'regression':
                        worst_values_list[index][1] = value
                        worst_values_list[index][0] = model_name                    
        df = pd.DataFrame(data=self.models_metrics)
        best_values_list = [element[0] for element in best_values_list]
        worst_values_list = [element[0] for element in worst_values_list]
        if self.type == 'regression':
            not_worst_r2 = worst_values_list[-2]
            not_best_r2 = best_values_list[-2]
            worst_values_list = [element if element is not not_worst_r2 else not_best_r2 for element in worst_values_list]
            best_values_list = [element if element is not not_best_r2 else not_worst_r2 for element in worst_values_list]
        df['BEST'] = best_values_list
        df['WORST'] = worst_values_list
        return df

    def send_pickle(self):
        pass


In [4]:
df_train = pd.read_csv(r'.\data\processed\diamonds_train_1.csv').set_index('id')
X_pred = pd.read_csv(r'.\data\processed\diamonds_test_1.csv').set_index('id')


In [5]:
df_train.head()

Unnamed: 0_level_0,weight (carat),cut quality,color quality,clarity quality,depth (percentage),table (percentage),lenght (millimeters),width (millimeters),depth (millimeters),price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.3,3.0,6.0,1.0,62.4,58.0,4.31,4.28,2.68,6.353
1,1.01,4.0,5.0,5.0,62.7,56.0,6.42,6.46,4.04,9.183
2,0.72,4.0,4.0,3.0,61.8,59.0,5.71,5.74,3.54,7.983
3,1.08,2.0,3.0,1.0,63.2,57.0,6.54,6.5,4.12,8.371
4,0.36,3.0,3.0,4.0,62.3,59.0,4.5,4.55,2.82,6.588


In [6]:
# Para la primera ronda, aplicamos los modelos a los datos sin hacer ningún tratamiento
round_1 = Model(df_train, 'price')
round_1.split_dataframe()
round_1.apply_models()
round_1.evaluate_metrics()
round_1.create_dataframe()


Starting LinearRegression
- LinearRegression done in 0.03 sec(s). Total time: 0.03
Starting Ridge
- Ridge done in 0.01 sec(s). Total time: 0.04
Starting DecisionTreeRegressor
- DecisionTreeRegressor done in 0.36 sec(s). Total time: 0.41
Starting KNeighborsRegressor
- KNeighborsRegressor done in 1.49 sec(s). Total time: 1.9
Starting RandomForestRegressor
- RandomForestRegressor done in 29.53 sec(s). Total time: 31.43
Starting SVR
- SVR done in 58.61 sec(s). Total time: 90.04


Unnamed: 0,LinearRegression,Ridge,DecisionTreeRegressor,KNeighborsRegressor,RandomForestRegressor,SVR,BEST,WORST
rmse,0.222536,0.222424,0.129667,0.181378,0.097394,0.208683,RandomForestRegressor,LinearRegression
mse,0.049522,0.049472,0.016814,0.032898,0.009486,0.043548,RandomForestRegressor,LinearRegression
mae,0.122496,0.122548,0.087847,0.133349,0.066544,0.12632,RandomForestRegressor,KNeighborsRegressor
r2_score,0.952698,0.952745,0.98394,0.968577,0.99094,0.958404,LinearRegression,RandomForestRegressor
mape,0.01582,0.015827,0.011311,0.017571,0.008601,0.016377,RandomForestRegressor,KNeighborsRegressor
