In [None]:
from sys import path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import numpy as np
import pandas as pd
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

In [None]:
problem_dir = './ingestion_program/'  
score_dir = './scoring_program/'
model_dir = './sample_code_submission/'
result_dir = './sample_result_submission/'
path.append(model_dir); path.append(problem_dir); path.append(score_dir);
%matplotlib inline
%load_ext autoreload
%autoreload 2
sns.set()

In [None]:
from data_io import read_as_df

data_dir = './all_data'
data_name = 'xporters'

# read_as_df is handed the prefix "<data_dir>/<data_name>".
data = read_as_df(f'{data_dir}/{data_name}')

In [None]:
from libscores import get_metric

# get_metric() returns the metric's display name together with the scoring callable.
metric_name, scoring_function = get_metric()
print(f'Using scoring metric: {metric_name}')

In [None]:
from data_manager import DataManager

# Build the data manager for the dataset with replace_missing switched on.
D = DataManager(data_name, data_dir, replace_missing=True)

# Pull the 'X_train' and 'Y_train' entries out of the manager's data mapping.
X_train, Y_train = (D.data[key] for key in ('X_train', 'Y_train'))

In [None]:
# Model Imports
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor

In [None]:
# Scoring function
# Use the same spelling as the score_dir defined in the configuration cell
# ('./scoring_program/'): both spellings point at the same folder, and the
# identical string lets the membership test below recognise an existing entry.
score_dir = './scoring_program/'
if score_dir not in path:
    # Guarded so that re-running this cell does not keep growing sys.path.
    path.append(score_dir)
from libscores import get_metric
# Metric display name and the callable used to score predictions.
metric_name, scoring_function = get_metric()
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Model List
# Seed shared by every stochastic estimator so that a re-run of the notebook
# reproduces the same scores (the train/validation split is seeded with 42 too).
RANDOM_STATE = 42

# Base estimators combined by the voting ensemble.
voter1 = GradientBoostingRegressor(random_state=RANDOM_STATE)
voter2 = RandomForestRegressor(random_state=RANDOM_STATE)
voter3 = DecisionTreeRegressor(random_state=RANDOM_STATE)

# Single source of truth: display name -> estimator. Deriving both lists from
# this mapping guarantees that model_name[i] always describes model_list[i].
_candidate_models = {
    'KNeighbors': KNeighborsRegressor(n_neighbors=5),
    'DecisionTree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    # The decision tree is labelled 'dt' (it was mislabelled 'lr'), matching
    # the "GB - DT - RF" display name.
    'Voting - GB - DT - RF': VotingRegressor(
        estimators=[('gb', voter1), ('rf', voter2), ('dt', voter3)]),
}

model_name = list(_candidate_models)
model_list = list(_candidate_models.values())

In [None]:
# Compare every candidate model on one fixed 80/20 train/validation split.
#   perf_tr : mean 5-fold cross-validation score on the training split
#   perf_te : score of the model fitted on the whole training split,
#             evaluated on the held-out validation split
X_train = D.data['X_train']
Y_train = D.data['Y_train']
X_entrainement, X_validation, Y_entrainement, Y_validation = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42)

metric_name, scoring_function = get_metric()
scorer = make_scorer(scoring_function)  # built once, reused for every model

performance = {}
for name, M in zip(model_name, model_list):
    print(name)

    # cross_val_score works on clones of M, so it neither needs nor modifies
    # a fitted M.
    scores_train = cross_val_score(
        M, X_entrainement, Y_entrainement, cv=5, scoring=scorer)

    # Fit once on the full training split and score on data the model has
    # never seen. Previously the model was re-trained by cross-validation
    # *inside* the validation split, so the split was never used as held-out
    # data and the fitted model was never evaluated.
    M.fit(X_entrainement, Y_entrainement)
    score_test = scoring_function(Y_validation, M.predict(X_validation))

    performance[name] = (scores_train.mean(), score_test)

# Build the result table in one go instead of growing it row by row.
data_df = pd.DataFrame.from_dict(
    performance, orient='index', columns=['perf_tr', 'perf_te'])

# One explicit figure per chart so each label/title targets the right axes.
fig_bar, ax_bar = plt.subplots()
data_df[['perf_tr', 'perf_te']].plot.bar(ax=ax_bar)
ax_bar.set_ylabel(metric_name)
ax_bar.set_title("performance des modèles en histogramme")

fig_line, ax_line = plt.subplots()
data_df[['perf_tr', 'perf_te']].plot.line(ax=ax_line)
ax_line.set_ylabel(metric_name)
ax_line.set_title("performance des modèles en courbe")

plt.show()