In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

In [2]:
# models
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct
from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso
from sklearn.neighbors import KNeighborsRegressor
import sklearn.gaussian_process as gp
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor

# helpers etc.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, max_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from itertools import combinations
import time

# display
from IPython.core.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

import pickle

In [3]:
sns.set_style("white")
#x_size, y_size = 12,8
plt.rcParams.update({'font.size': 12})

### Regression results

In [4]:
df_results = pd.read_csv("regression_results.csv")
df_best_results = pd.read_csv("regression_results_best.csv")

### Settings

In [5]:
f = open("data\\counters_per_route.txt", encoding="utf8")
route_counters = {}

for l in f:
    if l.startswith("#") or (l == "\n"):
        continue
    ss = l.strip().split(";")
    route_id = ss[0] 
    #route_id = int(route_id)
    cs = ss[1:]  
    cs = list(map(lambda x: x.strip(), cs))
    if cs != ['']:
        route_counters[route_id] = cs
    
route_names = list(route_counters.keys())

#### Model settings and functions import

In [6]:
from params_and_helpers import *

### Regression

In [7]:
best_models = {}

for route in route_names:

    print("*********************")
    print('route ', route)
    
    file_name = r'data\route_' + route + '_counters.csv'
    df = pd.read_csv(file_name)
    #df = df.iloc[:,4:]
    df.dropna(inplace=True)

    X_full, Y_full = df_to_features_and_labels(df, only_positive_features, scale_features)  
    
    best_model = df_best_results[df_best_results['route'] == route].iloc[0]
    model = best_model.model
    features = eval(best_model.features)
    print('model: ', model)
    print('features: ', features)
    
    
    X = X_full[features]
    Y = Y_full
    
    regmod = GridSearchCV(models[model], grids[model], scoring='r2')
    
    if model == 'rfr': # regression forests -  to avoid UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names
        regmod.fit(X.values, Y)
        Y_pred = regmod.predict(X.values)

        print(model, "feature importance: ")
        print({name: round(importance, 3)
            for name, importance in zip(X_train_f.columns, regmod.best_estimator_.feature_importances_)})

    else:
        regmod.fit(X, Y)
        Y_pred = regmod.predict(X)

    best_models[route] = regmod
        
    R2 = r2_score(Y_pred, Y.values)
    MSE = mean_squared_error(Y_pred, Y.values)
    max_E = max_error(Y_pred, Y.values)
        
    print("train dataset R2:", R2)
    print('best parameters: ', regmod.best_params_)
    print('best score: ', regmod.best_score_)
    print('best parameters (estimator)', regmod.best_estimator_.get_params())    

*********************
route  Dunajska (from centre)
model:  krr
features:  ['workday', 'weather', 'time_x', 'time_y', '9000000656-1', '9000000655-2']
train dataset R2: 0.7824151871841628
best parameters:  {'alpha': 0.001, 'gamma': 0.001, 'kernel': 'chi2'}
best score:  0.7761744713865708
best parameters (estimator) {'alpha': 0.001, 'coef0': 1, 'degree': 3, 'gamma': 0.001, 'kernel': 'chi2', 'kernel_params': None}
*********************
route  Dunajska (to centre)
model:  krr
features:  ['workday', 'weather', 'time_x', 'time_y', '9000000656-2', '9000000655-1']
train dataset R2: 0.7364465693437422
best parameters:  {'alpha': 0.001, 'gamma': 0.001, 'kernel': 'chi2'}
best score:  0.7208437496844643
best parameters (estimator) {'alpha': 0.001, 'coef0': 1, 'degree': 3, 'gamma': 0.001, 'kernel': 'chi2', 'kernel_params': None}
*********************
route  Ižanska (from centre)
model:  krr
features:  ['workday', 'weather', 'time_x', 'time_y', '9000000820-1', '9000001506-1']
train dataset R2: 0.931

In [18]:
with open('best_models.pickle', 'wb') as handle:
    pickle.dump(best_models, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
# feature_labels
def set_feature_labels(features, sep=", "):
    features=", ".join(features)
    features=features.replace("workday, weather, time_x, time_y", "basic")
    features=features.replace("900000","")
    features=features.replace(" ","")
    features=features.replace(",",sep)
    
    return features

In [51]:
f = open('best_models.txt', 'w')

for route, regmod in best_models.items():
    best_model = df_best_results[df_best_results['route'] == route].iloc[0]
    model = best_model.model
    features = eval(best_model.features)
       
    
    print(r"\section*{"+route+r"}", file=f)

    print(r"\subsection*{Summary}", file=f)
    print(r"\begin{itemize}", file=f)
    print(r"\item Model: "+model, file=f)
    print(r"\item Features: "+set_feature_labels(features), file=f)
    print(r"\end{itemize}", file=f)
    
    print(r"\subsection*{Model parameters}", file=f)
    print(r"\begin{itemize}", file=f)
    for param, val in regmod.best_estimator_.get_params().items():
        if val != None:
            val = str(val).replace("_", r"\_")
            param = param.replace("_", r"\_")
            print(r"\item",param,"=",val, file=f)
    print(r"\end{itemize}", file=f)
f.close()

In [50]:
regmod.best_estimator_.get_params()

{'alpha': 0.0001,
 'coef0': 1,
 'degree': 3,
 'gamma': 0.01,
 'kernel': 'chi2',
 'kernel_params': None}