In [1]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor

data = pd.read_csv(r'data\london_merged.csv')

np.random.seed(0)

#target = data['cnt']
#data = data.drop(['cnt'], axis=1)

#Print data shape
#print(target.shape)
#print(data.shape)

#Take a look at nulls 0 nulls
#print(target.isnull().sum())
#print(data.isnull().sum())

#lets create a 2 new feautures
# Hour time stamp contains the year and the month,
# we will create different columns for each one

data['year'] = data['timestamp'].apply(lambda row: row[:4])
data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2] )
data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:] )
'''
print(data['year'])
print(data['month'])
print(data['hour'])
'''
data.drop('timestamp', axis=1, inplace=True)

#print(data.shape)


def data_enhancement(data):
    
    gen_data = data
    
    for season in data['season'].unique():
        seasonal_data =  gen_data[gen_data['season'] == season]
        hum_mn = seasonal_data['hum'].mean()
        wind_speed_mn = seasonal_data['wind_speed'].mean()
        t1_mn = seasonal_data['t1'].mean()
        t2_mn = seasonal_data['t2'].mean()
        
        for i in gen_data[gen_data['season'] == season].index:
            if np.random.randint(2) == 1:
                gen_data['hum'].values[i] += hum_mn/10
            else:
                gen_data['hum'].values[i] -= hum_mn/10
                
            if np.random.randint(2) == 1:
                gen_data['wind_speed'].values[i] += wind_speed_mn/10
            else:
                gen_data['wind_speed'].values[i] -= wind_speed_mn/10
                
            if np.random.randint(2) == 1:
                gen_data['t1'].values[i] += t1_mn/10
            else:
                gen_data['t1'].values[i] -= t1_mn/10
                
            if np.random.randint(2) == 1:
                gen_data['t2'].values[i] += t2_mn/10
            else:
                gen_data['t2'].values[i] -= t2_mn/10

    return gen_data

print(data.head(3))
gen = data_enhancement(data)
print(gen.head(3) )

#print(gen.shape)

#final_data = data
y = data['cnt']
x = data.drop(['cnt'], axis=1)



#print(data.shape)




cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
num_vars = ['t1','t2','hum','wind_speed']


x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y,
                                    test_size=0.2,
                                    random_state=0  # Recommended for reproducibility
                                )



extra_sample = gen.sample(gen.shape[0] // 3)
x_train = pd.concat([x_train, extra_sample.drop(['cnt'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['cnt'] ])


transformer = preprocessing.PowerTransformer()
y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
y_val = transformer.transform(y_val.values.reshape(-1,1))



rang = abs(y_train.max()) - abs(y_train.min())

num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),
])

cat_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', preprocessing.OrdinalEncoder()) # handle_unknown='ignore' ONLY IN VERSION 0.24
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_classifiers = {
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100),
}
### END SOLUTION

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

for model_name, model in tree_classifiers.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_val)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_val, pred),
                              "MAB": metrics.mean_absolute_error(y_val, pred),
                              " % error": metrics.mean_squared_error(y_val, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)
### END SOLUTION


results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

print(results_ord)


print(y_train.max())
print(y_train.min())
print(y_val[3])
print(tree_classifiers['Random Forest'].predict(x_val)[3])


   cnt   t1   t2   hum  wind_speed  weather_code  is_holiday  is_weekend  \
0  182  3.0  2.0  93.0         6.0           3.0         0.0         1.0   
1  138  3.0  2.5  93.0         5.0           1.0         0.0         1.0   
2  134  2.5  2.5  96.5         0.0           1.0         0.0         1.0   

   season  year month hour  
0     3.0  2015    04   00  
1     3.0  2015    04   01  
2     3.0  2015    04   02  
   cnt        t1        t2         hum  wind_speed  weather_code  is_holiday  \
0  182  3.768695  1.428926   85.208868    7.712725           3.0         0.0   
1  138  3.768695  3.071074  100.791132    6.712725           1.0         0.0   
2  134  3.268695  1.928926  104.291132    1.712725           1.0         0.0   

   is_weekend  season  year month hour  
0         1.0     3.0  2015    04   00  
1         1.0     3.0  2015    04   01  
2         1.0     3.0  2015    04   02  


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  return f(*args, **kwargs)




  return f(*args, **kwargs)


Learning rate set to 0.381903
0:	learn: 0.9097188	total: 189ms	remaining: 18.7s
1:	learn: 0.8679010	total: 205ms	remaining: 10s
2:	learn: 0.8433973	total: 220ms	remaining: 7.13s
3:	learn: 0.8290488	total: 232ms	remaining: 5.57s
4:	learn: 0.8198865	total: 242ms	remaining: 4.6s
5:	learn: 0.8139700	total: 250ms	remaining: 3.92s
6:	learn: 0.8089893	total: 257ms	remaining: 3.42s
7:	learn: 0.8065389	total: 264ms	remaining: 3.04s
8:	learn: 0.8042793	total: 272ms	remaining: 2.75s
9:	learn: 0.8020470	total: 280ms	remaining: 2.52s
10:	learn: 0.7999499	total: 288ms	remaining: 2.33s
11:	learn: 0.7979965	total: 296ms	remaining: 2.17s
12:	learn: 0.7958510	total: 305ms	remaining: 2.04s
13:	learn: 0.7949176	total: 311ms	remaining: 1.91s
14:	learn: 0.7930065	total: 319ms	remaining: 1.81s
15:	learn: 0.7919385	total: 326ms	remaining: 1.71s
16:	learn: 0.7908960	total: 335ms	remaining: 1.64s
17:	learn: 0.7900497	total: 341ms	remaining: 1.55s
18:	learn: 0.7887126	total: 349ms	remaining: 1.49s
19:	learn: 0.7