In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, Ridge, LassoLars, MultiTaskLasso, MultiTaskElasticNet, LogisticRegression, SGDRegressor, PassiveAggressiveRegressor, HuberRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.neural_network import MLPRegressor
# TODO: check MultiOutputRegressor, HistGradientBoostingRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [8]:
# Candidate regressors iterated over by k_fold_testing_y1 / k_fold_testing_y2.
#
# Candidate sets tried earlier and currently disabled (kept for reference):
#   * LinearRegression, KNeighborsRegressor, DecisionTreeRegressor, Ridge,
#     LassoLars, MultiTaskLasso, BayesianRidge
#   * MultiOutputRegressor wrapping LinearRegression / BayesianRidge /
#     SGDRegressor / PassiveAggressiveRegressor / HuberRegressor, together
#     with Ridge, LassoLars, MultiTaskLasso, MultiTaskElasticNet
#   * MultiOutputRegressor wrapping AdaBoostRegressor / GradientBoostingRegressor
#
# The inner estimator is passed to BaggingRegressor positionally on purpose:
# the keyword name of that argument differs between scikit-learn releases.
models = [
    RandomForestRegressor(n_estimators=1000),
    ExtraTreesRegressor(n_estimators=1000),
    BaggingRegressor(RandomForestRegressor(n_estimators=1000)),
    BaggingRegressor(ExtraTreesRegressor(n_estimators=1000)),
]
# Column groups shared by the two training-set loaders below.
_FLAG_COLUMNS = ['is_oneway', 'is_weekend', 'is_holiday']
_BASE_FEATURE_COLUMNS = ['location_id', 'year', 'month', 'day', 'time_start_hour',
                         'time_start_min', 'time_end_hour', 'time_end_min', 'num_lanes',
                         'is_oneway', 'is_weekend', 'is_holiday']
_Y1_TARGET_COLUMNS = ['nx', 'sx', 'ex', 'wx']
_Y2_TARGET_COLUMNS = ['nb_r', 'nb_t', 'nb_l', 'sb_r', 'sb_t', 'sb_l',
                      'eb_r', 'eb_t', 'eb_l', 'wb_r', 'wb_t', 'wb_l']


def _load_shuffled_data(path, random_state=None):
    """Read the CSV at `path`, shuffle its rows and cast the flag columns to int."""
    data = shuffle(pd.read_csv(path), random_state=random_state)
    data[_FLAG_COLUMNS] = data[_FLAG_COLUMNS].astype(int)
    # NOTE(review): clean_dataset(data) used to be called here and was disabled
    # by the author; it is deliberately not re-enabled by this loader.
    return data


def get_training_data_y1(path = 'tmcs_2020_2029_final.csv', random_state = None):
    """Load the training set for the first model (targets nx, sx, ex, wx).

    Parameters
    ----------
    path : str
        CSV file to read with pandas.read_csv.
    random_state : int, RandomState instance or None
        Forwarded to sklearn.utils.shuffle. The default (None) keeps the
        previous behaviour: a different row order on every call.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        X holds the twelve base feature columns, y the four y1 target columns,
        both in the same shuffled row order.
    """
    data = _load_shuffled_data(path, random_state)
    return data[_BASE_FEATURE_COLUMNS], data[_Y1_TARGET_COLUMNS]


def get_training_data_y2(path = 'tmcs_2020_2029_final.csv', random_state = None):
    """Load the training set for the second model (the twelve nb_*/sb_*/eb_*/wb_* targets).

    The feature matrix is the base feature set followed by the four y1 target
    columns (nx, sx, ex, wx), i.e. the second model takes the first model's
    targets as inputs.

    Parameters
    ----------
    path : str
        CSV file to read with pandas.read_csv.
    random_state : int, RandomState instance or None
        Forwarded to sklearn.utils.shuffle. The default (None) keeps the
        previous behaviour: a different row order on every call.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        X holds the sixteen feature columns, y the twelve y2 target columns,
        both in the same shuffled row order.
    """
    data = _load_shuffled_data(path, random_state)
    return data[_BASE_FEATURE_COLUMNS + _Y1_TARGET_COLUMNS], data[_Y2_TARGET_COLUMNS]
    
def clean_dataset(df):
    """Return a float64 copy of `df` without rows containing NaN or +/-inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose values are all present and finite, cast to
        numpy.float64.

    Raises
    ------
    AssertionError
        If `df` is not a pandas DataFrame.
    ValueError or TypeError
        Propagated from `astype` when a remaining column cannot be converted
        to float64.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace=True so the caller's frame is left untouched.
    without_missing = df.dropna()
    # NaN rows are already gone, so only the infinities remain to be detected.
    has_infinite = without_missing.isin([np.inf, -np.inf]).any(axis=1)
    # Keep the rows that are NOT flagged; the previous version kept the flagged
    # rows, i.e. exactly the ones that were supposed to be removed.
    return without_missing[~has_infinite].astype(np.float64)

def _print_k_fold_scores(X, y, n_splits):
    """Fit every estimator in the module-level `models` list fold by fold and print its scores.

    For each model this prints the estimator repr, one `model.score` value per
    fold (R^2 for scikit-learn regressors), the mean over the folds and a
    blank line.
    """
    kf = KFold(n_splits = n_splits)
    for model in models:
        print(model)
        fold_scores = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            model.fit(X_train, y_train)
            # score() predicts internally, so no separate predict() call is
            # needed; the previous unused `yhat` doubled the prediction cost.
            fold_score = model.score(X_test, y_test)
            fold_scores.append(fold_score)
            print(fold_score)
        print('mean:', np.mean(fold_scores))
        print()


def k_fold_testing_y1(n_splits = 10):
    """Print k-fold scores of every model in `models` on the y1 training set.

    Parameters
    ----------
    n_splits : int
        Number of folds passed to KFold (default 10, as before).
    """
    X, y = get_training_data_y1()
    _print_k_fold_scores(X, y, n_splits)


def k_fold_testing_y2(n_splits = 10):
    """Print k-fold scores of every model in `models` on the y2 training set.

    Parameters
    ----------
    n_splits : int
        Number of folds passed to KFold (default 10, as before).
    """
    X, y = get_training_data_y2()
    _print_k_fold_scores(X, y, n_splits)

def learning_y1():
    """Fit a DecisionTreeRegressor on the whole y1 training set and return it.

    Features and targets come from get_training_data_y1() with its default
    path; no rows are held out.
    """
    features, targets = get_training_data_y1()
    # fit() returns the estimator itself, so the fitted tree can be returned directly.
    return DecisionTreeRegressor().fit(features, targets)
    
def learning_y2():
    """Fit a DecisionTreeRegressor on the whole y2 training set, print its score and return it.

    Features and targets come from get_training_data_y2() with its default
    path. The printed score is computed on the same rows the tree was fitted
    on, so it is a training score rather than a held-out estimate.
    """
    features, targets = get_training_data_y2()
    fitted_tree = DecisionTreeRegressor().fit(features, targets)
    training_score = fitted_tree.score(features, targets)
    print(training_score)
    return fitted_tree

In [10]:
# Train the y1 model on the full data set; the prediction cell further down
# reads the resulting `model_y1`.
model_y1 = learning_y1()

# The two separator lines bracket the evaluation runs. Both runs are disabled
# at the moment (k_fold_testing_y2() after the first separator, learning_y2()
# after the second), so the separators print back to back.
print('-------------------', '-------------------', sep='\n')

'''
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
0.8389202321152913
0.9219960550444997
0.9099270646895258
0.9107988009731715
0.9180655438990909
0.91331827967539
0.9197872931568205
0.9158346612357587
0.9182437506698281
0.891985484982969

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=1000, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)
0.8277849393514694
0.9130999763862659
0.8920927781009269
0.8953023938995828
0.9017908121737117
0.90418591566778
0.9096446811449088
0.9048187583213289
0.9020312551047499
0.8465431698377857

BaggingRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                      ccp_alpha=0.0,
                                                      criterion='mse',
                                                      max_depth=None,
                                                      max_features='auto',
                                                      max_leaf_nodes=None,
                                                      max_samples=None,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0.0,
                                                      n_estimators=1000,
                                                      n_jobs=None,
                                                      oob_score=False,
                                                      random_state=None,
                                                      verbose=0,
                                                      warm_start=False),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
                 random_state=None, verbose=0, warm_start=False)
0.8357943653304685
0.9158640144829165
0.9138805287111039
0.9133672217166536
0.9204082464638553
0.910741820290316
0.9246540626804408
0.9159515422752131
0.9182841218127582
0.89423645366497

BaggingRegressor(base_estimator=ExtraTreesRegressor(bootstrap=False,
                                                    ccp_alpha=0.0,
                                                    criterion='mse',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=1000,
                                                    n_jobs=None,
                                                    oob_score=False,
                                                    random_state=None,
                                                    verbose=0,
                                                    warm_start=False),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
                 random_state=None, verbose=0, warm_start=False)
0.8022906338500218
0.8777896853552941
0.871910135274951
0.8668597470780979
0.8706915907398458
0.8686419664606684
0.8862431927022101
0.8661560167687659
0.8726266474801763
0.839323249254482

-------------------
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
0.9476959446962377
0.9319545133139047
0.9347736547228042
0.9531819521993615
0.9253822167744326
0.9408960127392724
0.9490084777949324
0.940941442639825
0.9377650039051464
0.841517613834837

0.9303

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=1000, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)
0.9394539702078839
0.934357538363619
0.9406426130831398
0.9555355565094684
0.9387512365496563
0.938743964056166
0.9508982768102372
0.9463651434246305
0.9441187461270064
0.8432743383740268

0.933214

BaggingRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                      ccp_alpha=0.0,
                                                      criterion='mse',
                                                      max_depth=None,
                                                      max_features='auto',
                                                      max_leaf_nodes=None,
                                                      max_samples=None,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0.0,
                                                      n_estimators=1000,
                                                      n_jobs=None,
                                                      oob_score=False,
                                                      random_state=None,
                                                      verbose=0,
                                                      warm_start=False),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
                 random_state=None, verbose=0, warm_start=False)
0.94093668757496
0.9387848744411116
0.9405856514066647
0.8314417493724418
0.9364375284623321
0.9337620301837892
0.9157488325453332
0.9428460052342361
0.9260227397672935
0.9130257980675334

0.92195

BaggingRegressor(base_estimator=ExtraTreesRegressor(bootstrap=False,
                                                    ccp_alpha=0.0,
                                                    criterion='mse',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=1000,
                                                    n_jobs=None,
                                                    oob_score=False,
                                                    random_state=None,
                                                    verbose=0,
                                                    warm_start=False),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
                 random_state=None, verbose=0, warm_start=False)
0.948149858045955
0.9472576798180564
0.9508063648981041
0.8391299957394509
0.9428548481345627
0.941294446387301
0.9111549446222014
0.9484452974444105
0.9331340410162463
0.9218467561709053
'''
#location_id  year  month  day  time_start_hour  time_start_min time_end_hour  time_end_min  num_lanes  is_oneway  is_weekend is_holiday
# 38153  2020      8   27              8.0             0.0            8.0          15.0          2          0           0        0

-------------------
-------------------


"\nRandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',\n                      max_depth=None, max_features='auto', max_leaf_nodes=None,\n                      max_samples=None, min_impurity_decrease=0.0,\n                      min_impurity_split=None, min_samples_leaf=1,\n                      min_samples_split=2, min_weight_fraction_leaf=0.0,\n                      n_estimators=1000, n_jobs=None, oob_score=False,\n                      random_state=None, verbose=0, warm_start=False)\n0.8389202321152913\n0.9219960550444997\n0.9099270646895258\n0.9107988009731715\n0.9180655438990909\n0.91331827967539\n0.9197872931568205\n0.9158346612357587\n0.9182437506698281\n0.891985484982969\n\nExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',\n                    max_depth=None, max_features='auto', max_leaf_nodes=None,\n                    max_samples=None, min_impurity_decrease=0.0,\n                    min_impurity_split=None, min_samples_leaf=1,\n            

In [19]:
# One hand-built observation. The keys follow the column order of the X frame
# returned by get_training_data_y1().
# Same row in positional form: [38153, 2020, 8, 27, 8.0, 0.0, 8.0, 15.0, 2, 0, 0, 0]
x = pd.DataFrame(
    dict(
        location_id=38153,
        year=2020,
        month=8,
        day=27,
        time_start_hour=8.0,
        time_start_min=0.0,
        time_end_hour=8.0,
        time_end_min=15.0,
        num_lanes=2,
        is_oneway=0,
        is_weekend=0,
        is_holiday=0,
    ),
    index=[0],
)
print(x)

# predict() returns one row per input row; print the first output column of
# the single row (presumably 'nx', the first y1 target - confirm against the
# model that was actually fitted).
pred = model_y1.predict(x)
print(pred[0, 0])

   location_id  year  month  day  time_start_hour  time_start_min  \
0        38153  2020      8   27              8.0             0.0   

   time_end_hour  time_end_min  num_lanes  is_oneway  is_weekend  is_holiday  
0            8.0          15.0          2          0           0           0  
5.0
