In [22]:
import pandas as pd
import numpy as np
import os
path = os.getcwd()

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score

In [23]:
train = pd.read_csv(path + "\\TRAIN.csv")
test = pd.read_csv(path + "\\TEST.csv")
ss = pd.read_csv(path + "\\sample_submission.csv")

In [24]:
train.drop('index', inplace=True, axis = 1)
test.drop('index', inplace=True, axis = 1)

In [25]:
train['time_stamp'] = pd.to_datetime(train['time_stamp'], unit = 'ms')
test['time_stamp'] = pd.to_datetime(test['time_stamp'], unit = 'ms')

In [26]:
# Cross Validation for Boosting
def cross_val(regressor, train, test, features, name, splits):
    N_splits = splits
    
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    stratified_target = pd.qcut( train[target], 10, labels=False, duplicates='drop')
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n================================Fold{index + 1}===================================')
        
        #### Train Set
        X_trn, y_trn = train[features].iloc[trn_idx], train[target].iloc[trn_idx]
        
        #### Validation Set
        X_val, y_val = train[features].iloc[val_idx], train[target].iloc[val_idx]
        
        #### Test Set
        X_test = test[features]
        
        if name != 'cat':
            #### Scaling Data ####
            scaler = StandardScaler()
            _ = scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
        
        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
        
        ############ Predicting #############
        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))
        
        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Log Mean Squared Error for Validation set is : {error}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
        
    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    print(f'\n\Root Log Mean Squared Error for oofs is {total_error}')
    
    return oofs, preds

In [27]:
# Cross Validation for Boosting
def normal_cross_val(regressor, train, test, features, name, splits):
    N_splits = splits
    
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    stratified_target = pd.qcut( train[target], 10, labels=False, duplicates='drop')
    ls_1 = [52,59,60,62,82,202,264,270,363,459,460,641,672,723,1057,1067,1068,1200,1243,1358,1448,1814,1852,1854,1954,2042,2197,2211,2212,2214,2292,2402,2452,2659,2745,2780,2875,2876,2877,3007,3115,3211,3399,3499,3576,3579,3581,3582,3719,3753,3778,3780,3879,3881,3882,3883,4224,4502,4677,4685,4700,4723,4730,4731,4814,4815,4886,4887,4912,4954,5094,5169,5175,5178,5179,5189,5209,5326,5359,5364,5546,5658,6105,6106,6107,6108,6271,6272,6378,6390,6540,6581,6772,7067,7069,7230,7231,7294,7306,7386,7388,7389,7521,7583,7781,7894,8002,8003,8005,8006,8177,8316,8453,8455,8529,8722,8739,9081,9086,9231,9845,9905,9906,9919,9947,10217,10708,11085,11105,11107,11236,11320,11404,11443,11444,11451,11723,11902,11990,11991,12194,12320,12344,12440,12483,12485,12486,12511,12618,12619,12621,12625,12793,12923,12932,12933,13293,13597,13650,13696,13701,13739,13758,13759,13813,14039,14221,14243,14378,14381,14383,14384,14426,14432,14646,14837,15072,15249,15417,15418,15419,15422,15423,15498,15500,15659,15663,15688,15723,15749,15840,15989,16087,16104,16344,16348,16366,16681,16689,16723,16726,16728,16885,17115,17128,17129,17231,17327,17485,17679,17683,17712,17863,17869,17870,17871,17885,17886,18275,18296,18298,18468,18576,18618,18619,18620,18622,18691,18800,18801,18838,18855,18925,18926,18939,18940,18942,18983,18986,19171,19334,19377,19445,19653,19654,19655,19795,19816,19817,19818,19919,19943,20004,20008,20147,20149,20150,20151,20152,20290,20354,20553,20568,20598,20626,20727,20731,20787,20958,20959,20961,20967,21145,21217,21463,21618,21900,22150,22200,22202,22300,22302,22319,22321,22322,22407,22424,22578,22582,23100,23111,23275,23276,23277,23307,23388,23389,23391,23392,23417,23431,23731,23732,23734,23749,23750,23811,23872,23876,24444,24499,24526,24732,24789,24830]
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n================================================Fold{index + 1}================================================')
        
        #### Train Set
        X_trn, y_trn = train[features].iloc[trn_idx], train[target].iloc[trn_idx]
        
        #### Validation Set
        X_val, y_val = train[features].iloc[val_idx], train[target].iloc[val_idx]
        
        #### Test Set
        X_test = test[features]
        
        if name != 'cat':
            #### Scaling Data ####
            scaler = StandardScaler()
            _ = scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
        
        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn)
        
        ############ Predicting #############
        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))
        
        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Log Mean Squared Error for Validation set is : {error}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
    if name == 'lr':
        preds.iloc[ls_1] = preds.iloc[ls_1]+5
    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    print(f'\n\Root Log Mean Squared Error for oofs is {total_error}')
    
    return oofs, preds

In [28]:
# Test Index for Uber and lyft
uber_index = test[test['cab_provider'] == 'Uber'].index
lyft_index = test[test['cab_provider'] == 'Lyft'].index

In [29]:
df = pd.concat([train, test], axis = 0).reset_index(drop=True)
df['time_stamp'] = pd.to_datetime(df['time_stamp'], unit = 'ms')

In [30]:
df['path'] = df.apply(lambda x: x['source'] + '_' + x['destination'], axis = 1)

In [31]:
train['fare/dis'] = train['fare']/train['distance']

per_km_cab_type = train.groupby(['source', 'destination', 'cab_type'])['fare/dis'].median().to_dict()
train.drop('fare/dis', axis = 1, inplace=True)

In [32]:
df['multiplier'] = df.apply(lambda x: per_km_cab_type[(x['source'], x['destination'], x['cab_type'])], axis = 1)

In [33]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
le = LabelEncoder()
sl = MinMaxScaler()

In [34]:
dict_ = {'Theatre District': 1, 'Fenway': 2, 'Beacon Hill': 3,
        'North End': 4, 'Northeastern University': 5, 'Financial District': 6,
        'Boston University': 7, 'Haymarket Square': 8, 'West End': 9,
        'South Station': 10, 'North Station': 11, 'Back Bay': 12}

df['source'] = df['source'].map(dict_)
df['destination'] = df['destination'].map(dict_) 

In [35]:
label_cols = ['cab_provider', 'cab_type','path']

df[label_cols] = df[label_cols].apply(le.fit_transform)

In [36]:
lyft_size = train[train['cab_provider'] == 'Lyft'].shape[0] # size of lyft in train

df_lyft = df[df['cab_provider'] == 0].copy() # Lyft Dataset

In [37]:
# Splitting Lyft into train and test
train_lyft, test_lyft = df_lyft[:lyft_size], df_lyft[lyft_size:].reset_index(drop=True)

In [38]:
target = 'fare'
time = 'time_stamp'
time_feats = ['hour', 'year', 'minute', 'day','dayofweek', 'month', 'weekday']
provider = 'cab_provider'

features = [col for col in df.columns if col not in [target, time, provider]]

In [39]:
params = {'n_estimators': 643, 'learning_rate': 0.3205115338225746, 'depth': 7, 
          'reg_lambda': 9.470288364364613, 'subsample': 0.7905189237698941}
cat = CatBoostRegressor(random_state=1999,verbose=False , **params)


params = {'n_estimators': 831, 'learning_rate': 0.09832522155935687, 'num_leaves': 152, 'max_depth': 7, 'reg_alpha': 5.284654787743608, 
          'reg_lambda': 0.10004930753366548, 'colsample_bytree': 0.9495225618757219}
lgb = LGBMRegressor(random_state=1999, **params)

params = {'alpha': 7, 'lambda': 3.004343629746868, 'n_estimators': 578, 
          'learning_rate': 0.14246637122353223, 'max_depth': 7, 'colsample_bytree': 0.7955419063856055}
xgb = XGBRegressor(random_state=1999, **params)

bag_cat = BaggingRegressor(base_estimator=cat, verbose=0)
bag_lgb = BaggingRegressor(base_estimator=lgb, verbose=0)
bag_xgb = BaggingRegressor(base_estimator=xgb, verbose=0)

In [None]:
%%time
cat_oofs, cat_preds = cross_val(cat, train_lyft, test_lyft, features, 'cat', 15)

In [40]:
%%time
lgb_oofs, lgb_preds = cross_val(lgb, train_lyft, test_lyft, features, 'lgb', 15)



 Root Log Mean Squared Error for Validation set is : 1.3331563018347368


 Root Log Mean Squared Error for Validation set is : 1.3192832616400145


 Root Log Mean Squared Error for Validation set is : 1.3531281254429068


 Root Log Mean Squared Error for Validation set is : 1.3621997500311955


 Root Log Mean Squared Error for Validation set is : 1.2723336631805902


 Root Log Mean Squared Error for Validation set is : 1.3622639256988411


 Root Log Mean Squared Error for Validation set is : 1.3837272643075582


 Root Log Mean Squared Error for Validation set is : 1.3378145201110148


 Root Log Mean Squared Error for Validation set is : 1.2945493042714686


 Root Log Mean Squared Error for Validation set is : 1.3492709111195085


 Root Log Mean Squared Error for Validation set is : 1.3239021941625835


 Root Log Mean Squared Error for Validation set is : 1.3701376027414605


 Root Log Mean Squared Error for Validation set is : 1.3123227280084435


 Root Log Mean Squared Error for Val

In [None]:
%%time
xgb_oofs, xgb_preds = cross_val(xgb, train_lyft, test_lyft, features, 'xgb', 15)

In [None]:
%%time
bag_cat_oofs, bag_cat_preds= normal_cross_val(bag_cat, train_lyft, test_lyft, features, 'bag_cat', 15)

In [None]:
%%time
bag_lgb_oofs, bag_lgb_preds = normal_cross_val(bag_lgb, train_lyft, test_lyft, features, 'bag_lgb', 15)

In [None]:
%%time
bag_xgb_oofs, bag_xgb_preds = normal_cross_val(bag_xgb, train_lyft, test_lyft, features, 'bag_xgb', 15)

In [41]:
train_new = train_lyft[[target, 'time_stamp']].copy()
test_new = test_lyft[[target, 'time_stamp']].copy()

train_new['lgb'] = lgb_oofs
test_new['lgb'] = lgb_preds

# train_new['cb'] = cat_oofs
# test_new['cb'] = cat_preds

# train_new['xgb'] = xgb_oofs
# test_new['xgb'] = xgb_preds

# train_new['bag_cat'] = bag_cat_oofs
# test_new['bag_cat'] = bag_cat_preds

# train_new['bag_lgb'] = bag_lgb_oofs
# test_new['bag_lgb'] = bag_lgb_preds

# train_new['bag_xgb'] = bag_xgb_oofs
# test_new['bag_xgb'] = bag_xgb_preds

ens_features = [c for c in train_new.columns if c not in [target, 'time_stamp']]

In [42]:
from sklearn.linear_model import Ridge

from sklearn.linear_model import LinearRegression
model = LinearRegression()

ens_linear_oofs, ens_linear_preds = normal_cross_val(model, train_new, test_new, ens_features, 'cat', 15)



 Root Log Mean Squared Error for Validation set is : 1.3332919977259716


 Root Log Mean Squared Error for Validation set is : 1.3195359694576299


 Root Log Mean Squared Error for Validation set is : 1.3532582675095934


 Root Log Mean Squared Error for Validation set is : 1.3622209019150924


 Root Log Mean Squared Error for Validation set is : 1.2723261582776932


 Root Log Mean Squared Error for Validation set is : 1.3632679781935795


 Root Log Mean Squared Error for Validation set is : 1.3837183873622894


 Root Log Mean Squared Error for Validation set is : 1.337811798000872


 Root Log Mean Squared Error for Validation set is : 1.2946156573806522


 Root Log Mean Squared Error for Validation set is : 1.3492846953766546


 Root Log Mean Squared Error for Validation set is : 1.3243638772114337


 Root Log Mean Squared Error for Validation set is : 1.3702239400727814


 Root Log Mean Squared Error for Validation set is : 1.3123085269976753


 Root Log Mean Squared Error for Vali

In [None]:
test['Predictions'] = None
test.loc[lyft_index, 'Predictions'] = ens_linear_preds

In [None]:
uber_size = train[train['cab_provider'] == 'Uber'].shape[0]

df_uber = df[df['cab_provider'] == 1].copy()

In [None]:
train_uber, test_uber = df_uber[:uber_size], df_uber[uber_size:].reset_index(drop=True)

In [None]:
params = {'n_estimators': 916, 'learning_rate': 0.033403127409740704, 'depth': 7, 
          'reg_lambda': 2.4878801598813713, 'subsample': 0.6980227022422014}
cat = CatBoostRegressor(random_state=1999,verbose=False, **params)


params = {'n_estimators': 531, 'learning_rate': 0.020073364554055173, 'num_leaves': 909, 'max_depth': 7, 'reg_alpha': 8.791558407597256, 
          'reg_lambda': 2.87263268971904, 'colsample_bytree': 0.6660164022754701}
lgb = LGBMRegressor(random_state=1999, **params)

params = {'alpha': 3, 'lambda': 3.4282608213045944, 'n_estimators': 527, 
          'learning_rate': 0.021611335825661777, 'max_depth': 6, 'colsample_bytree': 0.8691510349197396}
xgb = XGBRegressor(random_state=1999, **params)

bag_cat = BaggingRegressor(base_estimator=cat, verbose=0)
bag_lgb = BaggingRegressor(base_estimator=lgb, verbose=0)
bag_xgb = BaggingRegressor(base_estimator=xgb, verbose=0)

In [None]:
%%time
lgb_oofs, lgb_preds = cross_val(lgb, train_uber, test_uber, features, 'lgb', 10)

In [None]:
%%time
cat_oofs, cat_preds = cross_val(cat, train_uber, test_uber, features, 'cat', 10)

In [None]:
%%time
xgb_oofs, xgb_preds = cross_val(xgb, train_uber, test_uber, features, 'xgb', 10)

In [None]:
%%time
bag_cat_oofs, bag_cat_preds = normal_cross_val(bag_cat, train_uber, test_uber, features, 'bag_cat', 10)

In [None]:
%%time
bag_lgb_oofs, bag_lgb_preds = normal_cross_val(bag_lgb, train_uber, test_uber, features, 'bag_lgb', 10)

In [None]:
%%time
bag_xgb_oofs, bag_xgb_preds = normal_cross_val(bag_xgb, train_uber, test_uber, features, 'bag_xgb', 10)

In [None]:
train_new = train_uber[[target, 'time_stamp']].copy()
test_new = test_uber[[target, 'time_stamp']].copy()

train_new['lgb'] = lgb_oofs
test_new['lgb'] = lgb_preds

train_new['cb'] = cat_oofs
test_new['cb'] = cat_preds

train_new['xgb'] = xgb_oofs
test_new['xgb'] = xgb_preds

train_new['bag_cat'] = bag_cat_oofs
test_new['bag_cat'] = bag_cat_preds

train_new['bag_lgb'] = bag_lgb_oofs
test_new['bag_lgb'] = bag_lgb_preds

train_new['bag_xgb'] = bag_xgb_oofs
test_new['bag_xgb'] = bag_xgb_preds

ens_features = [c for c in train_new.columns if c not in [target, 'time_stamp']]

In [None]:
model = LinearRegression()

ens_linear_oofs, ens_linear_preds = normal_cross_val(model, train_new, test_new, ens_features, 'lr', 10)

In [None]:
test.loc[uber_index, 'Predictions'] = ens_linear_preds

In [1]:
a=[[1,2],[1,2,3]]
b=[[1,2]]


In [3]:
import numpy as np



  return array(a, dtype, copy=False, order=order, subok=True)


array([], dtype=int32)

In [5]:
np.setdiff1d(a,b)

array([list([1, 2]), list([1, 2, 3])], dtype=object)

In [9]:
z=([1, 2])
type(z)

list