In [1]:
from functions import *
from settings import *
from pipeline import *

%store -r __RequiredPackages
%store -r __JupyterOptions

In [2]:
__RequiredPackages

In [3]:
__JupyterOptions

Populating the interactive namespace from numpy and matplotlib


In [4]:
def pca_testing_pipe(data, keep_features, target, n_components, cont_impute_cols=cont_impute_cols, 
                     cat_impute_cols=cat_impute_cols, mean_enc_cols=mean_enc_cols, dev_seed=dev_seed):
    """
    Does all the preprocessing steps,
    uses cross validation to test the performance of several models.
    Used to quickly compare different sets of features
    
    Note: Make sure the indexes are in order without gaps
    """ 
    # Create different splits of train and test
    kf = KFold(n_splits=5, shuffle=True, random_state=dev_seed)
    
    # Initiate empty object for further analysis
    pred_perf_dict = {'LinearRegression': [], 'KNeighborsRegressor': [], 'RandomForestRegressor': [],
                      'GradientBoostingRegressor': [], 'SVR': []}
    predictions_df = pd.DataFrame({'obs_nr': data.index})
    cv_round = 0
    
    # Split features and target
    X = data.drop(target, axis=1)
    y = data[target]
    
    # Fit models while looping through the train/test-splits
    for train_index, test_index in kf.split(X): 
        cv_round  += 1
        
        X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
        y_train, y_test = y.values[train_index], y.values[test_index]
        
        # Prepare features based on target variable in testset
        prepper = leakage_preventive_preprocessing_function(target, cont_impute_cols, cat_impute_cols, 
                                                            mean_enc_cols, keep_features)
        pca = sklearnPCA(n_components=n_components)
        X_train = prepper.fit_transform(X_train, y_train)
        X_train = pca.fit_transform(X_train)
        X_test = prepper.transform(X_test)
        X_test = pca.transform(X_test)
        
        # Fit models
        pred_perf_dict, predictions_df = fit_model('LinearRegression', X_train, y_train, X_test, y_test, test_index, 
                                                   pred_perf_dict, predictions_df, cv_round)
        pred_perf_dict, predictions_df = fit_model('KNeighborsRegressor', X_train, y_train, X_test, y_test, test_index, 
                                                   pred_perf_dict, predictions_df, cv_round)
        pred_perf_dict, predictions_df = fit_model('RandomForestRegressor', X_train, y_train, X_test, y_test, test_index, 
                                                   pred_perf_dict, predictions_df, cv_round)
        pred_perf_dict, predictions_df = fit_model('GradientBoostingRegressor', X_train, y_train, X_test, y_test, test_index, 
                                                   pred_perf_dict, predictions_df, cv_round)
        pred_perf_dict, predictions_df = fit_model('SVR', X_train, y_train, X_test, y_test, test_index, 
                                                   pred_perf_dict, predictions_df, cv_round)

        #print('Completed predicting round {}'.format(cv_round))
        
    return pred_perf_dict, predictions_df
        
def fit_model(model_name, X_train, y_train, X_test, y_test, test_index, 
              pred_perf_dict, predictions_df, cv_round, n_jobs=n_jobs, dev_seed=dev_seed):
    if model_name == 'LinearRegression':
        model = LinearRegression(n_jobs=n_jobs).fit(X_train, y_train)
    elif model_name == 'KNeighborsRegressor':  
        model = neighbors.KNeighborsRegressor(n_neighbors = 7, n_jobs=n_jobs).fit(X_train, y_train)
    elif model_name == 'RandomForestRegressor':
        model = RandomForestRegressor(n_estimators=50, random_state=dev_seed, n_jobs=n_jobs).fit(X_train, y_train)
    elif model_name == 'GradientBoostingRegressor':
        model = GradientBoostingRegressor(random_state=dev_seed).fit(X_train, y_train)
    elif model_name == 'SVR':
        model = SVR(kernel = 'rbf').fit(X_train, y_train)
    
    preds = model.predict(X_test)
    pred_perf_dict[model_name].append(sqrt(mean_squared_error(y_test, preds)))
    temp_preds = pd.DataFrame({'obs_nr': test_index, model_name + str(cv_round): preds})
    predictions_df = predictions_df.merge(temp_preds, how='left', on='obs_nr')
    
    return pred_perf_dict, predictions_df

In [5]:
def show_performance(pred_perf_dict):
    return pd.DataFrame(pd.DataFrame(pred_perf_dict).mean(axis=0), columns=['rmsle'])

In [28]:
def hyperparam_testing_pipe(data, keep_features, square_features, target, model, cont_impute_cols=cont_impute_cols, 
                            cat_impute_cols=cat_impute_cols, mean_enc_cols=mean_enc_cols, dev_seed=dev_seed):
    """
    Does all the preprocessing steps,
    uses cross validation to test the performance of several models.
    Used to quickly compare different sets of features
    
    Note: Make sure the indexes are in order without gaps
    """
    # Create different splits of train and test
    kf = KFold(n_splits=5, shuffle=True, random_state=dev_seed)
    
    # Initiate empty object for further analysis
    pred_perf_dict = []
    cv_round = 0
    
    # Split features and target
    X = data.drop(target, axis=1)
    y = data[target]
    
    # Fit models while looping through the train/test-splits
    for train_index, test_index in kf.split(X): 
        cv_round+= 1
        
        X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
        y_train, y_test = y.values[train_index], y.values[test_index]
        
        # Prepare features based on target variable in testset
        prepper = leakage_preventive_preprocessing_function(target, cont_impute_cols, cat_impute_cols, 
                                                            mean_enc_cols, square_features, keep_features)
        X_train = prepper.fit_transform(X_train, y_train)
        X_test = prepper.transform(X_test)
        
        model = model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        pred_perf_dict.append(sqrt(mean_squared_error(y_test, preds)))
        
    return np.mean(pred_perf_dict)

In [80]:
def ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats,
                            cont_impute_cols=cont_impute_cols, cat_impute_cols=cat_impute_cols, 
                            mean_enc_cols=mean_enc_cols, dev_seed=dev_seed):
    """
    Does all the preprocessing steps,
    uses cross validation to test the performance of several models ensambled.
    Used to test whether models together can do better performance than a seperate model
    """
    # Check model weights
    if sum(model_weights) != 1:
        raise AssertionError ("model weights does not sum to 1")
        
    # Create different splits of train and test
    kf = KFold(n_splits=5, shuffle=True, random_state=dev_seed)
    
    # Initiate empty object for further analysis
    pred_perf_dict = []
    cv_round = 0
    
    # Split features and target
    X = data.drop(target, axis=1)
    y = data[target]
    
    # Fit models while looping through the train/test-splits
    for train_index, test_index in kf.split(X): 
        cv_round+= 1
        
        X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
        y_train, y_test = y.values[train_index], y.values[test_index]
        
        # Prepare features based on target variable in testset
        prepper = leakage_preventive_preprocessing_function(target, cont_impute_cols, cat_impute_cols, 
                                                            mean_enc_cols, square_features, keep_features)
        X_train = prepper.fit_transform(X_train, y_train)
        X_test = prepper.transform(X_test)
        
        # Make predictions
        preds_list = []
        
        for index, model in enumerate(models):
            if use_sq_feats[index]:
                model = model.fit(X_train, y_train)
                preds = model.predict(X_test)
            else:
                model = model.fit(X_train[keep_features], y_train)
                preds = model.predict(X_test[keep_features])
            
            preds_list.append(preds)
            
        final_preds = np.sum([a*b for a, b in zip(preds_list, model_weights)], axis=0)
        
        pred_perf_dict.append(sqrt(mean_squared_error(y_test, final_preds)))
        
    return np.mean(pred_perf_dict)

In [7]:
# Import data
data = pd.read_csv('train.csv')

In [8]:
# Standard preprocessing
data = standard_preprocessing_function(data, NA_means_not_there_cols, scale_cont_cols)

In [14]:
# Preview of final dataset (do actual possible data leakage preprocessing steps in cv)
X = data.drop(target, axis=1)
y = data[target]

prepper = leakage_preventive_preprocessing_function(target, cont_impute_cols, cat_impute_cols, mean_enc_cols, 
                                                    square_features, keep_features)
example_X = prepper.fit_transform(X, y)

example_X.head().T

Unnamed: 0,0,1,2,3,4
OverallQual,0.5,0.0,0.5,0.5,1.0
LogTotalSFInclBsmnt,0.091839,0.050782,0.223999,0.0,0.749937
GarageCars,0.0,0.0,0.0,1.0,1.0
ME_Neighborhood,0.203806,0.708166,0.203806,0.323984,1.635183
LogYardArea,-0.227545,0.006945,0.448811,0.025266,0.939392
AllBathsSum,2.0,1.0,2.0,0.0,2.0
YearBuilt,0.663043,0.076087,0.619565,-1.25,0.597826
YearRemodAdd,0.243243,-0.486486,0.216216,-0.648649,0.162162
ME_KitchenQual,1.0,0.0,1.0,1.0,1.0
ME_MSSubClass,0.809783,0.0,0.809783,-0.248567,0.809783


In [None]:
pred_perf_dict, predictions_df = pca_testing_pipe(data, keep_features, target, 2)
preds = show_performance(pred_perf_dict)
preds.columns = ['pca' + str(2)]

for i in range(3, 14):
    pred_perf_dict, predictions_df = pca_testing_pipe(data, keep_features, target, i)
    preds_ = show_performance(pred_perf_dict)
    preds_.columns = ['pca' + str(i)]
    preds = pd.concat([preds, preds_], axis=1)
preds

# Does not improve performance

In [None]:
### Linear models

In [17]:
model = LinearRegression(n_jobs=n_jobs)
hyperparam_testing_pipe(data, keep_features, target, model) 

0.12916725195510975

In [60]:
for alpha in [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 50]:
    model = Ridge(alpha=alpha)
    rmsle = hyperparam_testing_pipe(data, keep_features, square_features, target, model)
    print('For alpha {}, performance in rmsle is {}'.format(alpha, rmsle))

For alpha 1, performance in rmsle is 0.12911263849690321
For alpha 2, performance in rmsle is 0.129081112877603
For alpha 3, performance in rmsle is 0.1290652808581956
For alpha 4, performance in rmsle is 0.12906078002459426
For alpha 5, performance in rmsle is 0.12906476514163862
For alpha 6, performance in rmsle is 0.1290752557200729
For alpha 7, performance in rmsle is 0.129090812366338
For alpha 8, performance in rmsle is 0.1291103574972633
For alpha 10, performance in rmsle is 0.12915830151244628
For alpha 20, performance in rmsle is 0.1294759120106948
For alpha 50, performance in rmsle is 0.13050359978086087


In [22]:
for alpha in [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, ]:
    model = Lasso(alpha=alpha)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    print('For alpha {}, performance in rmsle is {}'.format(alpha, rmsle))

For alpha 1e-06, performance in rmsle is 0.12916697660788878
For alpha 1e-05, performance in rmsle is 0.12916540835413604
For alpha 0.0001, performance in rmsle is 0.12916881418735687
For alpha 0.001, performance in rmsle is 0.12973300951694586
For alpha 0.01, performance in rmsle is 0.13499923873928113
For alpha 0.1, performance in rmsle is 0.21883561078265173


In [19]:
for nb in range(4, 13):
    model = neighbors.KNeighborsRegressor(n_neighbors = nb, n_jobs=n_jobs)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    print('For {} neighbors, performance in rmsle is {}'.format(nb, rmsle))

For 4 neighbors, performance in rmsle is 0.1565119913840703
For 5 neighbors, performance in rmsle is 0.15420557833043014
For 6 neighbors, performance in rmsle is 0.1548898679364827
For 7 neighbors, performance in rmsle is 0.1538968116257658
For 8 neighbors, performance in rmsle is 0.15332970968927212
For 9 neighbors, performance in rmsle is 0.15253614418976208
For 10 neighbors, performance in rmsle is 0.15241538109159114
For 11 neighbors, performance in rmsle is 0.15284205310970975
For 12 neighbors, performance in rmsle is 0.15358698264410606


In [26]:
3*3*3*3*1

81

In [13]:
param_space = {'max_depth': [5, 8, 11],
               'min_samples_split': [4, 8, 16],
               'max_features': [14, 11, 7],
               'max_samples': [.99, 0.75, 0.5],
               #'n_estimators': [50, 100]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

In [16]:
for params in range(len(params_grid)):
    model = RandomForestRegressor(n_estimators=50, 
                                  max_depth=params_grid.loc[params, 'max_depth'],
                                  min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                  max_features=params_grid.loc[params, 'max_features'],
                                  max_samples=params_grid.loc[params, 'max_samples'],
                                  random_state=dev_seed, n_jobs=n_jobs)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    print('For params {}, performance in rmsle is {}'.format(params_grid.iloc[params,:].values, rmsle))
print('Done')

For params [ 5.          4.         14.          0.99        0.15150489], performance in rmsle is 0.1515048926738119
For params [ 5.          4.         14.          0.75        0.15018141], performance in rmsle is 0.15018140635713362
For params [ 5.          4.         14.          0.5         0.14870551], performance in rmsle is 0.1487055089306315
For params [ 5.          4.         11.          0.99        0.14821685], performance in rmsle is 0.14821685334768003
For params [ 5.          4.         11.          0.75        0.14712873], performance in rmsle is 0.14712873449583347
For params [ 5.          4.         11.          0.5         0.14579189], performance in rmsle is 0.1457918891822352
For params [5.        4.        7.        0.99      0.1454575], performance in rmsle is 0.14545749882945305
For params [5.        4.        7.        0.75      0.1458921], performance in rmsle is 0.14589209580714074
For params [5.         4.         7.         0.5        0.14547455], performanc

For params [11.          8.          7.          0.5         0.13475839], performance in rmsle is 0.1347583856989704
For params [11.         16.         14.          0.99        0.14103399], performance in rmsle is 0.1410339864473404
For params [11.         16.         14.          0.75        0.14104119], performance in rmsle is 0.14104119061718684
For params [11.         16.         14.          0.5         0.14190857], performance in rmsle is 0.14190857251509728
For params [11.         16.         11.          0.99        0.13789453], performance in rmsle is 0.1378945319053339
For params [11.         16.         11.          0.75        0.13894485], performance in rmsle is 0.1389448499177445
For params [11.         16.         11.          0.5         0.13968852], performance in rmsle is 0.1396885157674008
For params [11.         16.          7.          0.99        0.13522982], performance in rmsle is 0.13522981676959128
For params [11.         16.          7.          0.75        

In [17]:
params_grid.sort_values(by='rmsle')

Unnamed: 0,max_depth,min_samples_split,max_features,max_samples,rmsle
60,11,4,7,0.99,0.131811
70,11,8,7,0.75,0.133325
61,11,4,7,0.75,0.133776
69,11,8,7,0.99,0.133804
62,11,4,7,0.5,0.133841
33,8,4,7,0.99,0.133968
34,8,4,7,0.75,0.134213
57,11,4,11,0.99,0.134387
58,11,4,11,0.75,0.134424
71,11,8,7,0.5,0.134758


In [18]:
param_space = {'max_depth': [10, 11, 12, 14],
               'min_samples_split': [4, 6, 8, 10],
               'max_features': [9, 7, 5],
               'max_samples': [.99, 0.9, 0.8, 0.7],
               #'n_estimators': [50, 100]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = RandomForestRegressor(n_estimators=50, 
                                  max_depth=params_grid.loc[params, 'max_depth'],
                                  min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                  max_features=params_grid.loc[params, 'max_features'],
                                  max_samples=params_grid.loc[params, 'max_samples'],
                                  random_state=dev_seed, n_jobs=n_jobs)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,max_depth,min_samples_split,max_features,max_samples,rmsle
152,14,4,5,0.99,0.130342
117,12,6,5,0.9,0.130764
8,10,4,5,0.99,0.131016
177,14,8,5,0.9,0.13104
129,12,8,5,0.9,0.131103
57,11,4,5,0.9,0.131198
69,11,6,5,0.9,0.131347
148,14,4,7,0.99,0.131438
105,12,4,5,0.9,0.131449
153,14,4,5,0.9,0.131467


In [19]:
param_space = {'max_depth': [13, 14, 15],
               'min_samples_split': [4, 6, 8, 10],
               'max_features': [6, 5, 4],
               'max_samples': [.99, 0.95, 0.9],
               #'n_estimators': [50, 100]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = RandomForestRegressor(n_estimators=50, 
                                  max_depth=params_grid.loc[params, 'max_depth'],
                                  min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                  max_features=params_grid.loc[params, 'max_features'],
                                  max_samples=params_grid.loc[params, 'max_samples'],
                                  random_state=dev_seed, n_jobs=n_jobs)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,max_depth,min_samples_split,max_features,max_samples,rmsle
39,14,4,5,0.99,0.130342
75,15,4,5,0.99,0.130348
3,13,4,5,0.99,0.130516
4,13,4,5,0.95,0.130861
40,14,4,5,0.95,0.130942
14,13,6,5,0.9,0.130952
73,15,4,6,0.95,0.131023
59,14,8,5,0.9,0.13104
86,15,6,5,0.9,0.131119
1,13,4,6,0.95,0.131285


In [21]:
param_space = {'max_depth': [14],
               'min_samples_split': [4],
               'max_features': [5],
               'max_samples': [0.99],
               'n_estimators': [50, 75, 100, 150, 200]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = RandomForestRegressor(n_estimators=params_grid.loc[params, 'n_estimators'],
                                  max_depth=params_grid.loc[params, 'max_depth'],
                                  min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                  max_features=params_grid.loc[params, 'max_features'],
                                  max_samples=params_grid.loc[params, 'max_samples'],
                                  random_state=dev_seed, n_jobs=n_jobs)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,max_depth,min_samples_split,max_features,max_samples,n_estimators,rmsle
2,14,4,5,0.99,100,0.130342
0,14,4,5,0.99,50,0.130342
3,14,4,5,0.99,150,0.130392
1,14,4,5,0.99,75,0.130441
4,14,4,5,0.99,200,0.130746


In [None]:
### GBR

In [23]:
param_space = {'learning_rate': [0.001, 0.01, 0.1, 0.25],
               'max_depth': [6, 9, 12, 15],
               'min_samples_split': [2, 4, 8, 16],
               'max_features': [4, 5, 7, 9],
               'subsample': [1, 0.8, 0.6],
               #'n_estimators': [50, 75, 100, 150, 200]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = GradientBoostingRegressor(learning_rate=params_grid.loc[params, 'learning_rate'],
                                      max_depth=params_grid.loc[params, 'max_depth'],
                                      min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                      max_features=params_grid.loc[params, 'max_features'],
                                      subsample=params_grid.loc[params, 'subsample'],
                                      random_state=dev_seed)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,learning_rate,max_depth,min_samples_split,max_features,subsample,rmsle
423,0.100,6,16,5,1.0,0.126087
413,0.100,6,8,5,0.6,0.127128
411,0.100,6,8,5,1.0,0.127390
399,0.100,6,4,5,1.0,0.127550
424,0.100,6,16,5,0.8,0.127684
...,...,...,...,...,...,...
36,0.001,6,16,4,1.0,0.368580
37,0.001,6,16,4,0.8,0.368582
26,0.001,6,8,4,0.6,0.368589
14,0.001,6,4,4,0.6,0.368645


In [25]:
params_grid.groupby('learning_rate')['rmsle'].mean()

learning_rate
0.001    0.367131
0.010    0.196131
0.100    0.132166
0.250    0.141483
Name: rmsle, dtype: float64

In [27]:
params_grid[params_grid['learning_rate'] == 0.1].groupby('max_depth')['rmsle'].mean()

max_depth
6     0.129364
9     0.132234
12    0.133494
15    0.133571
Name: rmsle, dtype: float64

In [28]:
params_grid[params_grid['learning_rate'] == 0.1].groupby('min_samples_split')['rmsle'].mean()

min_samples_split
2     0.133357
4     0.132798
8     0.131717
16    0.130791
Name: rmsle, dtype: float64

In [29]:
params_grid[params_grid['learning_rate'] == 0.1].groupby('subsample')['rmsle'].mean()

subsample
0.6    0.131401
0.8    0.131837
1.0    0.133260
Name: rmsle, dtype: float64

In [30]:
param_space = {'learning_rate': [0.075, 0.1, 0.125,],
               'max_depth': [4, 5, 6, 7],
               'min_samples_split': [10, 12, 16, 20],
               'max_features': [5],
               'subsample': [1, 0.9, 0.75, 0.5],
               #'n_estimators': [50, 75, 100, 150, 200]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = GradientBoostingRegressor(learning_rate=params_grid.loc[params, 'learning_rate'],
                                      max_depth=params_grid.loc[params, 'max_depth'],
                                      min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                      max_features=params_grid.loc[params, 'max_features'],
                                      subsample=params_grid.loc[params, 'subsample'],
                                      random_state=dev_seed)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,learning_rate,max_depth,min_samples_split,max_features,subsample,rmsle
10,0.075,4,16,5,0.75,0.125315
35,0.075,6,10,5,0.5,0.125517
2,0.075,4,10,5,0.75,0.12561
6,0.075,4,12,5,0.75,0.125809
78,0.1,4,20,5,0.75,0.125818
98,0.1,6,10,5,0.75,0.125866
58,0.075,7,16,5,0.75,0.126036
38,0.075,6,12,5,0.75,0.126064
104,0.1,6,16,5,1.0,0.126087
94,0.1,5,20,5,0.75,0.126116


In [31]:
param_space = {'learning_rate': [0.006, 0.075, 0.9],
               'max_depth': [4, 5, 6],
               'min_samples_split': [10, 12, 14, 16],
               'max_features': [5],
               'subsample': [0.75],
               #'n_estimators': [50, 75, 100, 150, 200]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = GradientBoostingRegressor(learning_rate=params_grid.loc[params, 'learning_rate'],
                                      max_depth=params_grid.loc[params, 'max_depth'],
                                      min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                      max_features=params_grid.loc[params, 'max_features'],
                                      subsample=params_grid.loc[params, 'subsample'],
                                      random_state=dev_seed)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,learning_rate,max_depth,min_samples_split,max_features,subsample,rmsle
15,0.075,4,16,5,0.75,0.125315
18,0.075,5,14,5,0.75,0.125554
12,0.075,4,10,5,0.75,0.12561
13,0.075,4,12,5,0.75,0.125809
21,0.075,6,12,5,0.75,0.126064
22,0.075,6,14,5,0.75,0.126233
14,0.075,4,14,5,0.75,0.126437
17,0.075,5,12,5,0.75,0.126656
23,0.075,6,16,5,0.75,0.126658
20,0.075,6,10,5,0.75,0.126777


In [33]:
param_space = {'learning_rate': [0.075],
               'max_depth': [4],
               'min_samples_split': [16],
               'max_features': [5],
               'subsample': [0.75],
               'n_estimators': [50, 75, 100, 150, 200]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = GradientBoostingRegressor(learning_rate=params_grid.loc[params, 'learning_rate'],
                                      max_depth=params_grid.loc[params, 'max_depth'],
                                      min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                      max_features=params_grid.loc[params, 'max_features'],
                                      subsample=params_grid.loc[params, 'subsample'],
                                      n_estimators=params_grid.loc[params, 'n_estimators'],
                                      random_state=dev_seed)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,learning_rate,max_depth,min_samples_split,max_features,subsample,n_estimators,rmsle
4,0.075,4,16,5,0.75,200,0.125139
3,0.075,4,16,5,0.75,150,0.125178
2,0.075,4,16,5,0.75,100,0.125315
1,0.075,4,16,5,0.75,75,0.127159
0,0.075,4,16,5,0.75,50,0.130384


In [10]:
param_space = {'learning_rate': [0.075],
               'max_depth': [2, 3, 4],
               'min_samples_split': [16],
               'max_features': [5],
               'subsample': [0.75],
               'n_estimators': [200]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = GradientBoostingRegressor(learning_rate=params_grid.loc[params, 'learning_rate'],
                                      max_depth=params_grid.loc[params, 'max_depth'],
                                      min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                      max_features=params_grid.loc[params, 'max_features'],
                                      subsample=params_grid.loc[params, 'subsample'],
                                      n_estimators=params_grid.loc[params, 'n_estimators'],
                                      random_state=dev_seed)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,learning_rate,max_depth,min_samples_split,max_features,subsample,n_estimators,rmsle
2,0.075,4,16,5,0.75,200,0.125139
1,0.075,3,16,5,0.75,200,0.126311
0,0.075,2,16,5,0.75,200,0.128427


In [12]:
# After robustscaler
param_space = {'learning_rate': [0.07, 0.075, 0.08],
               'max_depth': [3, 4],
               'min_samples_split': [12, 16],
               'max_features': [4, 5, 6],
               'subsample': [0.75],
               'n_estimators': [200]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = GradientBoostingRegressor(learning_rate=params_grid.loc[params, 'learning_rate'],
                                      max_depth=params_grid.loc[params, 'max_depth'],
                                      min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                      max_features=params_grid.loc[params, 'max_features'],
                                      subsample=params_grid.loc[params, 'subsample'],
                                      n_estimators=params_grid.loc[params, 'n_estimators'],
                                      random_state=dev_seed)
    rmsle = hyperparam_testing_pipe(data, keep_features, target, model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,learning_rate,max_depth,min_samples_split,max_features,subsample,n_estimators,rmsle
22,0.075,4,16,5,0.75,200,0.124488
29,0.08,3,16,6,0.75,200,0.124589
14,0.075,3,12,6,0.75,200,0.12465
17,0.075,3,16,6,0.75,200,0.125138
34,0.08,4,16,5,0.75,200,0.125231
2,0.07,3,12,6,0.75,200,0.125244
31,0.08,4,12,5,0.75,200,0.125285
5,0.07,3,16,6,0.75,200,0.125316
26,0.08,3,12,6,0.75,200,0.125484
35,0.08,4,16,6,0.75,200,0.125886


In [30]:
# After robustscaler
param_space = {'learning_rate': [0.075],
               'max_depth': [4],
               'min_samples_split': [16],
               'max_features': [5],
               'subsample': [0.75],
               'n_estimators': [200, 500, 1000]
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = GradientBoostingRegressor(learning_rate=params_grid.loc[params, 'learning_rate'],
                                      max_depth=params_grid.loc[params, 'max_depth'],
                                      min_samples_split=params_grid.loc[params, 'min_samples_split'],
                                      max_features=params_grid.loc[params, 'max_features'],
                                      subsample=params_grid.loc[params, 'subsample'],
                                      n_estimators=params_grid.loc[params, 'n_estimators'],
                                      random_state=dev_seed)
    rmsle = hyperparam_testing_pipe(data, keep_features, square_features=[], target=target, model=model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,learning_rate,max_depth,min_samples_split,max_features,subsample,n_estimators,rmsle
0,0.075,4,16,5,0.75,200,0.124488
1,0.075,4,16,5,0.75,500,0.126727
2,0.075,4,16,5,0.75,1000,0.128996


In [None]:
### SVR

In [38]:
param_space = {'kernel': ['linear'],
               'tol': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1],
               'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1],
               'epsilon': [0.001, 0.01, 0.1, 0.25],
               #'degree': [],
               #'gamma': []
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = SVR(kernel = params_grid.loc[params, 'kernel'],
                tol = params_grid.loc[params, 'tol'],
                C = params_grid.loc[params, 'C'],
                epsilon = params_grid.loc[params, 'epsilon'],
           )
    rmsle = hyperparam_testing_pipe(data, keep_features, square_features=square_features, target=target, model=model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,kernel,tol,C,epsilon,rmsle
94,linear,0.001,0.1,0.1,0.128883
46,linear,1e-05,0.1,0.1,0.128891
22,linear,1e-06,0.1,0.1,0.128891
70,linear,0.0001,0.1,0.1,0.128891
118,linear,0.01,0.1,0.1,0.129028
141,linear,0.1,0.1,0.01,0.12943
142,linear,0.1,0.1,0.1,0.129816
90,linear,0.001,0.01,0.1,0.130061
114,linear,0.01,0.01,0.1,0.130068
18,linear,1e-06,0.01,0.1,0.130086


In [39]:
params_grid[params_grid['C'] > 0.001].groupby('tol')['rmsle'].mean()

tol
0.000001    0.132132
0.000010    0.132133
0.000100    0.132138
0.001000    0.132136
0.010000    0.132201
0.100000    0.132711
Name: rmsle, dtype: float64

In [40]:
params_grid[params_grid['C'] > 0.001].groupby('epsilon')['rmsle'].mean()

epsilon
0.001    0.130776
0.010    0.130427
0.100    0.129634
0.250    0.138130
Name: rmsle, dtype: float64

In [41]:
param_space = {'kernel': ['linear'],
               'tol': [0.0000001, 0.000001],
               'C': [0.01, 0.1, 0.2, 0.5],
               'epsilon': [0.05, 0.1, 0.15],
               #'degree': [],
               #'gamma': []
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = SVR(kernel = params_grid.loc[params, 'kernel'],
                tol = params_grid.loc[params, 'tol'],
                C = params_grid.loc[params, 'C'],
                epsilon = params_grid.loc[params, 'epsilon'],
           )
    rmsle = hyperparam_testing_pipe(data, keep_features, square_features=square_features, target=target, model=model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,kernel,tol,C,epsilon,rmsle
4,linear,1e-07,0.1,0.1,0.128891
16,linear,1e-06,0.1,0.1,0.128891
7,linear,1e-07,0.2,0.1,0.128922
19,linear,1e-06,0.2,0.1,0.128922
22,linear,1e-06,0.5,0.1,0.128953
10,linear,1e-07,0.5,0.1,0.128953
9,linear,1e-07,0.5,0.05,0.129838
21,linear,1e-06,0.5,0.05,0.129838
18,linear,1e-06,0.2,0.05,0.129909
6,linear,1e-07,0.2,0.05,0.129909


In [43]:
param_space = {'kernel': ['rbf'],
               'tol': [0.0000001, 0.000001],
               'C': [0.5, 1, 2, 5],
               'epsilon': [0.001, 0.01, 0.05, 0.1],
               #'degree': [],
               #'gamma': []
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = SVR(kernel = params_grid.loc[params, 'kernel'],
                tol = params_grid.loc[params, 'tol'],
                C = params_grid.loc[params, 'C'],
                epsilon = params_grid.loc[params, 'epsilon'],
           )
    rmsle = hyperparam_testing_pipe(data, keep_features, square_features=square_features, target=target, model=model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,kernel,tol,C,epsilon,rmsle
22,rbf,1e-06,1.0,0.05,0.137868
6,rbf,1e-07,1.0,0.05,0.137868
5,rbf,1e-07,1.0,0.01,0.137893
21,rbf,1e-06,1.0,0.01,0.137893
18,rbf,1e-06,0.5,0.05,0.13813
2,rbf,1e-07,0.5,0.05,0.13813
20,rbf,1e-06,1.0,0.001,0.138501
4,rbf,1e-07,1.0,0.001,0.138501
1,rbf,1e-07,0.5,0.01,0.139474
17,rbf,1e-06,0.5,0.01,0.139474


In [44]:
param_space = {'kernel': ['poly'],
               'tol': [0.0000001, 0.000001],
               'C': [0.1, 0.5, 1, 2],
               'epsilon': [0.001, 0.01, 0.05, 0.1],
               'degree': [2, 3, 4, 5],
               #'gamma': []
              }

params_grid = pd.DataFrame(list(itertools.product(*[param_space[x] for x in param_space])), columns = param_space.keys())
params_grid['rmsle'] = 0

for params in range(len(params_grid)):
    model = SVR(kernel = params_grid.loc[params, 'kernel'],
                tol = params_grid.loc[params, 'tol'],
                C = params_grid.loc[params, 'C'],
                epsilon = params_grid.loc[params, 'epsilon'],
           )
    rmsle = hyperparam_testing_pipe(data, keep_features, square_features=square_features, target=target, model=model)
    params_grid.loc[params, 'rmsle'] = rmsle
    
params_grid.sort_values(by='rmsle')

Unnamed: 0,kernel,tol,C,epsilon,degree,rmsle
91,poly,1e-06,0.5,0.05,5,0.152417
90,poly,1e-06,0.5,0.05,4,0.152417
89,poly,1e-06,0.5,0.05,3,0.152417
88,poly,1e-06,0.5,0.05,2,0.152417
24,poly,1e-07,0.5,0.05,2,0.152417
26,poly,1e-07,0.5,0.05,4,0.152417
27,poly,1e-07,0.5,0.05,5,0.152417
25,poly,1e-07,0.5,0.05,3,0.152417
44,poly,1e-07,1.0,0.1,2,0.160858
45,poly,1e-07,1.0,0.1,3,0.160858


In [None]:
# Set up ensable function based on cv split

In [None]:
# individual best performance
# ridge: 0.12906
# gbr:   0.12449
# rfr:   0.13034
# svr:   0.12889

In [73]:
ridge = Ridge(alpha=4)
gbr = GradientBoostingRegressor(learning_rate=0.075, max_depth=4, min_samples_split=16, max_features=5,
                                subsample=0.75, n_estimators=200, random_state=dev_seed)
rfr = RandomForestRegressor(n_estimators=100, max_depth=14, min_samples_split=4, max_features=5,
                            max_samples=0.99, random_state=dev_seed, n_jobs=n_jobs)
svr = SVR(kernel = 'linear', tol = 0.0000001, C = 0.1, epsilon = 0.1)

models = [ridge, gbr, rfr, svr]
use_sq_feats = [True, False, False, True]

In [81]:
model_weights = [0.25, 0.25, 0.25, 0.25]
ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats)

0.12327227408085846

In [82]:
model_weights = [0.15, 0.5, 0.1, 0.25]
ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats)

0.12208419207228124

In [84]:
model_weights = [0.2, 0.4, 0.15, 0.25]
ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats)

0.12241824857478305

In [85]:
model_weights = [0.15, 0.6, 0.05, 0.2]
ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats)

0.1219859942490201

In [86]:
# winner
model_weights = [0.15, 0.6, 0, 0.25]
ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats)

0.12198138766715935

In [87]:
model_weights = [0.25, 0.5, 0, 0.25]
ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats)

0.12227418313971113

In [88]:
model_weights = [0.1, 0.7, 0, 0.2]
ensambling_testing_pipe(data, keep_features, square_features, target, models, model_weights, use_sq_feats)

0.12207095645075428