In [3]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import pandas as pd
import numpy as np
from data_pipe import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Hazard_train.csv')

In [3]:
pipe = Pipeline([
    ('dummy1', create_dummy('T1_V4', 400)),
    ('dummy2', create_dummy('T1_V5', 400)),
    ('dummy3', create_dummy('T1_V6', 400)),
    ('dummy4', create_dummy('T1_V7', 400)),
    ('dummy5', create_dummy('T1_V8', 400)),
    ('dummy6', create_dummy('T1_V9', 400)),
    ('dummy7', create_dummy('T1_V11', 400)),
    ('dummy8', create_dummy('T1_V12', 400)),
    ('dummy9', create_dummy('T1_V15', 400)),
    ('dummy10', create_dummy('T1_V16', 400)),
    ('dummy11', create_dummy('T1_V17', 400)),
    ('dummy12', create_dummy('T2_V3', 400)),
    ('dummy13', create_dummy('T2_V5', 400)),
    ('dummy14', create_dummy('T2_V11', 400)),
    ('dummy15', create_dummy('T2_V12', 400)),
    ('dummy16', create_dummy('T2_V13', 400)),
    ('dropper', Dropper('Id'))
])

In [4]:
pipe.fit(df)

In [5]:
df = pipe.transform(df)

In [7]:
Y = df['Hazard']
del df['Hazard']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df, Y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 250, 300, 500],      
    'max_depth': [10, 15, 35, 50],            
    'min_samples_split': [100, 150, 200],                
    'min_samples_leaf': [60, 70, 80],                  
    'max_features': list(range(71, 80)),       
    'bootstrap': [True]                      
}

rf_reg = RandomForestRegressor(random_state=42)

random_search = RandomizedSearchCV(estimator=rf_reg,
                                   param_distributions=param_grid,
                                   n_iter=90,                     
                                   scoring='neg_mean_squared_error', 
                                   cv=10,                          
                                   verbose=1,                     
                                   random_state=42,
                                   n_jobs=-1) 

# random_search.fit(X_train, y_train)

In [None]:
best_params = {'n_estimators': 500, 'min_samples_split': 100, 'min_samples_leaf': 60, 'max_features': 72, 'max_depth': 15, 'bootstrap': True}

best_rf_reg = RandomForestRegressor(**best_params)
best_rf_reg.fit(X_train, y_train)

In [98]:
feat_imp_df=pd.DataFrame({'features':X_train.columns,
                          'importance':best_rf_reg.feature_importances_})

feat_imp_df=feat_imp_df.sort_values('importance',ascending=False)
feat_imp_df['normalised_imp']=feat_imp_df['importance']/np.sum(feat_imp_df['importance'])
feat_imp_df['cum_imp']=np.cumsum(feat_imp_df['normalised_imp'])
imp_feat = list(feat_imp_df.head(30)['features'])
imp_feat_df = pd.concat([X_train, X_test])[imp_feat]

In [100]:
param_grid = {'alpha': np.linspace(10.1, 10.7, 10)}
grid_search_ridge = GridSearchCV(Ridge(fit_intercept=True), param_grid, cv=10)
grid_search_ridge.fit(imp_feat_df, Y)

In [101]:
report(grid_search_ridge.cv_results_, 3)

Model with rank: 1
Mean validation score: -0.001357 (std: 0.000805)
Parameters: {'alpha': 10.7}

Model with rank: 2
Mean validation score: -0.001357 (std: 0.000805)
Parameters: {'alpha': 10.633333333333333}

Model with rank: 3
Mean validation score: -0.001357 (std: 0.000805)
Parameters: {'alpha': 10.566666666666666}



In [104]:
param_grid = {'alpha': np.linspace(10.1, 10.7, 10)}
grid_search_lasso = GridSearchCV(Lasso(fit_intercept=True), param_grid, cv=10)
grid_search_lasso.fit(imp_feat_df, Y)

In [105]:
report(grid_search_lasso.cv_results_, 1)

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.1}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.166666666666666}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.233333333333333}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.299999999999999}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.366666666666665}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.433333333333334}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.5}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.566666666666666}

Model with rank: 1
Mean validation score: -0.000278 (std: 0.000213)
Parameters: {'alpha': 10.633333333333333}

Model with rank: 1
Mean valid

In [108]:
print(f'for ridge MAE is {1-(-0.001357/5.4)}, \nfor lasso MAE is {1-(-0.000278/5.4)}.')

for ridge MAE is 1.0002512962962964, 
for lasso MAE is 1.0000514814814814.


In [110]:
test = pd.read_csv('Hazard_test_share.csv')

In [112]:
x_test = pipe.transform(test)

In [114]:
test = x_test[imp_feat]

In [116]:
pd.DataFrame(grid_search_ridge.predict(test)).to_csv('submission.csv', index=False)

In [118]:
grid_search_ridge.predict(test)

array([4.0143288 , 3.97748334, 4.26444731, ..., 3.99737647, 4.00823367,
       3.91596118])

In [148]:
pd.DataFrame(Y)['Hazard'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 40799 entries, 0 to 40798
Series name: Hazard
Non-Null Count  Dtype
--------------  -----
40799 non-null  int64
dtypes: int64(1)
memory usage: 318.9 KB
