In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Single place to point the notebook at the competition files: edit only this
# line to run on another machine. The default is the original location.
DATA_DIR = Path(r"C:\Users\Administrator.DAI-PC2\Downloads\Shubham\Practical Machine Learning\Kaggle\Flood Prediction")

# `train` carries the FloodProbability target; `test` is the file that is
# scored for the submission at the end of the notebook.
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [3]:
# Target to regress on.
y = train["FloodProbability"]

# Feature matrices: the row identifier is not a predictor, and the target must
# not leak into the training features.
X = train.drop(columns=["id", "FloodProbability"])
X_valid = test.drop(columns="id")

In [4]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

In [5]:
## Model 1: Ridge regression (default hyperparameters)

In [6]:
# Ridge regression with default hyperparameters, scored on the hold-out split.
# `ridge` and `r2_ridge` are reused later by the voting ensembles.
ridge = Ridge().fit(X_train, y_train)
y_pred = ridge.predict(X_test)
r2_ridge = r2_score(y_true=y_test, y_pred=y_pred)
r2_ridge

0.8446066903335169

In [7]:
## Model 2: Lasso regression (default hyperparameters)

In [8]:
# Lasso regression with default hyperparameters, scored on the hold-out split.
# The recorded score for this default fit is roughly zero (slightly negative),
# which matters later because `r2_lasso` is reused as a voting weight.
lasso = Lasso().fit(X_train, y_train)
y_pred = lasso.predict(X_test)
r2_lasso = r2_score(y_true=y_test, y_pred=y_pred)
r2_lasso

-5.125602061673007e-06

In [9]:
## Model 3: Ordinary linear regression

In [10]:
# Plain least-squares linear regression, scored on the hold-out split.
# `lr` and `r2_lr` are reused later by the voting ensembles.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_lr = r2_score(y_true=y_test, y_pred=y_pred)
r2_lr

0.8446066905332366

In [11]:
## Model 4: Decision tree regressor (default depth, fixed seed)

In [12]:
# Decision tree regressor with a fixed seed and otherwise default settings,
# scored on the hold-out split. The name `dtc` is kept because later cells
# refer to it.
dtc = DecisionTreeRegressor(random_state=24).fit(X_train, y_train)
y_pred = dtc.predict(X_test)
r2_dtc = r2_score(y_true=y_test, y_pred=y_pred)
r2_dtc

0.049924522597523335

In [13]:
### Average voting

In [14]:
# Unweighted ensemble: every member contributes equally to the prediction.
# The member names ("RIDGE", "LASSO", "LR", "TREE") are the prefixes used by
# the hyperparameter search further down.
avg_members = [
    ("RIDGE", ridge),
    ("LASSO", lasso),
    ("LR", lr),
    ("TREE", dtc),
]
voting_avg = VotingRegressor(estimators=avg_members)
voting_avg.fit(X_train, y_train)

y_pred = voting_avg.predict(X_test)
print("R2 Score = ", r2_score(y_test, y_pred))

R2 Score =  0.7005696716305334


In [15]:
###### Weighted voting (members weighted by their hold-out R2)

In [16]:
# Weight each member by the hold-out R2 computed in the model cells above.
# R2 can be negative (Lasso scored about -5e-06 here), and a negative weight
# would subtract that model's prediction from the blend, so clamp at zero.
# NOTE(review): the weights come from the same X_test / y_test split that is
# scored below, so the printed R2 is optimistic; consider deriving the weights
# from cross-validation on X_train instead.
r2_weights = [max(score, 0.0) for score in (r2_ridge, r2_lasso, r2_lr, r2_dtc)]
if not any(r2_weights):
    # Every member scored <= 0: fall back to an unweighted average rather than
    # normalising by a zero weight sum.
    r2_weights = None

voting_w = VotingRegressor(
    [("RIDGE", ridge), ("LASSO", lasso), ("LR", lr), ("TREE", dtc)],
    weights=r2_weights,
)
voting_w.fit(X_train, y_train)
y_pred = voting_w.predict(X_test)
print("R2 Score = ", r2_score(y_test, y_pred))

R2 Score =  0.843975021062881


In [17]:
#### RandomizedSearchCV

In [18]:
# Randomized hyperparameter search over the unweighted voting ensemble.
# Keys are "<member name>__<parameter>", matching the names given to
# VotingRegressor; LinearRegression ("LR") is not tuned.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

alpha_grid = np.linspace(0.001, 3, 10)
params = {
    "RIDGE__alpha": alpha_grid,
    "LASSO__alpha": alpha_grid,
    "TREE__min_samples_split": [2, 4, 5, 8, 10],
    "TREE__min_samples_leaf": [1, 4, 5, 8, 10],
    "TREE__max_depth": [None, 3, 4, 5],
}

# 20 sampled configurations, each scored by 5-fold R2 on the full labelled set.
rgcv_avg = RandomizedSearchCV(
    voting_avg,
    param_distributions=params,
    n_iter=20,
    scoring='r2',
    cv=kfold,
    n_jobs=-1,
    random_state=24,
)
rgcv_avg.fit(X, y)

print(rgcv_avg.best_params_)
print(rgcv_avg.best_score_)

{'TREE__min_samples_split': 4, 'TREE__min_samples_leaf': 5, 'TREE__max_depth': None, 'RIDGE__alpha': 3.0, 'LASSO__alpha': 0.001}
0.8041880305234562


In [19]:
# Same randomized search as for the unweighted ensemble, applied to the
# weighted voting ensemble. The folds and search space are rebuilt here so
# that this cell does not depend on the previous search cell having been run.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

linear_grid = {
    "RIDGE__alpha": np.linspace(0.001, 3, 10),
    "LASSO__alpha": np.linspace(0.001, 3, 10),
}
tree_grid = {
    "TREE__min_samples_split": [2, 4, 5, 8, 10],
    "TREE__min_samples_leaf": [1, 4, 5, 8, 10],
    "TREE__max_depth": [None, 3, 4, 5],
}
params = {**linear_grid, **tree_grid}

rgcv_w = RandomizedSearchCV(
    estimator=voting_w,
    param_distributions=params,
    cv=kfold,
    scoring='r2',
    n_iter=20,
    n_jobs=-1,
    random_state=24,
)
rgcv_w.fit(X, y)

print(rgcv_w.best_params_)
print(rgcv_w.best_score_)

{'TREE__min_samples_split': 4, 'TREE__min_samples_leaf': 5, 'TREE__max_depth': None, 'RIDGE__alpha': 3.0, 'LASSO__alpha': 0.001}
0.8444741724884288


In [None]:
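### Choose the model for the submission.
### The best single hold-out R2 came from LinearRegression (0.8446066905332366);
### the cell below submits the tuned weighted voting ensemble instead
### (5-fold CV R2 0.8444741724884288). The two scores come from different
### evaluation schemes, so they are not directly comparable.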

In [23]:
# Score the competition test rows with the best weighted-voting configuration
# found by the randomized search, then write the submission file.
submission_path = r"C:\Users\Administrator.DAI-PC2\Downloads\Shubham\Practical Machine Learning\Kaggle\Flood Prediction\submit_voting.csv"

best_weighted = rgcv_w.best_estimator_
y_pred = best_weighted.predict(X_valid)

# Two columns only: the row id from the test file and the prediction.
submit = pd.DataFrame({"id": test["id"], "FloodProbability": y_pred})
submit.to_csv(submission_path, index=False)