In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform as sp_rand
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## Pre-processed data

In [5]:
# --- First pre-processed dataset -------------------------------------------
train_new = pd.read_csv('train_new.csv')
test_new = pd.read_csv('test_new.csv')

# Features are every training column except the target ("stroke") and "id".
train_y = train_new["stroke"]
train_x = train_new.drop(columns=["stroke", "id"])
# The test frame is split by position: everything after its first column.
test_x = test_new.iloc[:, 1:]

# --- Second pre-processed dataset (same split as above) --------------------
train_new2 = pd.read_csv('train_new2.csv')
test_new2 = pd.read_csv('test_new2.csv')

train_y2 = train_new2["stroke"]
train_x2 = train_new2.drop(columns=["stroke", "id"])
test_x2 = test_new2.iloc[:, 1:]

## Ridge parameter analysis and model setting

In [66]:
# Candidate regularisation strengths for the exhaustive search.
alphas = np.array([207.9763, 207.9767, 207.9765])

# Cross-validate a plain Ridge model once per candidate alpha. cv and scoring
# are left at GridSearchCV's defaults.
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid={"alpha": alphas}).fit(train_x, train_y)
print(grid)

# Report the best mean cross-validated score and the alpha that achieved it.
best_alpha = grid.best_estimator_.alpha
print(grid.best_score_)
print(best_alpha)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'alpha': array([207.9763, 207.9767, 207.9765])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
0.032020652054245057
207.9765




In [38]:
# Randomised search over the ridge penalty.
# sp_rand() with no arguments is scipy.stats.uniform with its default
# loc=0, scale=1, so alpha is sampled from the interval [0, 1].
# NOTE(review): this range cannot reach the alpha values near 208 that the
# grid search in this notebook tests - confirm the range is intentional.
param_grid = {'alpha': sp_rand()}

# Fit a ridge regression for each of the 100 sampled alpha values.
# random_state fixes the sampled alphas so that a re-run reproduces the same
# search; without it every execution draws a different set of candidates.
# Any output recorded before the seed was added must be refreshed by re-running.
model = Ridge()
rsearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=100,
    random_state=42,
)
rsearch.fit(train_x, train_y)
print(rsearch)

# Summarise the search: best mean cross-validated score and its alpha.
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params=None, iid='warn', n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1120f67f0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
0.031881452354033675
0.999381234890539




In [67]:
# Fit the final ridge model with a fixed alpha on the first dataset, then
# score the matching test features. fit() returns the estimator itself, so the
# construction and the fit can be chained.
clf = Ridge(alpha=207.9765, solver='auto').fit(train_x, train_y)
preds3 = clf.predict(test_x)

In [32]:
# Two-stage pipeline: standardise the features, then fit a ridge regression.
steps = [
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
]
pipeline = Pipeline(steps)

# Hyperparameter space for the 'ridge' step. With num=1, logspace yields a
# single candidate (10 ** 0.97), so the search evaluates exactly one alpha.
parameters = {'ridge__alpha': np.logspace(0.97, 0.98, 1)}

# 3-fold cross-validated grid search, fitted on the second dataset.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3)
cv.fit(train_x2, train_y2)

# Predictions for the training rows and for the held-out test rows.
y_pred_train = cv.predict(train_x2)
y_pred_test = cv.predict(test_x2)

# RMSE measured on the training set itself (in-sample error).
train_mse = mean_squared_error(train_y2, y_pred_train)
rmse = np.sqrt(train_mse)
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 0.12073632557303447




In [68]:
# Build the submission file: one row per test record with its predicted value.
#
# preds3 is computed from test_x, which is derived from test_new, so the ids
# are read from test_new too. Reading them from test_new2 would mislabel or
# misalign the predictions whenever the two test files differ in order or length.
# The id is taken by position (first column), matching how test_x drops it.
ids = test_new.iloc[:, 0]

# Place ids and predictions side by side, then give the columns their names.
output = pd.concat([ids, pd.DataFrame(preds3)], axis=1, ignore_index=True)
output.columns = ['id', 'stroke']

output.to_csv('ridge2.csv', index=False)