In [1]:
from sklearn.model_selection import train_test_split #for splitting the data in train and test
from sklearn.preprocessing import MinMaxScaler #for various scaling methods
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,recall_score #for accuracy matrices
from sklearn.metrics import precision_score,classification_report,roc_auc_score,precision_score #for accuracy matrices


In [2]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import matplotlib.style as style
%matplotlib inline
style.use('ggplot')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter("ignore")

pd.pandas.set_option('display.max_columns',None)

In [3]:
# Load the dataset from a CSV file (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv('classification_dataset.csv')
df.head()

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month,Target
0,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
1,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
2,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
3,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
4,12625.800781,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,0.0,1025,10,99,2016,3,0


In [4]:
# Dimensions of the frame as loaded: (rows, columns).
df.shape

(155223, 19)

In [5]:
# Column labels of the loaded frame.
df.columns

Index(['density_per_km', 'latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
       'visibility', 'winddirDegree', 'year', 'month', 'Target'],
      dtype='object')

In [6]:
# Count rows that repeat an earlier row in every column
# (the first occurrence of each row is not counted as a duplicate).
df.duplicated().sum()

79878

In [7]:
# Keep the first occurrence of every distinct row and discard the repeats.
# The result is a new frame that is rebound to `df`; the original index labels are kept.
df = df.drop_duplicates()

In [8]:
# Dimensions after duplicate rows were removed: (rows, columns).
df.shape

(75345, 19)

## Feature Scaling

In [9]:
# Columns to rescale into [0, 1]; every other column is carried over unchanged.
feature_scale = [col for col in df.columns if col in ('density_per_km', 'precipMM')]

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the train/test
# split done later in the notebook, so held-out rows contribute to the min/max.
scaler = MinMaxScaler()
scaled_part = pd.DataFrame(scaler.fit_transform(df[feature_scale]), columns=feature_scale)

# Unscaled columns, label first. The index is reset so these rows line up by position
# with `scaled_part`, which is built from a bare array and so has a fresh 0..n-1 index.
passthrough_cols = ['Target','latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'pressure',
       'visibility', 'winddirDegree', 'year', 'month']
unscaled_part = df[passthrough_cols].reset_index(drop=True)

# Side-by-side join: unscaled columns followed by the two scaled ones.
data = pd.concat([unscaled_part, scaled_part], axis=1)

data.head()

Unnamed: 0,Target,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,pressure,visibility,winddirDegree,year,month,density_per_km,precipMM
0,1,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,1018,10,189,2016,3,0.505636,0.0
1,0,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,1018,10,189,2016,3,0.505636,0.0
2,0,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,1025,10,99,2016,3,0.505636,0.0
3,1,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,1025,10,99,2016,3,0.505636,0.0
4,0,-34.603684,-58.381559,22,18,0.0,8.6,26,15,17,61,78,1015,8,105,2016,4,0.505636,0.097011


# Train Test split

In [10]:
# Separate the label column from the feature matrix.
y = data['Target']
X = data.drop(columns='Target')

# 50/50 split, stratified on the label so both halves keep the same class ratio;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, stratify=y, random_state=10
)

In [11]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,pressure,visibility,winddirDegree,year,month,density_per_km,precipMM
9747,5.566838,-73.450095,17,9,0.0,11.6,34,10,9,47,86,1013,6,156,2016,2,0.002883,0.006155
26293,3.239241,-74.350661,23,16,0.0,10.5,48,18,6,56,89,1015,7,263,2016,5,0.000128,0.118406
41021,11.378934,-72.248517,34,26,0.0,11.6,81,22,24,18,65,1009,10,82,2016,8,0.004803,0.007327
12705,4.575712,-76.199521,26,16,0.0,11.6,14,15,7,21,75,1013,9,213,2016,3,0.001296,0.02755
48883,4.956167,-76.607349,24,13,0.0,11.6,2,13,3,33,72,1014,9,142,2016,10,0.000317,0.030481


# RidgeClassifier

In [12]:
# Baseline: RidgeClassifier with default hyperparameters, fitted on the training
# split and scored on the held-out split.
ridge = RidgeClassifier().fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

print('Accuracy of RidgeClassifier :{:.4f}'.format(accuracy_score(y_test, ridge_pred)))
# Confusion matrix first, then the per-class precision/recall/F1 report.
for report in (confusion_matrix, classification_report):
    print(report(y_test, ridge_pred))

Accuracy of RidgeClassifier :0.5697
[[14411  5956]
 [10254  7052]]
              precision    recall  f1-score   support

           0       0.58      0.71      0.64     20367
           1       0.54      0.41      0.47     17306

    accuracy                           0.57     37673
   macro avg       0.56      0.56      0.55     37673
weighted avg       0.56      0.57      0.56     37673



### K Fold Cross Validation

In [14]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ten consecutive, unshuffled folds over the full (X, y).
# NOTE(review): the fold scores recorded below vary widely, which may mean the rows
# are ordered — TODO confirm, and consider shuffle=True with a fixed random_state.
kfold_validation = KFold(n_splits=10)

model = RidgeClassifier()
results = cross_val_score(model, X, y, cv=kfold_validation)
print(results)
# Summary statistics of the per-fold scores.
for label, reducer in (('Max :', np.max), ('Min :', np.min), ('Mean :', np.mean)):
    print(label, reducer(results))

[0.66290644 0.59097545 0.56376908 0.55527538 0.55063039 0.55495089
 0.54260685 0.3866472  0.41544996 0.50982214]
Max : 0.6629064366290643
Min : 0.3866471993628882
Mean : 0.5333033772665884


### Stratified K-fold Cross Validation

In [15]:
from sklearn.model_selection import StratifiedKFold

# Same 10-fold evaluation, but each fold preserves the class proportions of y.
skfold = StratifiedKFold(n_splits=10)
model = RidgeClassifier()
scores = cross_val_score(estimator=model, X=X, y=y, cv=skfold)
# Mean of the per-fold scores.
print(scores.mean())

0.4957164750498911


In [16]:
# Per-fold scores from the stratified 10-fold run.
scores

array([0.54187127, 0.56960849, 0.56828135, 0.50218978, 0.53881885,
       0.55216353, 0.46270242, 0.26426865, 0.48062118, 0.47663924])

In [17]:
# Best, worst and average fold score of the stratified run.
for label, reducer in (('Max :', np.max), ('Min :', np.min), ('Mean :', np.mean)):
    print(label, reducer(scores))

Max : 0.5696084936960849
Min : 0.2642686487921423
Mean : 0.4957164750498911


# Tuning the model using GridSearchCV
- Ridge regression is a penalized linear regression model for predicting a numerical value.
- Nevertheless, it can be very effective when applied to classification.
- Perhaps the most important parameter to tune is the regularization strength (alpha). A good starting point is a set of values in the range 0.1 to 1.0.

1. alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [18]:
# Grid search over the regularization strength (alpha) of RidgeClassifier.
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

model = RidgeClassifier()

# Candidate alpha values: 0.1 through 1.0 in steps of 0.1.
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Parameter grid: hyperparameter name -> list of values to try.
grid = {'alpha': alpha}

# Ten unshuffled folds; the search below is run on the training split only.
cv = KFold(n_splits=10)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,
    error_score=0,  # a candidate whose fit raises is scored 0 instead of aborting the search
    verbose=2,
)
grid_result = grid_search.fit(X_train, y_train)

# Summary: best setting first, then mean (std) accuracy for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.2; total time=   0.0s
[CV] END .....................................

In [19]:
# Mean cross-validated accuracy of the best candidate from the grid search.
# This is computed on folds of the training split, not on the held-out test split.
accuracy = grid_result.best_score_
accuracy

0.5707954019626243

In [20]:
# Hyperparameter setting that achieved the best mean cross-validated score.
grid_result.best_params_

{'alpha': 0.5}

## Model with parameters

In [27]:
# Refit RidgeClassifier on the training split with the hyperparameters selected by
# the grid search. Reading them from `grid_result.best_params_` keeps this cell in
# sync with the search instead of relying on a hand-copied alpha value.
ridge_model = RidgeClassifier(**grid_result.best_params_)
ridge_model.fit(X_train,y_train)

# Evaluate the tuned model on the held-out split.
ridge_model_pred = ridge_model.predict(X_test)
print('Accuracy of RidgeClassifier :{:.4f}'.format(accuracy_score(y_test,ridge_model_pred)))
print(confusion_matrix(y_test,ridge_model_pred))
print(classification_report(y_test,ridge_model_pred))

Accuracy of RidgeClassifier :0.5697
[[14410  5957]
 [10253  7053]]
              precision    recall  f1-score   support

           0       0.58      0.71      0.64     20367
           1       0.54      0.41      0.47     17306

    accuracy                           0.57     37673
   macro avg       0.56      0.56      0.55     37673
weighted avg       0.56      0.57      0.56     37673



In [22]:
# Held-out accuracy of the default model versus the tuned model, one line each.
for label, preds in (
    ('Accuracy before tuning :', ridge_pred),
    ('\nAccuracy after tuning :', ridge_model_pred),
):
    print(label, accuracy_score(y_test, preds))

Accuracy before tuning : 0.5697183659384705

Accuracy after tuning : 0.5697183659384705
