In [13]:
from sklearn.model_selection import train_test_split #for splitting the data in train and test
from sklearn.preprocessing import MinMaxScaler #for various scaling methods
from sklearn.svm import SVC #for Support vector classifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,recall_score #for accuracy matrices
from sklearn.metrics import precision_score,classification_report,roc_auc_score,precision_score #for accuracy matrices

In [2]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import matplotlib.style as style
%matplotlib inline
style.use('ggplot')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter("ignore")

pd.pandas.set_option('display.max_columns',None)

In [3]:
# Read the dataset from the CSV file in the working directory, then preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='classification_dataset.csv')
df.head(n=5)

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month,Target
0,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
1,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
2,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
3,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
4,12625.800781,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,0.0,1025,10,99,2016,3,0


In [4]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(155223, 19)

In [5]:
# Column labels of the loaded frame (features plus the 'Target' label).
df.columns

Index(['density_per_km', 'latitude', 'longitude', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'moon_illumination', 'DewPointC',
       'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
       'visibility', 'winddirDegree', 'year', 'month', 'Target'],
      dtype='object')

In [6]:
# Number of rows that exactly repeat an earlier row (all columns compared;
# the first occurrence of each row is not counted as a duplicate).
df.duplicated(subset=None, keep='first').sum()

79878

In [7]:
# Keep the first occurrence of every row and discard later exact copies.
# The surviving rows keep their original index labels, so the index is no
# longer contiguous after this step.
df = df.drop_duplicates()

In [8]:
# Dimensions of the frame after duplicate rows were removed.
df.shape

(75345, 19)

## Feature Scaling

In [9]:
# Every column except the label is a feature to be scaled.
feature_scale = [col for col in df.columns if col != 'Target']

# Min-max scale the features.
# NOTE(review): the scaler is fitted on the whole dataset before the
# train/test split, so the held-out rows influence the scaling ranges.
# Consider fitting on the training split only.
scaler = MinMaxScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(df[feature_scale]),
    columns=feature_scale,
)

# Put the label column first, followed by the scaled features. The label's
# index is reset so it lines up row-for-row with the freshly indexed scaled
# frame (the de-duplicated frame no longer has a contiguous index).
data = pd.concat(
    [df[['Target']].reset_index(drop=True), scaled_features],
    axis=1,
)

data.head()

Unnamed: 0,Target,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month
0,1,0.505636,0.196981,0.742734,0.641791,0.567164,0.0,0.625806,0.87,0.622642,0.164835,0.0,0.55914,0.0,0.602941,1.0,0.531429,0.333333,0.181818
1,0,0.505636,0.196981,0.742734,0.641791,0.567164,0.0,0.625806,0.87,0.622642,0.164835,0.0,0.55914,0.0,0.602941,1.0,0.531429,0.333333,0.181818
2,0,0.505636,0.196981,0.742734,0.61194,0.61194,0.0,0.619355,0.76,0.698113,0.285714,0.34,0.698925,0.0,0.705882,1.0,0.274286,0.333333,0.181818
3,1,0.505636,0.196981,0.742734,0.61194,0.61194,0.0,0.619355,0.76,0.698113,0.285714,0.34,0.698925,0.0,0.705882,1.0,0.274286,0.333333,0.181818
4,0,0.505636,0.196981,0.742734,0.61194,0.731343,0.0,0.432258,0.26,0.792453,0.175824,0.61,0.763441,0.097011,0.558824,0.8,0.291429,0.333333,0.272727


# Train Test split

In [10]:
# Separate the label from the predictors.
y = data['Target']
X = data.drop(columns='Target')

# Hold out half of the rows for testing. The split is stratified on the label
# and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    stratify=y,
    random_state=10,
)

In [11]:
# Preview the training predictors; the index shows the original row positions
# picked by the split.
X_train.head()

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month
9747,0.002883,0.588673,0.563257,0.537313,0.597015,0.0,0.625806,0.34,0.698113,0.087912,0.47,0.849462,0.006155,0.529412,0.6,0.437143,0.333333,0.090909
26293,0.000128,0.565977,0.552531,0.626866,0.701493,0.0,0.554839,0.48,0.849057,0.054945,0.56,0.88172,0.118406,0.558824,0.7,0.742857,0.333333,0.363636
41021,0.004803,0.645345,0.577569,0.791045,0.850746,0.0,0.625806,0.81,0.924528,0.252747,0.18,0.623656,0.007327,0.470588,1.0,0.225714,0.333333,0.636364
12705,0.001296,0.579009,0.53051,0.671642,0.701493,0.0,0.625806,0.14,0.792453,0.065934,0.21,0.731183,0.02755,0.529412,0.9,0.6,0.333333,0.181818
48883,0.000317,0.582719,0.525652,0.641791,0.656716,0.0,0.625806,0.02,0.754717,0.021978,0.33,0.698925,0.030481,0.544118,0.9,0.397143,0.333333,0.818182


# RidgeClassifier

In [15]:
# Baseline: RidgeClassifier with default settings, fitted on the training half
# and evaluated on the held-out half.
ridge = RidgeClassifier().fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

# Report accuracy, the confusion matrix and the per-class
# precision / recall / F1 summary on the held-out half.
print(f'Accuracy of RidgeClassifier :{accuracy_score(y_test, ridge_pred):.4f}')
print(confusion_matrix(y_test, ridge_pred))
print(classification_report(y_test, ridge_pred))

Accuracy of RidgeClassifier :0.5697
[[14493  5874]
 [10338  6968]]
              precision    recall  f1-score   support

           0       0.58      0.71      0.64     20367
           1       0.54      0.40      0.46     17306

    accuracy                           0.57     37673
   macro avg       0.56      0.56      0.55     37673
weighted avg       0.56      0.57      0.56     37673



### K Fold Cross Validation

In [16]:
# LogisticRegression is imported here so the cell also runs on a fresh kernel:
# the notebook's top import cell does not bring it in, and without this import
# the cell fails with NameError under Restart & Run All.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ten consecutive (unshuffled) folds over the full scaled dataset.
kfold_validation=KFold(n_splits=10)

# NOTE(review): this cross-validates LogisticRegression while the rest of the
# notebook models with RidgeClassifier. Confirm that is intended.
model = LogisticRegression()
results=cross_val_score(model,X,y,cv=kfold_validation)

# Per-fold scores followed by their spread.
print(results)
print('Max :',np.max(results))
print('Min :',np.min(results))
print('Mean :',np.mean(results))

[0.66489715 0.59163902 0.56323822 0.5554081  0.55089582 0.55495089
 0.54273958 0.38731086 0.41040616 0.50955668]
Max : 0.6648971466489715
Min : 0.3873108574462437
Mean : 0.5331042463724283


### Stratified K-fold Cross Validation

In [17]:
# LogisticRegression is imported here so the cell also runs on a fresh kernel:
# the notebook's top import cell does not bring it in, and without this import
# the cell fails with NameError under Restart & Run All.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Ten folds that each preserve the class proportions of `y`.
skfold=StratifiedKFold(n_splits=10)

# NOTE(review): this cross-validates LogisticRegression while the rest of the
# notebook models with RidgeClassifier. Confirm that is intended.
model = LogisticRegression()
scores=cross_val_score(model,X,y,cv=skfold)
print(np.mean(scores))

0.4953713939849589


In [18]:
# Per-fold scores returned by the stratified 10-fold cross-validation run.
scores

array([0.5421367 , 0.56987392, 0.56522893, 0.50245521, 0.53948242,
       0.55760552, 0.46018051, 0.26148128, 0.47571011, 0.47955933])

In [20]:
# Best, worst and average fold score of the stratified cross-validation.
for label, reducer in (('Max :', np.max), ('Min :', np.min), ('Mean :', np.mean)):
    print(label, reducer(scores))

Max : 0.5698739216987392
Min : 0.26148128484204936
Mean : 0.4953713939849589


# Tuning the model using GridSearchCV
- Ridge regression is a penalized linear regression model for predicting a numerical value.
- Nevertheless, it can be very effective when applied to classification.
- Perhaps the most important parameter to tune is the regularization strength (alpha). A good starting point might be values in the range [0.1 to 1.0]

1. alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [22]:
# Grid search over the regularisation strength (alpha) of RidgeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

model = RidgeClassifier()

# candidate alpha values: 0.1 through 1.0 in steps of 0.1
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# parameter grid handed to the search
grid = {'alpha': alpha}

# ten consecutive (unshuffled) folds of the training split
cv = KFold(n_splits=10)

# accuracy-scored exhaustive search, run serially with per-fit progress
# logging; a fit that errors is scored as 0 instead of aborting the search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,
    error_score=0,
    verbose=2,
)
grid_result = grid_search.fit(X_train, y_train)

# summarize results: the winning setting first, then mean (std) accuracy
# across folds for every candidate
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.2; total time=   0.0s
[CV] END .....................................

In [23]:
# Mean cross-validated accuracy of the best alpha found by the grid search.
# It is computed on folds of the training split, not on the held-out test set.
accuracy = grid_result.best_score_
accuracy

0.5709812191807614

In [24]:
# Hyper-parameter setting that achieved the best mean cross-validated accuracy.
grid_result.best_params_

{'alpha': 0.1}

## Model with tuned parameters

In [32]:
# Refit RidgeClassifier with the alpha chosen by the grid search. The value is
# read from `grid_result.best_params_` instead of being hand-copied, so this
# cell cannot drift out of sync if the search outcome changes.
ridge_model = RidgeClassifier(alpha=grid_result.best_params_['alpha'])
ridge_model.fit(X_train,y_train)
ridge_model_pred = ridge_model.predict(X_test)

# Same held-out report as for the baseline model, for a like-for-like comparison.
print('Accuracy of RidgeClassifier :{:.4f}'.format(accuracy_score(y_test,ridge_model_pred)))
print(confusion_matrix(y_test,ridge_model_pred))
print(classification_report(y_test,ridge_model_pred))

Accuracy of RidgeClassifier :0.5699
[[14428  5939]
 [10264  7042]]
              precision    recall  f1-score   support

           0       0.58      0.71      0.64     20367
           1       0.54      0.41      0.47     17306

    accuracy                           0.57     37673
   macro avg       0.56      0.56      0.55     37673
weighted avg       0.57      0.57      0.56     37673



In [36]:
# Held-out accuracy of the default model next to the tuned one.
for label, preds in (
    ('Accuracy before tuning :', ridge_pred),
    ('\nAccuracy after tuning :', ridge_model_pred),
):
    print(label, accuracy_score(y_test, preds))

Accuracy before tuning : 0.5696652775197091

Accuracy after tuning : 0.5699041754041356
