<a href="https://colab.research.google.com/github/WoradeeKongthong/raining_tomorrow_classification/blob/master/02_Raining_LogisticRegression_plus_FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# libraries
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# **Data Set**

In [0]:
# Load weatherAUS.csv from the project's GitHub repository into a DataFrame.
weather_csv_url = 'https://raw.githubusercontent.com/WoradeeKongthong/raining_tomorrow_classification/master/weatherAUS.csv'
df = pd.read_csv(weather_csv_url)

In [23]:
# Summarise column dtypes and non-null counts to see which features have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

In [0]:
# Prepare the frame in one chained pass:
#   1. drop RISK_MM (recommendation from the data description on Kaggle),
#   2. parse Date and derive Year / Month / Day from it,
#   3. drop the raw Date column,
#   4. keep rows with Year > 2015 (the 2016-2017 observations used to train the model).
df = (
    df.drop(columns=['RISK_MM'])
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
      .assign(
          Year=lambda frame: frame['Date'].dt.year,
          Month=lambda frame: frame['Date'].dt.month,
          Day=lambda frame: frame['Date'].dt.day,
      )
      .drop(columns=['Date'])
      .loc[lambda frame: frame['Year'] > 2015]
)

In [25]:
# Re-check dtypes and non-null counts after dropping columns and filtering by year.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25974 entries, 2474 to 142192
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       25974 non-null  object 
 1   MinTemp        25814 non-null  float64
 2   MaxTemp        25881 non-null  float64
 3   Rainfall       25710 non-null  float64
 4   Evaporation    10704 non-null  float64
 5   Sunshine       8019 non-null   float64
 6   WindGustDir    24514 non-null  object 
 7   WindGustSpeed  24514 non-null  float64
 8   WindDir9am     24398 non-null  object 
 9   WindDir3pm     24746 non-null  object 
 10  WindSpeed9am   25841 non-null  float64
 11  WindSpeed3pm   24854 non-null  float64
 12  Humidity9am    25686 non-null  float64
 13  Humidity3pm    24218 non-null  float64
 14  Pressure9am    23294 non-null  float64
 15  Pressure3pm    23289 non-null  float64
 16  Cloud9am       14674 non-null  float64
 17  Cloud3pm       12665 non-null  float64
 18  Te

# **Logistic Regression**

## Create Preprocessor : ColumnTransformer of numerical and categorical features
- impute missing numerical values with median
- impute missing categorical values with mode (most frequent)

In [0]:
# Take the feature columns from `df` itself (everything except the target).
# `X` is only assigned later, in the Trial 1 section, so reading `X.columns`
# here fails with NameError when the notebook is run top to bottom.
feature_columns = df.drop(columns=['RainTomorrow']).columns

# Numeric features: fill missing values with the column median, then min-max scale.
numerical_features = [col for col in feature_columns if df[col].dtype != 'object']

numeric_transformer = Pipeline(steps=[
          ('imputer', SimpleImputer(strategy='median')),
          ('scaler', MinMaxScaler())
])

# Categorical (object-dtype) features: fill missing values with the most
# frequent category, then one-hot encode.  handle_unknown='ignore' keeps
# transform() from failing on a category that was not present during fit.
categorical_features = [col for col in feature_columns if df[col].dtype == 'object']

categorical_transformer = Pipeline(steps=[
          ('imputer', SimpleImputer(strategy='most_frequent')),
          ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
          ('num', numeric_transformer, numerical_features),
          ('cat', categorical_transformer, categorical_features)
    ]
)

## Create model

In [0]:
# 'sag' is a stochastic solver that shuffles the data while fitting; pinning
# random_state makes the fitted coefficients (and every reported score)
# repeatable across runs.
model = LogisticRegression(solver='sag', max_iter=500, n_jobs = -1, random_state=42)

## Create Pipeline

In [0]:
# Chain the preprocessing and the classifier so both are fitted together
# on whatever data the pipeline is given.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
clf = Pipeline(pipeline_steps)

# **Trial 1** : keep outliers
- cross validation 
- train - Test evaluation

In [51]:
# Display the prepared frame that Trial 1 uses as-is (outliers kept).
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
# Target is the RainTomorrow label; every remaining column is a predictor.
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

**Cross Validation**

In [30]:
# 10-fold cross-validation of the whole pipeline; print the per-fold scores
# followed by their mean and standard deviation.
accuracy = cross_val_score(clf, X, y, cv=10)
for label, value in (
    ('accuracy : ', accuracy),
    ('mean : ', accuracy.mean()),
    ('std : ', accuracy.std()),
):
    print(label, value)

accuracy :  [0.81293303 0.67898383 0.77290223 0.81408776 0.82056219 0.74778591
 0.7058144  0.67963034 0.69772815 0.82595302]
mean :  0.7556380859895485
std :  0.05805783589625687


Note : the mean accuracy of the model is satisfactory.  
The model has low variance across the folds.

**Training and Test Sets**

In [0]:
# Hold out 20% of the rows for testing.  A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Fit the pipeline on the training split and predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute every metric first, then print the report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', test_accuracy)
print('\nconfusion matrix : \n', test_confusion)
print('\nclassification report : \n', test_report)

# Similar train and test scores indicate the model is not overfitting.
print('Training set score : ', train_score)
print('Test set score : ', test_score)





Training and Test Sets result

accuracy score :  0.8433108758421559

confusion matrix : 
 [[3794  186]
 [ 628  587]]

classification report : 
               precision    recall  f1-score   support

          No       0.86      0.95      0.90      3980
         Yes       0.76      0.48      0.59      1215

    accuracy                           0.84      5195
   macro avg       0.81      0.72      0.75      5195
weighted avg       0.83      0.84      0.83      5195

Training set score :  0.84214832282593
Test set score :  0.8433108758421559


Note : The model is not overfitting

# Trial 2 : drop outliers
- drop the outliers from df
- cross validation
- get training / test sets from the dropped-outlier df
- train - test evaluation

In [52]:
# Display df, the frame from which Trial 2 removes outlier rows.
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


**drop the outliers from df**

In [33]:
# Tukey fences per numeric column: a value further than 1.5*IQR outside the
# quartiles counts as an outlier.  numeric_only=True keeps the text columns
# (Location, wind directions, RainToday, RainTomorrow) out of the quantile
# computation; recent pandas versions raise on them instead of skipping them.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

# Compare only the columns that have fences.  Missing values compare as False,
# so a NaN never marks a row as an outlier.
numeric_part = df[Q1.index]
is_outlier = (numeric_part < (Q1 - 1.5*IQR)) | (numeric_part > (Q3 + 1.5*IQR))

# Keep a row only when none of its numeric values is an outlier.
df_drop_outliers = df[~is_outlier.any(axis=1)]
df_drop_outliers

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2479,Albury,17.0,28.1,0.2,,,SE,39.0,SSE,ENE,9.0,11.0,76.0,52.0,1014.6,1012.6,8.0,8.0,20.3,25.7,No,No,2016,1,6
2480,Albury,16.4,28.0,0.0,,,SE,35.0,SE,ESE,20.0,20.0,46.0,31.0,1017.4,1015.7,,2.0,20.9,26.6,No,No,2016,1,7
2481,Albury,14.3,31.7,0.0,,,NNW,24.0,SSE,ENE,11.0,6.0,63.0,24.0,1019.6,1017.2,,8.0,19.4,28.7,No,No,2016,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
# Features and target taken from the outlier-free frame.
y = df_drop_outliers['RainTomorrow']
X = df_drop_outliers.drop(columns=['RainTomorrow'])

**Cross Validation**

In [35]:
# 10-fold cross-validation on the outlier-free data; print the per-fold
# scores followed by their mean and standard deviation.
accuracy = cross_val_score(clf, X, y, cv=10)
for label, value in (
    ('accuracy : ', accuracy),
    ('mean : ', accuracy.mean()),
    ('std : ', accuracy.std()),
):
    print(label, value)

accuracy :  [0.852      0.7065     0.795      0.8365     0.86       0.7425
 0.742      0.7105     0.67483742 0.84892446]
mean :  0.776876188094047
std :  0.06617867329383562


Note : Dropping the outliers improves the model mean accuracy but the variance is higher.

**Training and Test Sets**

In [0]:
# Hold out 20% of the outlier-free rows for testing.  A fixed random_state
# makes the split, and every score computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Fit the pipeline on the training split and predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute every metric first, then print the report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', test_accuracy)
print('\nconfusion matrix : \n', test_confusion)
print('\nclassification report : \n', test_report)

# Similar train and test scores indicate the model is not overfitting.
print('Training set score : ', train_score)
print('Test set score : ', test_score)





Training and Test Sets result

accuracy score :  0.8655

confusion matrix : 
 [[3234  105]
 [ 433  228]]

classification report : 
               precision    recall  f1-score   support

          No       0.88      0.97      0.92      3339
         Yes       0.68      0.34      0.46       661

    accuracy                           0.87      4000
   macro avg       0.78      0.66      0.69      4000
weighted avg       0.85      0.87      0.85      4000

Training set score :  0.8679834979372422
Test set score :  0.8655


Note : The model is not overfitting

# Trial 3 : drop outliers from training set, keep test set outliers
- drop the outliers from training set (need to create training dataset)
- cross validation on training set
- train - test evaluation 

In [38]:
# Display df, the frame that Trial 3 splits into training and test sets.
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
# Target is the RainTomorrow label; every remaining column is a predictor.
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

In [0]:
# Split before touching outliers so the test set keeps its original values.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**drop outliers from training set**

In [0]:
# Build train_df by re-attaching the target to a copy of the training features,
# so that a row removed as an outlier loses its features and its label together.
train_df = X_train.assign(RainTomorrow=y_train)

In [42]:
# Inspect the combined training frame (features with RainTomorrow appended as the last column).
train_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Month,Day,RainTomorrow
96540,Adelaide,10.6,21.4,0.4,,,NNW,24.0,NNE,NNW,9.0,11.0,84.0,43.0,1024.0,1020.9,,,14.5,21.0,No,2016,5,13,No
47387,Canberra,7.1,11.3,0.0,,,NW,43.0,NW,NW,20.0,26.0,76.0,76.0,1004.2,1004.1,8.0,8.0,9.3,9.6,No,2016,6,7,No
75682,Portland,7.0,19.9,0.2,,,E,20.0,N,E,11.0,11.0,94.0,53.0,1023.3,1021.5,,,12.0,19.2,No,2017,4,17,No
8814,Cobar,17.6,32.5,1.6,,,SSE,41.0,E,SSE,6.0,9.0,90.0,36.0,1012.4,1010.2,6.0,2.0,19.9,29.5,Yes,2017,3,5,No
47608,Canberra,18.9,30.3,0.4,,,WNW,56.0,WNW,WNW,30.0,35.0,50.0,17.0,1004.7,1004.6,,,23.9,29.7,No,2017,1,14,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111595,Witchcliffe,15.4,18.0,42.6,,,NNW,59.0,NW,NW,22.0,20.0,82.0,97.0,1008.0,1003.5,,,17.5,17.3,Yes,2016,7,16,Yes
35416,SydneyAirport,13.9,23.2,0.0,2.8,9.6,E,31.0,NW,ENE,9.0,22.0,68.0,59.0,1030.8,1027.5,3.0,3.0,20.1,22.1,No,2016,4,26,No
23233,NorfolkIsland,17.0,21.5,0.4,12.8,,SW,48.0,SW,SW,22.0,22.0,66.0,64.0,1018.2,1017.0,2.0,2.0,20.3,20.3,No,2016,5,30,Yes
11477,CoffsHarbour,13.7,24.3,0.0,,,SE,17.0,S,ESE,9.0,11.0,64.0,44.0,1017.9,1014.5,,,19.8,23.7,No,2016,5,19,No


In [43]:
# Tukey fences computed from the training rows only.  numeric_only=True keeps
# the text columns out of the quantile computation; recent pandas versions
# raise on them instead of skipping them.
Q1 = train_df.quantile(0.25, numeric_only=True)
Q3 = train_df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

# Compare only the columns that have fences.  Missing values compare as False,
# so a NaN never marks a row as an outlier.
numeric_part = train_df[Q1.index]
is_outlier = (numeric_part < (Q1 - 1.5*IQR)) | (numeric_part > (Q3 + 1.5*IQR))

# Keep a training row only when none of its numeric values is an outlier.
train_df_drop_outliers = train_df[~is_outlier.any(axis=1)]
train_df_drop_outliers

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Month,Day,RainTomorrow
96540,Adelaide,10.6,21.4,0.4,,,NNW,24.0,NNE,NNW,9.0,11.0,84.0,43.0,1024.0,1020.9,,,14.5,21.0,No,2016,5,13,No
47387,Canberra,7.1,11.3,0.0,,,NW,43.0,NW,NW,20.0,26.0,76.0,76.0,1004.2,1004.1,8.0,8.0,9.3,9.6,No,2016,6,7,No
75682,Portland,7.0,19.9,0.2,,,E,20.0,N,E,11.0,11.0,94.0,53.0,1023.3,1021.5,,,12.0,19.2,No,2017,4,17,No
8814,Cobar,17.6,32.5,1.6,,,SSE,41.0,E,SSE,6.0,9.0,90.0,36.0,1012.4,1010.2,6.0,2.0,19.9,29.5,Yes,2017,3,5,No
47608,Canberra,18.9,30.3,0.4,,,WNW,56.0,WNW,WNW,30.0,35.0,50.0,17.0,1004.7,1004.6,,,23.9,29.7,No,2017,1,14,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70732,Mildura,12.5,24.2,0.0,8.0,7.2,S,30.0,SSE,SW,7.0,15.0,61.0,29.0,1019.5,1017.1,8.0,7.0,13.2,23.1,No,2016,3,28,No
35416,SydneyAirport,13.9,23.2,0.0,2.8,9.6,E,31.0,NW,ENE,9.0,22.0,68.0,59.0,1030.8,1027.5,3.0,3.0,20.1,22.1,No,2016,4,26,No
23233,NorfolkIsland,17.0,21.5,0.4,12.8,,SW,48.0,SW,SW,22.0,22.0,66.0,64.0,1018.2,1017.0,2.0,2.0,20.3,20.3,No,2016,5,30,Yes
11477,CoffsHarbour,13.7,24.3,0.0,,,SE,17.0,S,ESE,9.0,11.0,64.0,44.0,1017.9,1014.5,,,19.8,23.7,No,2016,5,19,No


In [0]:
# Split the outlier-free training frame back into features and target.
y_train = train_df_drop_outliers['RainTomorrow']
X_train = train_df_drop_outliers.drop(columns=['RainTomorrow'])

**Cross Validation on Training set**

In [45]:
# 10-fold cross-validation on the outlier-free training set only; print the
# per-fold scores followed by their mean and standard deviation.
accuracy = cross_val_score(clf, X_train, y_train, cv=10)
for label, value in (
    ('accuracy : ', accuracy),
    ('mean : ', accuracy.mean()),
    ('std : ', accuracy.std()),
):
    print(label, value)

accuracy :  [0.868125   0.85991245 0.86366479 0.87617261 0.86116323 0.87054409
 0.86804253 0.86616635 0.86053784 0.86679174]
mean :  0.8661120622263916
std :  0.004786317292263473


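Note : The model has low bias and low variance.  
The variance is much lower than in Trials 1 and 2. Part of that drop is likely an effect of shuffling rather than of outlier removal: here cross-validation runs on rows already shuffled by `train_test_split`, whereas Trials 1 and 2 cross-validate the frame in its original, location-ordered row order, so each fold there covers different weather stations.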

**Training and Test sets**

In [46]:
# Fit on the outlier-free training set; predict the untouched test set.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute every metric first, then print the report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', test_accuracy)
print('\nconfusion matrix : \n', test_confusion)
print('\nclassification report : \n', test_report)

# Train vs. test score shows how well the model generalises to data with outliers.
print('Training set score : ', train_score)
print('Test set score : ', test_score)





Training and Test Sets result

accuracy score :  0.826371511068335

confusion matrix : 
 [[3625  386]
 [ 516  668]]

classification report : 
               precision    recall  f1-score   support

          No       0.88      0.90      0.89      4011
         Yes       0.63      0.56      0.60      1184

    accuracy                           0.83      5195
   macro avg       0.75      0.73      0.74      5195
weighted avg       0.82      0.83      0.82      5195

Training set score :  0.8681758489150146
Test set score :  0.826371511068335


Note : Train and test scores are slightly different.  
Test score is lower because I didn't drop the outliers in the test set.  
The model is not overfitting.  

# Trial 4 : drop outliers in training set, cap outliers in test set
- drop the outliers in training set (need to create training dataset)
- cross validation on training set
- cap outliers in the test set using top/bottom values of training set
- train - test evaluation 

In [47]:
# Display df, the frame that Trial 4 splits into training and test sets.
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
# Target is the RainTomorrow label; every remaining column is a predictor.
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

**Separate Training set and test set**

In [0]:
# Split first so the outlier fences are learned from training rows only.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**drop outliers from training set**

In [0]:
# Build train_df by re-attaching the target to a copy of the training features,
# so that a row removed as an outlier loses its features and its label together.
train_df = X_train.assign(RainTomorrow=y_train)

In [55]:
# Inspect the combined training frame (features with RainTomorrow appended as the last column).
train_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Month,Day,RainTomorrow
123702,SalmonGums,13.9,35.2,0.0,,,S,56.0,NNW,SSW,19.0,31.0,35.0,47.0,,,,,30.8,25.6,No,2017,1,22,No
135795,AliceSprings,16.1,39.1,0.0,14.2,,ENE,30.0,E,SE,9.0,13.0,13.0,9.0,1009.6,1006.0,,,29.4,37.0,No,2017,2,16,No
120738,Perth,15.3,35.1,0.0,13.0,13.1,SW,39.0,SSW,SSW,6.0,19.0,34.0,41.0,1008.4,1006.1,0.0,1.0,28.9,29.9,No,2017,1,15,No
120752,Perth,21.4,25.8,6.2,11.2,0.3,ESE,30.0,WSW,S,7.0,2.0,90.0,75.0,1011.3,1010.3,8.0,8.0,21.5,24.7,Yes,2017,1,29,Yes
99739,MountGambier,1.2,19.0,0.0,,,WNW,39.0,NNE,NNW,19.0,26.0,61.0,45.0,1022.9,1017.5,,,10.7,17.9,No,2016,10,24,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75247,Portland,15.0,22.2,0.2,6.2,12.8,SE,39.0,ESE,SE,17.0,24.0,63.0,62.0,1022.1,1019.5,,7.0,17.3,21.1,No,2016,2,5,No
123775,SalmonGums,9.5,31.5,0.0,,,SSW,43.0,N,NNW,24.0,13.0,37.0,18.0,,,,,21.6,29.8,No,2017,4,6,Yes
105626,Woomera,5.3,16.8,0.0,2.8,,NE,61.0,NE,NNE,26.0,31.0,78.0,54.0,1020.9,1016.6,6.0,8.0,8.6,14.9,No,2016,7,9,Yes
81170,Dartmoor,10.5,30.2,0.0,9.6,12.0,SSW,41.0,SSE,SW,2.0,22.0,50.0,36.0,1015.7,1014.2,,,20.0,26.2,No,2016,1,18,No


In [56]:
# Tukey fences computed from the training rows only.  numeric_only=True keeps
# the text columns out of the quantile computation; recent pandas versions
# raise on them instead of skipping them.
Q1 = train_df.quantile(0.25, numeric_only=True)
Q3 = train_df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

# The caps are kept as named Series because the test set is clipped to them later.
lower_cap = Q1 - 1.5*IQR
upper_cap = Q3 + 1.5*IQR

# Compare only the columns that have caps.  Missing values compare as False,
# so a NaN never marks a row as an outlier.
numeric_part = train_df[lower_cap.index]
is_outlier = (numeric_part < lower_cap) | (numeric_part > upper_cap)

# Keep a training row only when none of its numeric values is an outlier.
train_df_drop_outliers = train_df[~is_outlier.any(axis=1)]
train_df_drop_outliers

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Month,Day,RainTomorrow
123702,SalmonGums,13.9,35.2,0.0,,,S,56.0,NNW,SSW,19.0,31.0,35.0,47.0,,,,,30.8,25.6,No,2017,1,22,No
120738,Perth,15.3,35.1,0.0,13.0,13.1,SW,39.0,SSW,SSW,6.0,19.0,34.0,41.0,1008.4,1006.1,0.0,1.0,28.9,29.9,No,2017,1,15,No
99739,MountGambier,1.2,19.0,0.0,,,WNW,39.0,NNE,NNW,19.0,26.0,61.0,45.0,1022.9,1017.5,,,10.7,17.9,No,2016,10,24,No
11588,CoffsHarbour,11.0,,0.0,,,ESE,28.0,WSW,ESE,9.0,11.0,69.0,59.0,1031.1,1028.9,7.0,5.0,19.1,20.9,No,2016,9,7,No
14343,Moree,4.3,20.8,0.0,,,N,30.0,ENE,NNE,17.0,19.0,72.0,46.0,1027.3,1023.6,2.0,,10.4,20.2,No,2016,5,31,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96696,Adelaide,12.5,15.4,1.2,,,SW,44.0,NW,WSW,13.0,17.0,85.0,90.0,1005.4,1005.1,,,12.5,11.7,Yes,2016,10,16,Yes
75247,Portland,15.0,22.2,0.2,6.2,12.8,SE,39.0,ESE,SE,17.0,24.0,63.0,62.0,1022.1,1019.5,,7.0,17.3,21.1,No,2016,2,5,No
123775,SalmonGums,9.5,31.5,0.0,,,SSW,43.0,N,NNW,24.0,13.0,37.0,18.0,,,,,21.6,29.8,No,2017,4,6,Yes
105626,Woomera,5.3,16.8,0.0,2.8,,NE,61.0,NE,NNE,26.0,31.0,78.0,54.0,1020.9,1016.6,6.0,8.0,8.6,14.9,No,2016,7,9,Yes


In [0]:
# Split the outlier-free training frame back into features and target.
y_train = train_df_drop_outliers['RainTomorrow']
X_train = train_df_drop_outliers.drop(columns=['RainTomorrow'])

**Cap outliers in test set**

In [0]:
# Columns that have caps (the index of lower_cap / upper_cap).
features = lower_cap.index.values

# Work on an explicit copy so the assignment below targets a standalone frame
# rather than the object returned by train_test_split.
X_test = X_test.copy()

# Clip every capped column to the [lower_cap, upper_cap] range learned from
# the training set.  clip() aligns the cap Series with the columns (axis=1)
# and leaves missing values untouched, which is what the two-pass np.where
# loop did by hand.
X_test[list(features)] = X_test[list(features)].clip(lower=lower_cap, upper=upper_cap, axis=1)

**Cross Validation on Training set**

In [91]:
# 10-fold cross-validation on the outlier-free training set only; print the
# per-fold scores followed by their mean and standard deviation.
accuracy = cross_val_score(clf, X_train, y_train, cv=10)
for label, value in (
    ('accuracy : ', accuracy),
    ('mean : ', accuracy.mean()),
    ('std : ', accuracy.std()),
):
    print(label, value)

accuracy :  [0.87038197 0.8622417  0.86036318 0.86662492 0.86842105 0.86716792
 0.86716792 0.86654135 0.87656642 0.86403509]
mean :  0.866951152144607
std :  0.0042524867475645985


Note : The model is low bias and low variance.  
The variance is much lower than Trial 1 and 2.

**Training and Test sets**

In [92]:
# Fit on the outlier-free training set; predict the capped test set.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute every metric first, then print the report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', test_accuracy)
print('\nconfusion matrix : \n', test_confusion)
print('\nclassification report : \n', test_report)

# Train vs. test score shows how well the model generalises to the capped test data.
print('Training set score : ', train_score)
print('Test set score : ', test_score)





Training and Test Sets result

accuracy score :  0.8375360923965351

confusion matrix : 
 [[3804  198]
 [ 646  547]]

classification report : 
               precision    recall  f1-score   support

          No       0.85      0.95      0.90      4002
         Yes       0.73      0.46      0.56      1193

    accuracy                           0.84      5195
   macro avg       0.79      0.70      0.73      5195
weighted avg       0.83      0.84      0.82      5195

Training set score :  0.8698947632172388
Test set score :  0.8375360923965351


Note : Train and test scores are slightly different.  
The test score is lower, possibly because dropping the outliers from the training set discards some information.  
The model is not overfitting.  

# Trial 4b : cap outliers in the whole dataset
- cap the outliers in df
- cross validation
- train - test evaluation

In [0]:
# Work on a copy so that capping leaves df itself untouched.
temp_df = df.copy()

In [94]:
# Display the working copy of df before its outliers are capped.
temp_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
# cap the outliers
# Caps are the Tukey fences of each numeric column, computed here over the
# whole dataset.  numeric_only=True keeps the text columns out of the quantile
# computation; recent pandas versions raise on them instead of skipping them.
Q1 = temp_df.quantile(0.25, numeric_only=True)
Q3 = temp_df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

lower_cap = Q1 - 1.5*IQR
upper_cap = Q3 + 1.5*IQR

# Columns that have caps (the index of lower_cap / upper_cap).
features = lower_cap.index.values

# Clip every capped column to its [lower_cap, upper_cap] range.  clip() aligns
# the cap Series with the columns (axis=1) and leaves missing values untouched,
# which is what the two-pass np.where loop did by hand.
temp_df[list(features)] = temp_df[list(features)].clip(lower=lower_cap, upper=upper_cap, axis=1)

In [97]:
# Check the frame's (rows, columns) after capping.
temp_df.shape

(25974, 25)

In [0]:
# Features and target taken from the capped copy of the dataset.
y = temp_df['RainTomorrow']
X = temp_df.drop(columns=['RainTomorrow'])

**Cross Validation**

In [101]:
# 10-fold cross-validation on the capped dataset; print the per-fold scores
# followed by their mean and standard deviation.
accuracy = cross_val_score(clf, X, y, cv=10)
for label, value in (
    ('accuracy : ', accuracy),
    ('mean : ', accuracy.mean()),
    ('std : ', accuracy.std()),
):
    print(label, value)

accuracy :  [0.81370285 0.6812933  0.76943803 0.81100847 0.82826338 0.75009626
 0.70350404 0.67963034 0.69734309 0.82787832]
mean :  0.7562158089084254
std :  0.05879530806045305


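Note : low bias and low variance; the fold scores are nearly identical to Trial 1 (mean ≈ 0.756, std ≈ 0.059), so capping on the whole dataset barely changes the cross-validation result.  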

**Training and Test Sets**

In [102]:
# Hold out 20% of the capped rows for testing.  A fixed random_state makes the
# split, and every score printed below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training split and predict the held-out rows.
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', accuracy_score(y_test,y_pred))
print('\nconfusion matrix : \n', confusion_matrix(y_test, y_pred))
print('\nclassification report : \n', classification_report(y_test,y_pred))

# Similar train and test scores indicate the model is not overfitting.
print('\nTraining set score : ',clf.score(X_train,y_train))
print('Test set score : ',clf.score(X_test,y_test))





Training and Test Sets result

accuracy score :  0.8483156881616939

confusion matrix : 
 [[3822  225]
 [ 563  585]]

classification report : 
               precision    recall  f1-score   support

          No       0.87      0.94      0.91      4047
         Yes       0.72      0.51      0.60      1148

    accuracy                           0.85      5195
   macro avg       0.80      0.73      0.75      5195
weighted avg       0.84      0.85      0.84      5195


Training set score :  0.841715193223928
Test set score :  0.8483156881616939


Note : the model is not overfitting

# Trial 5 : cap the outliers in training set, cap the outlier in test set
- cap the outliers in X_train
- cap the outliers in X_test using the upper_cap and lower_cap of X_train
- cross validation on training set
- train - test evaluation

In [103]:
# Display df, the frame that Trial 5 splits into training and test sets.
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
# Target is the RainTomorrow label; every remaining column is a predictor.
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

In [0]:
# Split first so the caps are learned from training rows only.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Cap the outlier in X_train**

In [0]:
# Caps are the Tukey fences of each numeric training column.  numeric_only=True
# keeps the text columns out of the quantile computation; recent pandas
# versions raise on them instead of skipping them.
Q1 = X_train.quantile(0.25, numeric_only=True)
Q3 = X_train.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

lower_cap = Q1 - 1.5*IQR
upper_cap = Q3 + 1.5*IQR

# Columns that have caps (the index of lower_cap / upper_cap).
features = lower_cap.index.values

# Work on an explicit copy so the assignment below targets a standalone frame
# rather than the object returned by train_test_split.
X_train = X_train.copy()

# Clip every capped column to its [lower_cap, upper_cap] range.  This replaces
# the two-pass np.where loop, whose upper-cap pass read `X_train.loc[:feature]`
# (a row slice) where the column selection `X_train.loc[:, feature]` was meant.
# clip() aligns the cap Series with the columns (axis=1) and leaves missing
# values untouched.
X_train[list(features)] = X_train[list(features)].clip(lower=lower_cap, upper=upper_cap, axis=1)

In [0]:
# cap the outliers in X_test
# The caps come from X_train, so nothing about the test distribution is used
# to choose them.

# Work on an explicit copy so the assignment below targets a standalone frame
# rather than the object returned by train_test_split.
X_test = X_test.copy()

# clip() aligns the cap Series with the columns (axis=1) and leaves missing
# values untouched, which is what the two-pass np.where loop did by hand.
X_test[list(features)] = X_test[list(features)].clip(lower=lower_cap, upper=upper_cap, axis=1)

**Cross Validation on Training set**

In [108]:
# 10-fold cross-validation on the capped training set only; print the
# per-fold scores followed by their mean and standard deviation.
accuracy = cross_val_score(clf, X_train, y_train, cv=10)
for label, value in (
    ('accuracy : ', accuracy),
    ('mean : ', accuracy.mean()),
    ('std : ', accuracy.std()),
):
    print(label, value)

accuracy :  [0.84456208 0.84841193 0.84119346 0.84023099 0.85033686 0.84359962
 0.84167469 0.84552454 0.83782483 0.84159846]
mean :  0.8434957458353857
std :  0.0036230931869390237


Note : low bias and very low variance 

**Training and Test sets**

In [109]:
# Fit on the capped training set; predict the capped test set.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute every metric first, then print the report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', test_accuracy)
print('\nconfusion matrix : \n', test_confusion)
print('\nclassification report : \n', test_report)

# Similar train and test scores indicate the model is not overfitting.
print('Training set score : ', train_score)
print('Test set score : ', test_score)





Training and Test Sets result

accuracy score :  0.831953801732435

confusion matrix : 
 [[3736  237]
 [ 636  586]]

classification report : 
               precision    recall  f1-score   support

          No       0.85      0.94      0.90      3973
         Yes       0.71      0.48      0.57      1222

    accuracy                           0.83      5195
   macro avg       0.78      0.71      0.73      5195
weighted avg       0.82      0.83      0.82      5195

Training set score :  0.8448433514606093
Test set score :  0.831953801732435


Note : There is no overfitting.