<a href="https://colab.research.google.com/github/WoradeeKongthong/raining_tomorrow_classification/blob/master/02_Raining_LogisticRegression_plus_FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# libraries
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# **Data Set**

In [0]:
df = pd.read_csv('https://raw.githubusercontent.com/WoradeeKongthong/raining_tomorrow_classification/master/weatherAUS.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

In [0]:
# drop RISK_MM column (Recommendation from data description in Kaggle)
df.drop(['RISK_MM'], axis = 1, inplace = True)

# Extract Year, Month, Day from Date column
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# drop Date column
df.drop(['Date'], axis = 1, inplace = True)

# select year 2016-2017 to train the model
df = df[df['Year'] > 2015]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25974 entries, 2474 to 142192
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       25974 non-null  object 
 1   MinTemp        25814 non-null  float64
 2   MaxTemp        25881 non-null  float64
 3   Rainfall       25710 non-null  float64
 4   Evaporation    10704 non-null  float64
 5   Sunshine       8019 non-null   float64
 6   WindGustDir    24514 non-null  object 
 7   WindGustSpeed  24514 non-null  float64
 8   WindDir9am     24398 non-null  object 
 9   WindDir3pm     24746 non-null  object 
 10  WindSpeed9am   25841 non-null  float64
 11  WindSpeed3pm   24854 non-null  float64
 12  Humidity9am    25686 non-null  float64
 13  Humidity3pm    24218 non-null  float64
 14  Pressure9am    23294 non-null  float64
 15  Pressure3pm    23289 non-null  float64
 16  Cloud9am       14674 non-null  float64
 17  Cloud3pm       12665 non-null  float64
 18  Te

In [0]:
X = df.drop(['RainTomorrow'], axis=1)
y = df['RainTomorrow']

# **Trial 1**
- keep the outliers
- impute the missing categorical values with mode
- impute the missing numerical values with median

**Create Preprocessor : ColumnTransformer of numerical and categorical features**

In [0]:
numerical_features = [x for x in X.columns if df[x].dtype != 'object']

numeric_transformer = Pipeline(steps=[
          ('imputer', SimpleImputer(strategy='median')),
          ('scaler', MinMaxScaler())
])

categorical_features = [x for x in X.columns if df[x].dtype == 'object']

categorical_transformer = Pipeline(steps=[
          ('imputer', SimpleImputer(strategy='most_frequent')),
          ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
          ('num', numeric_transformer, numerical_features),
          ('cat', categorical_transformer, categorical_features)
    ]
)

**Create model**

In [0]:
model = LogisticRegression(solver='sag', max_iter=500, n_jobs = -1)

**Create Pipeline**

In [0]:
clf = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('model', model)
])

**Cross Validation**

In [10]:
accuracy = cross_val_score(clf,X,y,cv=10)
print('accuracy : ', accuracy)
print('mean : ', accuracy.mean())
print('std : ', accuracy.std())

accuracy :  [0.81293303 0.67898383 0.77290223 0.81408776 0.82056219 0.74778591
 0.7058144  0.67963034 0.69772815 0.82595302]
mean :  0.7556380859895485
std :  0.05805783589625687


Note : the model has low bias and low variance

**Training and Test Sets**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', accuracy_score(y_test,y_pred))
print('\nconfusion matrix : \n', confusion_matrix(y_test, y_pred))
print('\nclassification report : \n', classification_report(y_test,y_pred))

print('Training set score : ',clf.score(X_train,y_train))
print('Test set score : ',clf.score(X_test,y_test))





Training and Test Sets result

accuracy score :  0.8442733397497594

confusion matrix : 
 [[3811  209]
 [ 600  575]]

classification report : 
               precision    recall  f1-score   support

          No       0.86      0.95      0.90      4020
         Yes       0.73      0.49      0.59      1175

    accuracy                           0.84      5195
   macro avg       0.80      0.72      0.75      5195
weighted avg       0.83      0.84      0.83      5195

Training set score :  0.8408970595312575
Test set score :  0.8442733397497594


Note : The model is not overfitting

# Trial 2 
- drop the outliers from df
- impute the missing categorical values with mode
- impute the missing numerical values with median

In [12]:
# drop the outliers from df

Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df_drop_outliers = df[~((df < (Q1 - 1.5*IQR)) | (df > (Q3 + 1.5*IQR))).any(axis=1)]
df_drop_outliers

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2479,Albury,17.0,28.1,0.2,,,SE,39.0,SSE,ENE,9.0,11.0,76.0,52.0,1014.6,1012.6,8.0,8.0,20.3,25.7,No,No,2016,1,6
2480,Albury,16.4,28.0,0.0,,,SE,35.0,SE,ESE,20.0,20.0,46.0,31.0,1017.4,1015.7,,2.0,20.9,26.6,No,No,2016,1,7
2481,Albury,14.3,31.7,0.0,,,NNW,24.0,SSE,ENE,11.0,6.0,63.0,24.0,1019.6,1017.2,,8.0,19.4,28.7,No,No,2016,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
X = df_drop_outliers.drop(['RainTomorrow'], axis=1)
y = df_drop_outliers['RainTomorrow']

**Cross Validation**

In [14]:
accuracy = cross_val_score(clf,X,y,cv=10)
print('accuracy : ', accuracy)
print('mean : ', accuracy.mean())
print('std : ', accuracy.std())

accuracy :  [0.852      0.7065     0.795      0.8365     0.86       0.7425
 0.742      0.7105     0.67483742 0.84892446]
mean :  0.776876188094047
std :  0.06617867329383562


Note : Dropping the outliers improves the model mean accuracy but the variance is higher.

**Training and Test Sets**

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', accuracy_score(y_test,y_pred))
print('\nconfusion matrix : \n', confusion_matrix(y_test, y_pred))
print('\nclassification report : \n', classification_report(y_test,y_pred))

print('Training set score : ',clf.score(X_train,y_train))
print('Test set score : ',clf.score(X_test,y_test))





Training and Test Sets result

accuracy score :  0.86325

confusion matrix : 
 [[3230   89]
 [ 458  223]]

classification report : 
               precision    recall  f1-score   support

          No       0.88      0.97      0.92      3319
         Yes       0.71      0.33      0.45       681

    accuracy                           0.86      4000
   macro avg       0.80      0.65      0.69      4000
weighted avg       0.85      0.86      0.84      4000

Training set score :  0.8687335916989624
Test set score :  0.86325


Note : The model is not overfitting

# Trial 3
- drop the outliers from X_train
- impute the missing categorical values with mode
- impute the missing numerical values with median

In [16]:
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
X = df.drop(['RainTomorrow'], axis=1)
y = df['RainTomorrow']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [0]:
# create temp_df to combine X_train and y_train and cap the outliers
temp_df = X_train
temp_df['RainTomorrow'] = y_train

In [20]:
temp_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Month,Day,RainTomorrow
17637,Newcastle,,21.2,3.8,,,,,SE,,9.0,,76.0,,,,7.0,,17.8,,Yes,2017,5,4,No
70902,Mildura,8.1,17.0,5.4,0.8,7.0,NW,78.0,NW,WNW,26.0,43.0,77.0,44.0,1009.1,1006.7,6.0,7.0,11.4,15.6,Yes,2016,9,14,Yes
132697,Launceston,7.0,21.7,0.0,,,N,28.0,N,NW,2.0,15.0,59.0,41.0,,,,,14.9,20.7,No,2016,12,11,No
41339,Williamtown,6.6,22.8,0.2,,,WNW,37.0,NW,WNW,7.0,22.0,100.0,53.0,1014.6,1011.1,8.0,,12.5,22.5,No,2017,5,28,No
70650,Mildura,17.0,33.5,0.0,8.0,13.0,S,39.0,ENE,S,9.0,15.0,58.0,25.0,1015.2,1013.6,1.0,2.0,23.4,32.0,No,2016,1,6,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32316,Sydney,21.6,25.2,0.0,4.6,0.0,,,E,E,13.0,13.0,73.0,64.0,1014.5,1013.3,8.0,8.0,23.2,24.7,No,2016,1,25,No
139017,Darwin,23.5,30.6,33.8,3.4,5.6,NW,31.0,NW,NNW,9.0,15.0,92.0,73.0,1008.9,1005.9,7.0,7.0,26.5,30.1,Yes,2017,3,21,Yes
138829,Darwin,23.8,33.1,0.0,7.2,9.2,NW,33.0,NNE,NNW,15.0,17.0,66.0,49.0,1013.7,1009.2,2.0,3.0,28.9,31.8,No,2016,9,14,No
108789,Albany,14.6,19.0,0.0,6.4,,,,S,,15.0,,65.0,,1019.7,1018.9,6.0,,16.5,,No,2016,11,29,No


In [21]:
Q1 = temp_df.quantile(0.25)
Q3 = temp_df.quantile(0.75)
IQR = Q3 - Q1
temp_df_drop_outliers = temp_df[~((temp_df < (Q1 - 1.5*IQR)) | (temp_df > (Q3 + 1.5*IQR))).any(axis=1)]
temp_df_drop_outliers

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Month,Day,RainTomorrow
132697,Launceston,7.0,21.7,0.0,,,N,28.0,N,NW,2.0,15.0,59.0,41.0,,,,,14.9,20.7,No,2016,12,11,No
41339,Williamtown,6.6,22.8,0.2,,,WNW,37.0,NW,WNW,7.0,22.0,100.0,53.0,1014.6,1011.1,8.0,,12.5,22.5,No,2017,5,28,No
70650,Mildura,17.0,33.5,0.0,8.0,13.0,S,39.0,ENE,S,9.0,15.0,58.0,25.0,1015.2,1013.6,1.0,2.0,23.4,32.0,No,2016,1,6,No
138946,Darwin,28.2,32.4,0.0,8.0,9.6,NW,37.0,WNW,WNW,19.0,24.0,73.0,67.0,1004.6,1001.8,7.0,7.0,29.8,31.9,No,2017,1,9,No
65249,MelbourneAirport,13.4,18.8,0.8,9.0,4.5,S,52.0,SW,S,28.0,31.0,58.0,42.0,1015.6,1016.8,7.0,7.0,14.9,18.2,No,2016,2,16,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135896,AliceSprings,2.2,19.2,0.0,,,ESE,48.0,SE,ESE,15.0,26.0,58.0,31.0,1029.2,1025.3,,,11.5,18.6,No,2017,5,31,No
32316,Sydney,21.6,25.2,0.0,4.6,0.0,,,E,E,13.0,13.0,73.0,64.0,1014.5,1013.3,8.0,8.0,23.2,24.7,No,2016,1,25,No
138829,Darwin,23.8,33.1,0.0,7.2,9.2,NW,33.0,NNE,NNW,15.0,17.0,66.0,49.0,1013.7,1009.2,2.0,3.0,28.9,31.8,No,2016,9,14,No
108789,Albany,14.6,19.0,0.0,6.4,,,,S,,15.0,,65.0,,1019.7,1018.9,6.0,,16.5,,No,2016,11,29,No


In [0]:
# retrive X_train and y_train from temp_df
X_train = temp_df_drop_outliers.drop(['RainTomorrow'], axis=1)
y_train = temp_df_drop_outliers['RainTomorrow']

**Cross Validation on Training set**

In [23]:
accuracy = cross_val_score(clf,X_train,y_train,cv=10)
print('accuracy : ', accuracy)
print('mean : ', accuracy.mean())
print('std : ', accuracy.std())

accuracy :  [0.87492183 0.86116323 0.86053784 0.8630394  0.85419274 0.86357947
 0.86733417 0.87296621 0.86545682 0.86858573]
mean :  0.8651777432860493
std :  0.005808546303785941


Note : The model is low bias and low variance.  
The variance is much lower than Trial 1 and 2.

**Training and Test sets**

In [24]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', accuracy_score(y_test,y_pred))
print('\nconfusion matrix : \n', confusion_matrix(y_test, y_pred))
print('\nclassification report : \n', classification_report(y_test,y_pred))

print('Training set score : ',clf.score(X_train,y_train))
print('Test set score : ',clf.score(X_test,y_test))





Training and Test Sets result

accuracy score :  0.8202117420596727

confusion matrix : 
 [[3590  439]
 [ 495  671]]

classification report : 
               precision    recall  f1-score   support

          No       0.88      0.89      0.88      4029
         Yes       0.60      0.58      0.59      1166

    accuracy                           0.82      5195
   macro avg       0.74      0.73      0.74      5195
weighted avg       0.82      0.82      0.82      5195

Training set score :  0.8678053053053053
Test set score :  0.8202117420596727


Note : Train and test scores are slightly different.  
Test score is lower because I didn't drop the outliers in the test set.  
Think of them as real data to be predicted.  
The model is not overfitting.  
But let's try adding regularization.

In [25]:
# C=inverse regularization strength

# using the same preprocessor
model = LogisticRegression(solver='sag', max_iter=500, n_jobs = -1, C=0.01)
clf = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('model', model)
])

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', accuracy_score(y_test,y_pred))
print('\nconfusion matrix : \n', confusion_matrix(y_test, y_pred))
print('\nclassification report : \n', classification_report(y_test,y_pred))

print('Training set score : ',clf.score(X_train,y_train))
print('Test set score : ',clf.score(X_test,y_test))



Training and Test Sets result

accuracy score :  0.8102021174205968

confusion matrix : 
 [[3770  259]
 [ 727  439]]

classification report : 
               precision    recall  f1-score   support

          No       0.84      0.94      0.88      4029
         Yes       0.63      0.38      0.47      1166

    accuracy                           0.81      5195
   macro avg       0.73      0.66      0.68      5195
weighted avg       0.79      0.81      0.79      5195

Training set score :  0.8434684684684685
Test set score :  0.8102021174205968


Note : adding regularization makes the scores lower.

# Trial 4
- cap the outliers in df
- impute the missing categorical values with mode
- impute the missing numerical values with median

In [26]:
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016,1,1
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016,1,2
2476,Albury,18.4,23.1,2.2,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016,1,3
2477,Albury,17.3,23.7,15.6,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016,1,4
2478,Albury,15.5,22.9,6.8,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017,6,20
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017,6,21
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017,6,22
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017,6,23


In [0]:
# cap the outliers
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
lower_cap = Q1 - 1.5*IQR
upper_cap = Q3 + 1.5*IQR

features = lower_cap.index.values

for feature in features :
  df[feature] = np.where(df[feature]<lower_cap[feature],lower_cap[feature], df[feature])
  df[feature] = np.where(df[feature]>upper_cap[feature],upper_cap[feature], df[feature])

In [28]:
df.shape

(25974, 25)

In [29]:
df_drop_outliers = df[~((df < (Q1 - 1.5*IQR)) | (df > (Q3 + 1.5*IQR))).any(axis=1)]
df_drop_outliers.shape

(25974, 25)

Note : no outliers are dropped, cap outliers successfully

In [30]:
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
2474,Albury,20.4,37.6,0.0,,,ENE,54.0,,ESE,0.0,7.0,46.0,17.0,1013.4,1009.2,7.0,3.0,26.1,36.7,No,No,2016.0,1.0,1.0
2475,Albury,20.9,33.6,0.4,,,SSE,50.0,SSE,SE,9.0,17.0,54.0,30.0,1011.1,1008.4,8.0,8.0,24.8,31.7,No,Yes,2016.0,1.0,2.0
2476,Albury,18.4,23.1,2.0,,,ENE,48.0,ESE,ENE,11.0,39.0,62.0,67.0,1014.0,1014.8,8.0,8.0,21.8,19.5,Yes,Yes,2016.0,1.0,3.0
2477,Albury,17.3,23.7,2.0,,,SSE,39.0,SE,SSE,9.0,17.0,74.0,65.0,1017.9,1016.5,8.0,8.0,19.2,21.6,Yes,Yes,2016.0,1.0,4.0
2478,Albury,15.5,22.9,2.0,,,ENE,31.0,SE,SSE,6.0,9.0,92.0,63.0,1016.3,1013.9,8.0,8.0,17.2,22.2,Yes,No,2016.0,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,E,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No,2017.0,6.0,20.0
142189,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No,2017.0,6.0,21.0
142190,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No,2017.0,6.0,22.0
142191,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No,2017.0,6.0,23.0


In [0]:
X = df.drop(['RainTomorrow'], axis=1)
y = df['RainTomorrow']

**Cross Validation**

In [32]:
accuracy = cross_val_score(clf,X,y,cv=10)
print('accuracy : ', accuracy)
print('mean : ', accuracy.mean())
print('std : ', accuracy.std())

accuracy :  [0.80138568 0.79907621 0.81524249 0.81947652 0.82094725 0.82980362
 0.81709665 0.82133231 0.80246438 0.8290335 ]
mean :  0.8155858613435351
std :  0.010529359794717126


Note :  low bias and low variance.  
The result is better than keeping or dropping the outliers in df

**Training and Test Sets**

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', accuracy_score(y_test,y_pred))
print('\nconfusion matrix : \n', confusion_matrix(y_test, y_pred))
print('\nclassification report : \n', classification_report(y_test,y_pred))

print('\nTraining set score : ',clf.score(X_train,y_train))
print('Test set score : ',clf.score(X_test,y_test))



Training and Test Sets result

accuracy score :  0.8186717998075073

confusion matrix : 
 [[3860  141]
 [ 801  393]]

classification report : 
               precision    recall  f1-score   support

          No       0.83      0.96      0.89      4001
         Yes       0.74      0.33      0.45      1194

    accuracy                           0.82      5195
   macro avg       0.78      0.65      0.67      5195
weighted avg       0.81      0.82      0.79      5195


Training set score :  0.826363155108523
Test set score :  0.8186717998075073


Note : the scores are slightly different

# Trial 5
- cap the outliers in X_train
- cap the outliers in X_test using the upper_cap and lower_cap of X_train
- impute the missing categorical values with mode
- impute the missing numerical values with median

In [0]:
df = pd.read_csv('https://raw.githubusercontent.com/WoradeeKongthong/raining_tomorrow_classification/master/weatherAUS.csv')

In [0]:
# drop RISK_MM column (Recommendation from data description in Kaggle)
df.drop(['RISK_MM'], axis = 1, inplace = True)

# Extract Year, Month, Day from Date column
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# drop Date column
df.drop(['Date'], axis = 1, inplace = True)

# select year 2016-2017 to train the model
df = df[df['Year'] > 2015]

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25974 entries, 2474 to 142192
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       25974 non-null  object 
 1   MinTemp        25814 non-null  float64
 2   MaxTemp        25881 non-null  float64
 3   Rainfall       25710 non-null  float64
 4   Evaporation    10704 non-null  float64
 5   Sunshine       8019 non-null   float64
 6   WindGustDir    24514 non-null  object 
 7   WindGustSpeed  24514 non-null  float64
 8   WindDir9am     24398 non-null  object 
 9   WindDir3pm     24746 non-null  object 
 10  WindSpeed9am   25841 non-null  float64
 11  WindSpeed3pm   24854 non-null  float64
 12  Humidity9am    25686 non-null  float64
 13  Humidity3pm    24218 non-null  float64
 14  Pressure9am    23294 non-null  float64
 15  Pressure3pm    23289 non-null  float64
 16  Cloud9am       14674 non-null  float64
 17  Cloud3pm       12665 non-null  float64
 18  Te

In [0]:
X = df.drop(['RainTomorrow'], axis=1)
y = df['RainTomorrow']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [0]:
# cap the outliers in X_train

Q1 = X_train.quantile(0.25)
Q3 = X_train.quantile(0.75)
IQR = Q3 - Q1

lower_cap = Q1 - 1.5*IQR
upper_cap = Q3 + 1.5*IQR

features = lower_cap.index.values

for feature in features :
  X_train[feature] = np.where(X_train[feature]<lower_cap[feature],lower_cap[feature], X_train[feature])
  X_train[feature] = np.where(X_train[feature]>upper_cap[feature],upper_cap[feature], X_train[feature])

In [0]:
# retrive y_train and X_train from temp_df
X_train = temp_df.drop(['RainTomorrow'], axis=1)
y_train = temp_df['RainTomorrow']

**Cross Validation on Training set**

In [51]:
accuracy = cross_val_score(clf,X_train,y_train,cv=10)
print('accuracy : ', accuracy)
print('mean : ', accuracy.mean())
print('std : ', accuracy.std())

accuracy :  [0.81183831 0.81568816 0.82242541 0.81761309 0.81953802 0.81857555
 0.81857555 0.8238691  0.81857555 0.82089552]
mean :  0.8187594271184979
std :  0.0032166355532423593


Note : low bias and very low variance 

**Training and Test sets**

In [0]:
# first, cap the outliers in Tes set 

for feature in features :
  X_test[feature] = np.where(X_test[feature]<lower_cap[feature],lower_cap[feature], X_test[feature])
  X_test[feature] = np.where(X_test[feature]>upper_cap[feature],upper_cap[feature], X_test[feature])

In [53]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('\n\nTraining and Test Sets result')
print('\naccuracy score : ', accuracy_score(y_test,y_pred))
print('\nconfusion matrix : \n', confusion_matrix(y_test, y_pred))
print('\nclassification report : \n', classification_report(y_test,y_pred))

print('Training set score : ',clf.score(X_train,y_train))
print('Test set score : ',clf.score(X_test,y_test))



Training and Test Sets result

accuracy score :  0.8186717998075073

confusion matrix : 
 [[3886  128]
 [ 814  367]]

classification report : 
               precision    recall  f1-score   support

          No       0.83      0.97      0.89      4014
         Yes       0.74      0.31      0.44      1181

    accuracy                           0.82      5195
   macro avg       0.78      0.64      0.66      5195
weighted avg       0.81      0.82      0.79      5195

Training set score :  0.8207805957938303
Test set score :  0.8186717998075073


Note : There is no overfitting.
The training and test score are closer than dropping the outliers from X_train.