# Machine learning case study: rain in Australia dataset
### Anna Przybyłowska, Gurbet Gungoren, Wojciech Tomczak, Witold Taisner

## 1. Used libraries

In [1]:
# importing libraries
from functools import wraps
from time import perf_counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

In [2]:
############ MODELS ############################

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#################################################

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter

import warnings
warnings.filterwarnings('ignore')

## 2. Importing preprocessed data

We decided to use one-hot encoding, as it performed slightly better than label-encoding. Here we are using our preprocessed dataset, without outliers.

In [3]:
df = pd.read_csv("data/rain_outliers_removed.csv")

# encoding RainTomorrow and RainToday as binary values
# The mapped column is assigned back to the frame: `df.RainToday.replace(..., inplace=True)`
# mutates an intermediate Series, which leaves `df` unchanged under pandas Copy-on-Write.
# Note: any value other than "Yes"/"No" becomes NaN after the mapping.
binary_encoding = {"Yes": 1, "No": 0}
for binary_column in ("RainToday", "RainTomorrow"):
    df[binary_column] = df[binary_column].map(binary_encoding)

#################### ONE-HOT ENCODING #########################################

# columns to be changed to one-hot encoding
categorical_columns = ["Season", "WindGustDir", "WindDir9am", "WindDir3pm"]

# creating one-hot encoding
# dtype is pinned to uint8 so the indicator columns are numeric 0/1 on every pandas version
# (recent releases default to bool, which turns `df.to_numpy()` into an object array)
df = pd.get_dummies(df, columns = categorical_columns, dtype = "uint8")

#################### LABEL ENCODER ############################################

# le = LabelEncoder()

# df["Season"] = le.fit_transform(df["Season"])
# df["WindDir9am"]= le.fit_transform(df["WindDir9am"])
# df["WindDir3pm"]= le.fit_transform(df["WindDir3pm"])
# df["WindGustDir"] = le.fit_transform(df["WindGustDir"])

In [4]:
# preview the first five rows of the encoded frame
df.head(n=5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,7.0,10.5,44,20,24,71,22,...,0,0,0,0,0,0,0,0,1,0
1,7.4,25.1,0.0,7.6,13.3,44,4,22,44,25,...,0,0,0,0,0,0,0,0,0,1
2,12.9,25.7,0.0,11.4,10.0,46,19,26,38,30,...,0,0,0,0,0,0,0,0,0,1
3,9.2,28.0,0.0,6.8,12.2,24,11,9,45,16,...,0,0,0,0,0,0,0,0,0,0
4,17.5,32.3,1.0,8.0,5.0,41,7,20,82,33,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# df.shape is (rows, columns); unpack it straight into the message
print('Dataset dimensions:\nRows:{}\nColunms:{}'.format(*df.shape))

Dataset dimensions:
Rows:123710
Colunms:70


We split the data into a training set (80%) and a test set (20%).

In [6]:
# features: every column except the target; target: RainTomorrow (both as NumPy arrays)
X = df.drop(columns=['RainTomorrow']).to_numpy()
y = df['RainTomorrow'].to_numpy()

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

## 3. Used metrics:
Apart from standard accuracy, we decided to also evaluate our models based on [balanced accuracy](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html), which is better suited for imbalanced data, as well as F1, precision and recall.

## 4. Tested approaches:
We decided to test 4 approaches, as our data is quite imbalanced. All the results were obtained for the preprocessed dataset without outliers.

### 4.1 Training on preprocessed only dataset

- SVC:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%;
- KNN:
    - StandardScaler: accuracy: 80%, balanced accuracy: 64%;
- MLP Classifier:
    - StandardScaler: accuracy: 83%, balanced accuracy: 72%;
- Decision Tree Classifier:
    - StandardScaler: accuracy: 78%, balanced accuracy: 69%;
- **Random Forest Classifier**:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%; // 100 estimators
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%; // 300 estimators

- AdaBoost Classifier:
    - StandardScaler: accuracy: 84%, balanced accuracy: 71%; // 100 estimators
    - StandardScaler: accuracy: 84%, balanced accuracy: 72%; // 300 estimators
- **XGBoost Classifier**:
    - StandardScaler: accuracy: 85%, balanced accuracy: 74%;
- **Logistic Regression**:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72.7%;
- **LGBMClassifier**:
    - StandardScaler: accuracy: 85.3%, balanced accuracy: 73.5%;
    
We identified four classifiers, marked in bold, which achieved the best results on this data.

### 4.2 Oversampling

Second of tested approaches focused only on previously found classifiers: *XGBoost*, *Random Forest*, *LGBM* and some of the more promising: *AdaBoost*. 
Oversampling adds minority-class instances until both classes are equally represented; the implementation in section 6.3 uses SMOTE, which generates synthetic samples rather than exact copies.

- **Random Forest Classifier**:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 74.3%; 
- LGBMClassifier:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 73.5%;
- XGBoost:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 73.8%;
- AdaBoost:
    - StandardScaler: accuracy: 81.4%, balanced accuracy: 75%;
    
Easy to notice, oversampling did not improve our results in a significant way.

### 4.3 Undersampling
Similarly to oversampling, we focused on *XGBoost*, *Random Forest*, *LGBM* and *AdaBoost*. Undersampling is a method of removing similar instances of majority class, so that its cardinality is the same as minority class.

- **Random Forest Classifier**:
    - StandardScaler: accuracy: 79%, balanced accuracy 79%;
- LGBMClassifier:
    - StandardScaler: accuracy: 79.1%, balanced accuracy: 78.9%;
- **XGBoost**:
    - StandardScaler: accuracy: 79%, balanced accuracy: 79%;
- AdaBoost:
    - StandardScaler: accuracy: 78.2%, balanced accuracy: 77.4%;
    
Undersampling decreases overall accuracy, but at the same time increases balanced accuracy (better prediction of minority class)

### 4.4 Feature selection, grid search, class weights
In addition we tested some other approaches:       

#### 4.4.1 Grid search 
##### (only for Random Forest and XGBoost)
We tried to determine the best parameters for Random Forest and XGBoost with the GridSearchCV method combined with K-best feature selection. It found the following best parameters and corresponding results:

- RandomForest: 
    - 'criterion': 'entropy',
    - 'max_depth': None,
    - 'min_samples_leaf': 4,
    - 'n_estimators': 100,
    - 'feature_selection k': 20
    - StandardScaler: **accuracy: 85%, balanced accuracy: 72%**;

- XGBoost Classifier:
    - 'colsample_bytree': 0.6,
    - 'gamma': 0,
    - 'max_depth': 8,
    - 'min_child_weight': 2,
    - 'subsample': 1.0,
    - 'feature_selection k': 40
    - StandardScaler: **accuracy: 85.2%, balanced accuracy: 73.6%**;
    
Grid searches managed to get similar results as training on dataset only.

#### 4.4.2 Class weights
In this approach we assigned weights to each class, so that the model would maximize its objective function with minority class having bigger weight.
- **XGBoost Classifier**:
    - StandardScaler: accuracy: 81%, balanced accuracy 79.4%;
- Random Forest Classifier:
    - StandardScaler: accuracy: 85.1%, balanced accuracy 71.5%;
- LGBMClassifier:
    - StandardScaler: accuracy: 80.1%, balanced accuracy: 79.6%;
    
Assigning weights to classes seems to produce the best trade-off between overall accuracy and balanced accuracy.

## 5. Choosing the best approaches
  We decided to choose 3 classifiers which gave us the best results and check how they perform on the original dataset and the dataset with outliers.

  - LGBM Classifier
  - Random Forest
  - XGBoost Classifier


We will test the results with 4 different approaches:
  - just simple training on training set
  - oversampling
  - undersampling
  - class weights

Since grid search is quite expensive when it comes to time, we decided not to check how it performs on the other datasets.


## 6. Implementation

Dataset with outliers (one-hot encoding)

In [7]:
df1 = pd.read_csv("data/rain_with_outliers.csv")

# encoding RainTomorrow and RainToday as binary values
# The mapped column is assigned back to the frame: `df1.RainToday.replace(..., inplace=True)`
# mutates an intermediate Series, which leaves `df1` unchanged under pandas Copy-on-Write.
# Note: any value other than "Yes"/"No" becomes NaN after the mapping.
binary_encoding = {"Yes": 1, "No": 0}
for binary_column in ("RainToday", "RainTomorrow"):
    df1[binary_column] = df1[binary_column].map(binary_encoding)

#################### ONE-HOT ENCODING #########################################

# columns to be changed to one-hot encoding
categorical_columns = ["Season", "WindGustDir", "WindDir9am", "WindDir3pm"]

# creating one-hot encoding
# dtype is pinned to uint8 so the indicator columns are numeric 0/1 on every pandas version
# (recent releases default to bool, which turns `df1.to_numpy()` into an object array)
df1 = pd.get_dummies(df1, columns = categorical_columns, dtype = "uint8")

#################### LABEL ENCODER ############################################

# le = LabelEncoder()

# df["Season"] = le.fit_transform(df["Season"])
# df["WindDir9am"]= le.fit_transform(df["WindDir9am"])
# df["WindDir3pm"]= le.fit_transform(df["WindDir3pm"])
# df["WindGustDir"] = le.fit_transform(df["WindGustDir"])

###############################################################################

# df1.shape is (rows, columns); unpack it straight into the message
print('Dataset dimensions:\nRows:{}\nColunms:{}'.format(*df1.shape))

Dataset dimensions:
Rows:123710
Colunms:70


In [8]:
# features: every column except the target; target: RainTomorrow (both as NumPy arrays)
X1 = df1.drop(columns=['RainTomorrow']).to_numpy()
y1 = df1['RainTomorrow'].to_numpy()

# 80/20 train/test split with a fixed seed
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.20, random_state=42
)

Original dataset - we have to remove all NA values to make all the classifiers work, but since we don't want to perform a lot of processing, we will use label encoding instead of one-hot encoding.

In [9]:
df2 = pd.read_csv("data/weatherAUS_original_data.csv")

# Drop rows whose cloud readings exceed 8 (treated here as invalid values -
# presumably because cloud cover is recorded on a 0-8 scale; confirm against the data source),
# remove the Date column and discard every row that still has a missing value.
# `.copy()` gives an independent frame so the column assignments below act on `df2` itself.
invalid_cloud = (df2['Cloud3pm'] > 8) | (df2['Cloud9am'] > 8)
df2 = df2.loc[~invalid_cloud].drop(columns="Date").dropna().copy()

# encoding RainTomorrow and RainToday as binary values
# The mapped column is assigned back to the frame: `df2.RainToday.replace(..., inplace=True)`
# mutates an intermediate Series, which leaves `df2` unchanged under pandas Copy-on-Write.
binary_encoding = {"Yes": 1, "No": 0}
for binary_column in ("RainToday", "RainTomorrow"):
    df2[binary_column] = df2[binary_column].map(binary_encoding)

#################### LABEL ENCODER ############################################

# the same encoder instance is re-fitted for every column, in this order,
# so after the loop `le` holds the classes of the last column only
le = LabelEncoder()
for label_column in ("Location", "WindDir9am", "WindDir3pm", "WindGustDir"):
    df2[label_column] = le.fit_transform(df2[label_column])

###############################################################################

print(f'Dataset dimensions:\nRows:{df2.shape[0]}\nColunms:{df2.shape[1]}')

Dataset dimensions:
Rows:56419
Colunms:22


In [10]:
# features: every column except the target; target: RainTomorrow (both as NumPy arrays)
X2 = df2.drop(columns=['RainTomorrow']).to_numpy()
y2 = df2['RainTomorrow'].to_numpy()

# 80/20 train/test split with a fixed seed
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.20, random_state=42
)

LGBM Classifier

In [11]:
# Baseline LightGBM: standardise the features, then fit a default LGBMClassifier.
# verbose=True makes the pipeline print the time spent in each step.
pipe_LGBM = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', LGBMClassifier()),
    ],
    verbose=True,
)

# Same pipeline, with class_weight='balanced' passed to the classifier.
pipe_LGBM_weight = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', LGBMClassifier(class_weight='balanced')),
    ],
    verbose=True,
)

Random Forest

In [12]:
# Baseline random forest: standardise the features, then fit a RandomForestClassifier.
# The forest is seeded (same seed as the train/test split) so that re-running the
# notebook reproduces the reported scores; verbose=True prints the time spent in each step.
pipe_RF = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42))

    ],
    verbose=True
    )

# Same pipeline, with class_weight='balanced' passed to the classifier.
pipe_RF_weight = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))

    ],
    verbose=True
    )

XGBoost Classifier

In [13]:
# Baseline XGBoost: standardise the features, then fit a default XGBClassifier.
# verbose=True makes the pipeline print the time spent in each step.
pipe_XGB = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier()),
    ],
    verbose=True,
)
def pipe_XGB_weight(y_train):
  """Build a scaler + XGBoost pipeline that re-weights the positive class.

  ``scale_pos_weight`` is set to (number of 0 labels) / (number of 1 labels)
  counted in ``y_train``, so a new pipeline has to be built for every training set.

  Parameters
  ----------
  y_train : array-like of 0/1 labels
      Training labels used only to compute the class ratio.

  Returns
  -------
  Pipeline
      Unfitted pipeline with the steps 'scaler' and 'classifier'.

  Raises
  ------
  ValueError
      If ``y_train`` contains no label equal to 1 (the ratio would be undefined).
  """
  labels = np.asarray(y_train)
  # vectorised counts instead of the builtin sum(), which walks the array element by element
  n_negative = np.count_nonzero(labels == 0)
  n_positive = np.count_nonzero(labels == 1)
  if n_positive == 0:
    raise ValueError("y_train contains no positive (1) samples; cannot compute scale_pos_weight")

  return Pipeline(
      [
          ('scaler', StandardScaler()),
          ('classifier', XGBClassifier(use_label_encoder=False,
            scale_pos_weight = n_negative / n_positive))

      ],
      verbose=True
      )

Training the models - functions:

In [14]:
def count_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    The duration is measured with ``perf_counter`` around the call and printed as
    ``Total time: <seconds>`` after ``func`` returns; the return value of ``func``
    is passed through unchanged. Nothing is printed if ``func`` raises.
    """
    @wraps(func)  # keep the wrapped function's __name__ / __doc__ instead of exposing `_count`
    def _count(*args, **kwargs):
        start = perf_counter()
        result = func(*args, **kwargs)
        stop = perf_counter()
        print(f'Total time: {stop - start}')
        return result
    return _count

In [15]:
@count_time
def training(pipe, X_train, y_train, X_test):
  """Fit ``pipe`` on the training data and return its predictions for ``X_test``.

  ``pipe`` is fitted in place, so the object passed in is left in its fitted state.
  The call is timed by the ``count_time`` decorator.
  """
  pipe.fit(X_train, y_train)
  return pipe.predict(X_test)

In [16]:
def report(y_predicted, y_test, data):
  """Print the evaluation of one experiment and return its two accuracy scores.

  Prints, in order: the ``data`` label, the classification report, the accuracy,
  the balanced accuracy and the confusion matrix.

  Parameters
  ----------
  y_predicted : predicted labels
  y_test : true labels
  data : label printed before the classification report

  Returns
  -------
  tuple
      ``(accuracy, balanced accuracy)``, both expressed as percentages.
  """
  summary = metrics.classification_report(y_test, y_predicted)
  print(data)
  print(summary)

  # both scores are scaled from fractions to percentages
  accuracy_pct = metrics.accuracy_score(y_test, y_predicted) * 100
  balanced_accuracy_pct = metrics.balanced_accuracy_score(y_test, y_predicted) * 100
  print("Accuracy of the model is:", accuracy_pct, "%")
  print("Balanced accuracy of the model is:", balanced_accuracy_pct, "%")

  print(metrics.confusion_matrix(y_test, y_predicted))
  return accuracy_pct, balanced_accuracy_pct

In [17]:
# experiment label -> (accuracy %, balanced accuracy %) as returned by report()
results = dict()

### 6.1 Simple approach

LGBM (simple)

In [18]:
# LightGBM, plain training, dataset with outliers removed
results['LGBM(simple, no outliers)'] = report(
    y_predicted=training(pipe=pipe_LGBM, X_train=X_train, y_train=y_train, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.6s
Total time: 0.7581365599999117
Rain outliers removed
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     19284
           1       0.74      0.52      0.61      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.74      0.76     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.37304987470698 %
Balanced accuracy of the model is: 73.54623289506358 %
[[18264  1020]
 [ 2599  2859]]


In [19]:
# LightGBM, plain training, dataset with outliers
results['LGBM(simple, outliers)'] = report(
    y_predicted=training(pipe=pipe_LGBM, X_train=X1_train, y_train=y1_train, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.7s
Total time: 0.7685166970004502
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     19284
           1       0.74      0.53      0.62      5458

    accuracy                           0.86     24742
   macro avg       0.81      0.74      0.77     24742
weighted avg       0.85      0.86      0.85     24742

Accuracy of the model is: 85.58321881820386 %
Balanced accuracy of the model is: 74.07514219767569 %
[[18256  1028]
 [ 2539  2919]]


In [20]:
# LightGBM, plain training, original dataset
results['LGBM(simple, original)'] = report(
    y_predicted=training(pipe=pipe_LGBM, X_train=X2_train, y_train=y2_train, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.3s
Total time: 0.36958353800037
original data
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      8793
           1       0.76      0.57      0.65      2491

    accuracy                           0.86     11284
   macro avg       0.82      0.76      0.78     11284
weighted avg       0.86      0.86      0.86     11284

Accuracy of the model is: 86.41439205955335 %
Balanced accuracy of the model is: 75.81797598843612 %
[[8335  458]
 [1075 1416]]


XGBoost (simple)

In [21]:
# XGBoost, plain training, dataset with outliers removed
results['XGB(simple, no outliers)'] = report(
    y_predicted=training(pipe=pipe_XGB, X_train=X_train, y_train=y_train, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   3.9s
Total time: 3.991296522999619
Rain outliers removed
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.73      0.54      0.62      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.74      0.76     24742
weighted avg       0.84      0.85      0.85     24742

Accuracy of the model is: 85.33667448064021 %
Balanced accuracy of the model is: 74.03520474162258 %
[[18177  1107]
 [ 2521  2937]]


In [22]:
# XGBoost, plain training, dataset with outliers
results['XGB(simple, outliers)'] = report(
    y_predicted=training(pipe=pipe_XGB, X_train=X1_train, y_train=y1_train, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   4.0s
Total time: 4.086445136999828
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     19284
           1       0.74      0.55      0.63      5458

    accuracy                           0.86     24742
   macro avg       0.81      0.75      0.77     24742
weighted avg       0.85      0.86      0.85     24742

Accuracy of the model is: 85.79742947215261 %
Balanced accuracy of the model is: 74.67889278227227 %
[[18238  1046]
 [ 2468  2990]]


In [23]:
# XGBoost, plain training, original dataset
results['XGB(simple, original)'] = report(
    y_predicted=training(pipe=pipe_XGB, X_train=X2_train, y_train=y2_train, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.1s
Total time: 1.0986063290001766
original data
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      8793
           1       0.74      0.57      0.65      2491

    accuracy                           0.86     11284
   macro avg       0.81      0.76      0.78     11284
weighted avg       0.85      0.86      0.86     11284

Accuracy of the model is: 86.19283941864587 %
Balanced accuracy of the model is: 75.90599215289451 %
[[8294  499]
 [1059 1432]]


Random Forest (simple)

In [24]:
# Random forest, plain training, dataset with outliers removed
results['RF(simple, no outliers)'] = report(
    y_predicted=training(pipe=pipe_RF, X_train=X_train, y_train=y_train, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  17.2s
Total time: 17.85263612000017
Rain outliers removed
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     19284
           1       0.76      0.49      0.60      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.72      0.75     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.32859105973648 %
Balanced accuracy of the model is: 72.26978391456275 %
[[18443   841]
 [ 2789  2669]]


In [25]:
# Random forest, plain training, dataset with outliers
results['RF(simple, outliers)'] = report(
    y_predicted=training(pipe=pipe_RF, X_train=X1_train, y_train=y1_train, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  17.2s
Total time: 17.76227094999922
Rain with outliers
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     19284
           1       0.76      0.50      0.60      5458

    accuracy                           0.85     24742
   macro avg       0.82      0.73      0.76     24742
weighted avg       0.85      0.85      0.84     24742

Accuracy of the model is: 85.49834289871474 %
Balanced accuracy of the model is: 72.7070845693185 %
[[18435   849]
 [ 2739  2719]]


In [26]:
# Random forest, plain training, original dataset
results['RF(simple, original)'] = report(
    y_predicted=training(pipe=pipe_RF, X_train=X2_train, y_train=y2_train, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   7.9s
Total time: 8.144166994999978
original data
              precision    recall  f1-score   support

           0       0.88      0.96      0.91      8793
           1       0.77      0.53      0.63      2491

    accuracy                           0.86     11284
   macro avg       0.83      0.74      0.77     11284
weighted avg       0.85      0.86      0.85     11284

Accuracy of the model is: 86.13966678482808 %
Balanced accuracy of the model is: 74.11679201956338 %
[[8410  383]
 [1181 1310]]


### 6.2 Class weight approach

LGBM (class weight)

In [27]:
# LightGBM with balanced class weights, dataset with outliers removed
results['LGBM(weights, no outliers)'] = report(
    y_predicted=training(pipe=pipe_LGBM_weight, X_train=X_train, y_train=y_train, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.8s
Total time: 0.8909794690007402
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.81      0.86     19284
           1       0.53      0.79      0.64      5458

    accuracy                           0.80     24742
   macro avg       0.73      0.80      0.75     24742
weighted avg       0.84      0.80      0.81     24742

Accuracy of the model is: 80.0945760245736 %
Balanced accuracy of the model is: 79.56544171405956 %
[[15526  3758]
 [ 1167  4291]]


In [28]:
# LightGBM with balanced class weights, dataset with outliers
results['LGBM(weights, outliers)'] = report(
    y_predicted=training(pipe=pipe_LGBM_weight, X_train=X1_train, y_train=y1_train, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.6s
Total time: 0.7623002319996885
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.81      0.87     19284
           1       0.54      0.79      0.64      5458

    accuracy                           0.81     24742
   macro avg       0.74      0.80      0.76     24742
weighted avg       0.85      0.81      0.82     24742

Accuracy of the model is: 80.69274917144936 %
Balanced accuracy of the model is: 80.17906098798701 %
[[15639  3645]
 [ 1132  4326]]


In [29]:
# LightGBM with balanced class weights, original dataset
results['LGBM(weights, original)'] = report(
    y_predicted=training(pipe=pipe_LGBM_weight, X_train=X2_train, y_train=y2_train, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.3s
Total time: 0.37492626200037193
original data
              precision    recall  f1-score   support

           0       0.94      0.82      0.88      8793
           1       0.57      0.81      0.67      2491

    accuracy                           0.82     11284
   macro avg       0.75      0.82      0.77     11284
weighted avg       0.86      0.82      0.83     11284

Accuracy of the model is: 82.05423608649414 %
Balanced accuracy of the model is: 81.66623317159105 %
[[7242 1551]
 [ 474 2017]]


XGBoost (class weight)

In [30]:
# XGBoost with the positive class re-weighted from y_train, dataset with outliers removed
results['XGB(weights, no outliers)'] = report(
    y_predicted=training(pipe=pipe_XGB_weight(y_train), X_train=X_train, y_train=y_train, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   3.7s
Total time: 3.821975509999902
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.82      0.87     19284
           1       0.55      0.77      0.64      5458

    accuracy                           0.81     24742
   macro avg       0.74      0.79      0.75     24742
weighted avg       0.84      0.81      0.82     24742

Accuracy of the model is: 80.9514186403686 %
Balanced accuracy of the model is: 79.359795406213 %
[[15853  3431]
 [ 1282  4176]]


In [31]:
# XGBoost with the positive class re-weighted from y1_train, dataset with outliers
results['XGB(weights, outliers)'] = report(
    y_predicted=training(pipe=pipe_XGB_weight(y1_train), X_train=X1_train, y_train=y1_train, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   3.4s
Total time: 3.548556625000856
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.82      0.87     19284
           1       0.55      0.77      0.64      5458

    accuracy                           0.81     24742
   macro avg       0.74      0.80      0.76     24742
weighted avg       0.84      0.81      0.82     24742

Accuracy of the model is: 81.24242179290275 %
Balanced accuracy of the model is: 79.73695187682385 %
[[15896  3388]
 [ 1253  4205]]


In [32]:
# XGBoost with the positive class re-weighted from y2_train, original dataset
results['XGB(weights, original)'] = report(
    y_predicted=training(pipe=pipe_XGB_weight(y2_train), X_train=X2_train, y_train=y2_train, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.9s
Total time: 0.911173652000798
original data
              precision    recall  f1-score   support

           0       0.93      0.85      0.89      8793
           1       0.59      0.77      0.67      2491

    accuracy                           0.83     11284
   macro avg       0.76      0.81      0.78     11284
weighted avg       0.85      0.83      0.84     11284

Accuracy of the model is: 83.18858560794044 %
Balanced accuracy of the model is: 80.82601973039483 %
[[7479 1314]
 [ 583 1908]]


Random Forest (class weight)

In [33]:
# Random forest with balanced class weights, dataset with outliers removed
results['RF(weights, no outliers)'] = report(
    y_predicted=training(pipe=pipe_RF_weight, X_train=X_train, y_train=y_train, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  15.9s
Total time: 16.537422116000016
Rain outliers removed
              precision    recall  f1-score   support

           0       0.86      0.96      0.91     19284
           1       0.76      0.47      0.58      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.71      0.74     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.03354619675046 %
Balanced accuracy of the model is: 71.37772736673536 %
[[18477   807]
 [ 2896  2562]]


In [34]:
# Random forest with balanced class weights, dataset with outliers
results['RF(weights, outliers)'] = report(
    y_predicted=training(pipe=pipe_RF_weight, X_train=X1_train, y_train=y1_train, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  16.1s
Total time: 16.661805219999223
Rain with outliers
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     19284
           1       0.77      0.48      0.59      5458

    accuracy                           0.85     24742
   macro avg       0.82      0.72      0.75     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.25179856115108 %
Balanced accuracy of the model is: 71.83300581484039 %
[[18483   801]
 [ 2848  2610]]


In [35]:
# Random forest with balanced class weights, original dataset
results['RF(weights, original)'] = report(
    y_predicted=training(pipe=pipe_RF_weight, X_train=X2_train, y_train=y2_train, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   7.5s
Total time: 7.737518910998915
original data
              precision    recall  f1-score   support

           0       0.87      0.96      0.92      8793
           1       0.79      0.51      0.62      2491

    accuracy                           0.86     11284
   macro avg       0.83      0.74      0.77     11284
weighted avg       0.85      0.86      0.85     11284

Accuracy of the model is: 86.13080467919177 %
Balanced accuracy of the model is: 73.52128301028476 %
[[8450  343]
 [1222 1269]]


### 6.3 Oversampling

In [36]:
# SMOTE is stochastic; it is seeded (same seed as the train/test split) so that the
# resampled training sets - and the scores computed from them - are reproducible.
# Only the training sets are resampled; the test sets keep their original class balance.
oversample = SMOTE(random_state=42)
X_traino, y_traino = oversample.fit_resample(X_train, y_train)
X1_traino, y1_traino = oversample.fit_resample(X1_train, y1_train)
X2_traino, y2_traino = oversample.fit_resample(X2_train, y2_train)

LGBM (oversampling)

In [37]:
# LightGBM trained on the oversampled training set, dataset with outliers removed
results['LGBM(oversampling, no outliers)'] = report(
    y_predicted=training(pipe=pipe_LGBM, X_train=X_traino, y_train=y_traino, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.6s
Total time: 1.7330779549993167
Rain outliers removed
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.73      0.53      0.61      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.74      0.76     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.21946487753618 %
Balanced accuracy of the model is: 73.68415512048068 %
[[18190  1094]
 [ 2563  2895]]


In [38]:
# LightGBM trained on the oversampled training set, dataset with outliers
results['LGBM(oversampling, outliers)'] = report(
    y_predicted=training(pipe=pipe_LGBM, X_train=X1_traino, y_train=y1_traino, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.5s
Total time: 1.6542812419993425
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.73      0.55      0.62      5458

    accuracy                           0.86     24742
   macro avg       0.81      0.74      0.77     24742
weighted avg       0.85      0.86      0.85     24742

Accuracy of the model is: 85.54280171368524 %
Balanced accuracy of the model is: 74.44329647021107 %
[[18186  1098]
 [ 2479  2979]]


In [39]:
# LightGBM trained on the oversampled training set, original dataset
results['LGBM(oversampling, original)'] = report(
    y_predicted=training(pipe=pipe_LGBM, X_train=X2_traino, y_train=y2_traino, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
Total time: 0.467367954999645
original data
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      8793
           1       0.73      0.60      0.66      2491

    accuracy                           0.86     11284
   macro avg       0.81      0.77      0.79     11284
weighted avg       0.86      0.86      0.86     11284

Accuracy of the model is: 86.17511520737328 %
Balanced accuracy of the model is: 76.75777459379182 %
[[8232  561]
 [ 999 1492]]


XGBoost (oversampling)

In [40]:
# XGBoost trained on the oversampled training set, dataset with outliers removed
results['XGB(oversampling, no outliers)'] = report(
    y_predicted=training(pipe=pipe_XGB, X_train=X_traino, y_train=y_traino, X_test=X_test),
    y_test=y_test,
    data="Rain outliers removed",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   7.6s
Total time: 7.770929865000653
Rain outliers removed
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.72      0.54      0.62      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.74      0.76     24742
weighted avg       0.84      0.85      0.85     24742

Accuracy of the model is: 85.33263277018834 %
Balanced accuracy of the model is: 74.19024491983397 %
[[18152  1132]
 [ 2497  2961]]


In [41]:
# XGBoost trained on the oversampled training set, dataset with outliers
results['XGB(oversampling, outliers)'] = report(
    y_predicted=training(pipe=pipe_XGB, X_train=X1_traino, y_train=y1_traino, X_test=X1_test),
    y_test=y1_test,
    data="Rain with outliers",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   8.1s
Total time: 8.275296193000031
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.73      0.55      0.63      5458

    accuracy                           0.86     24742
   macro avg       0.81      0.75      0.77     24742
weighted avg       0.85      0.86      0.85     24742

Accuracy of the model is: 85.668094737693 %
Balanced accuracy of the model is: 74.81923586264412 %
[[18172  1112]
 [ 2434  3024]]


In [42]:
# XGBoost trained on the oversampled training set, original dataset
results['XGB(oversampling, original)'] = report(
    y_predicted=training(pipe=pipe_XGB, X_train=X2_traino, y_train=y2_traino, X_test=X2_test),
    y_test=y2_test,
    data="original data",
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   2.6s
Total time: 2.6454684479995194
original data
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      8793
           1       0.73      0.59      0.65      2491

    accuracy                           0.86     11284
   macro avg       0.81      0.77      0.78     11284
weighted avg       0.85      0.86      0.86     11284

Accuracy of the model is: 86.13080467919177 %
Balanced accuracy of the model is: 76.57109778073806 %
[[8238  555]
 [1010 1481]]


Random Forest (oversampling)

In [43]:
# Random Forest pipeline, oversampled training split, evaluated on the outliers-removed test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
rf_over_no_outliers_pred = training(pipe_RF, X_traino, y_traino, X_test)
results['RF(oversampling, no outliers)'] = report(
    rf_over_no_outliers_pred, y_test, "Rain outliers removed"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  37.0s
Total time: 37.714844272000846
Rain outliers removed
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.71      0.55      0.62      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.74      0.76     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.13054724759517 %
Balanced accuracy of the model is: 74.20510068438368 %
[[18080  1204]
 [ 2475  2983]]


In [44]:
# Random Forest pipeline, oversampled training split, evaluated on the with-outliers test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
rf_over_outliers_pred = training(pipe_RF, X1_traino, y1_traino, X1_test)
results['RF(oversampling, outliers)'] = report(
    rf_over_outliers_pred, y1_test, "Rain with outliers"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  38.6s
Total time: 39.30132561800019
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.72      0.56      0.63      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.75      0.77     24742
weighted avg       0.85      0.85      0.85     24742

Accuracy of the model is: 85.47813434645542 %
Balanced accuracy of the model is: 74.92068659702966 %
[[18091  1193]
 [ 2400  3058]]


In [45]:
# Random Forest pipeline, oversampled training split, evaluated on the original-data test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
rf_over_original_pred = training(pipe_RF, X2_traino, y2_traino, X2_test)
results['RF(oversampling, original)'] = report(
    rf_over_original_pred, y2_test, "original data"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  17.4s
Total time: 17.638599116999103
original data
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      8793
           1       0.68      0.66      0.67      2491

    accuracy                           0.86     11284
   macro avg       0.79      0.78      0.79     11284
weighted avg       0.85      0.86      0.86     11284

Accuracy of the model is: 85.62566465792271 %
Balanced accuracy of the model is: 78.43363596722568 %
[[8029  764]
 [ 858 1633]]


### 6.4 Undersampling

In [46]:
# One sampler, seeded for reproducibility, is applied in turn to each of the three
# training splits (outliers removed, with outliers, original data).
rus = RandomUnderSampler(random_state=0)
(X_trainu, y_trainu), (X1_trainu, y1_trainu), (X2_trainu, y2_trainu) = (
    rus.fit_resample(features, labels)
    for features, labels in (
        (X_train, y_train),
        (X1_train, y1_train),
        (X2_train, y2_train),
    )
)

LGBM (undersampling)

In [47]:
# LightGBM pipeline, undersampled training split, evaluated on the outliers-removed test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
lgbm_under_no_outliers_pred = training(pipe_LGBM, X_trainu, y_trainu, X_test)
results['LGBM(undersampling, no outliers)'] = report(
    lgbm_under_no_outliers_pred, y_test, "Rain outliers removed"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
Total time: 0.5006803970009059
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.79      0.86     19284
           1       0.52      0.79      0.63      5458

    accuracy                           0.79     24742
   macro avg       0.73      0.79      0.74     24742
weighted avg       0.84      0.79      0.81     24742

Accuracy of the model is: 79.26198367148977 %
Balanced accuracy of the model is: 79.30060987302939 %
[[15279  4005]
 [ 1126  4332]]


In [48]:
# LightGBM pipeline, undersampled training split, evaluated on the with-outliers test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
lgbm_under_outliers_pred = training(pipe_LGBM, X1_trainu, y1_trainu, X1_test)
results['LGBM(undersampling, outliers)'] = report(
    lgbm_under_outliers_pred, y1_test, "Rain with outliers"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
Total time: 0.45632513700002164
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     19284
           1       0.53      0.80      0.64      5458

    accuracy                           0.80     24742
   macro avg       0.73      0.80      0.75     24742
weighted avg       0.84      0.80      0.81     24742

Accuracy of the model is: 79.9733247110177 %
Balanced accuracy of the model is: 79.88173952527984 %
[[15436  3848]
 [ 1107  4351]]


In [49]:
# LightGBM pipeline, undersampled training split, evaluated on the original-data test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
lgbm_under_original_pred = training(pipe_LGBM, X2_trainu, y2_trainu, X2_test)
results['LGBM(undersampling, original)'] = report(
    lgbm_under_original_pred, y2_test, "original data"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.2s
Total time: 0.22802988399962487
original data
              precision    recall  f1-score   support

           0       0.94      0.81      0.87      8793
           1       0.55      0.83      0.66      2491

    accuracy                           0.81     11284
   macro avg       0.74      0.82      0.76     11284
weighted avg       0.86      0.81      0.82     11284

Accuracy of the model is: 81.07054236086493 %
Balanced accuracy of the model is: 81.61048602445204 %
[[7091 1702]
 [ 434 2057]]


XGBoost (undersampling)

In [50]:
# XGBoost pipeline, undersampled training split, evaluated on the outliers-removed test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
xgb_under_no_outliers_pred = training(pipe_XGB, X_trainu, y_trainu, X_test)
results['XGB(undersampling, no outliers)'] = report(
    xgb_under_no_outliers_pred, y_test, "Rain outliers removed"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.6s
Total time: 1.6796609889988758
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.79      0.86     19284
           1       0.52      0.79      0.63      5458

    accuracy                           0.79     24742
   macro avg       0.73      0.79      0.74     24742
weighted avg       0.84      0.79      0.81     24742

Accuracy of the model is: 79.43173551046803 %
Balanced accuracy of the model is: 79.45548473383023 %
[[15314  3970]
 [ 1119  4339]]


In [51]:
# XGBoost pipeline, undersampled training split, evaluated on the with-outliers test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
xgb_under_outliers_pred = training(pipe_XGB, X1_trainu, y1_trainu, X1_test)
results['XGB(undersampling, outliers)'] = report(
    xgb_under_outliers_pred, y1_test, "Rain with outliers"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.7s
Total time: 1.7715486109991616
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     19284
           1       0.53      0.80      0.63      5458

    accuracy                           0.80     24742
   macro avg       0.73      0.80      0.75     24742
weighted avg       0.84      0.80      0.81     24742

Accuracy of the model is: 79.68232155848355 %
Balanced accuracy of the model is: 79.76073668174438 %
[[15354  3930]
 [ 1097  4361]]


In [52]:
# XGBoost pipeline, undersampled training split, evaluated on the original-data test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
xgb_under_original_pred = training(pipe_XGB, X2_trainu, y2_trainu, X2_test)
results['XGB(undersampling, original)'] = report(
    xgb_under_original_pred, y2_test, "original data"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.5s
Total time: 0.5177567480004654
original data
              precision    recall  f1-score   support

           0       0.94      0.81      0.87      8793
           1       0.54      0.81      0.65      2491

    accuracy                           0.81     11284
   macro avg       0.74      0.81      0.76     11284
weighted avg       0.85      0.81      0.82     11284

Accuracy of the model is: 80.80467919177596 %
Balanced accuracy of the model is: 80.9795463829002 %
[[7093 1700]
 [ 466 2025]]


Random Forest (undersampling)

In [53]:
# Random Forest pipeline, undersampled training split, evaluated on the outliers-removed test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
rf_under_no_outliers_pred = training(pipe_RF, X_trainu, y_trainu, X_test)
results['RF(undersampling, no outliers)'] = report(
    rf_under_no_outliers_pred, y_test, "Rain outliers removed"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   6.6s
Total time: 7.097285071000442
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.79      0.85     19284
           1       0.52      0.78      0.62      5458

    accuracy                           0.79     24742
   macro avg       0.72      0.79      0.74     24742
weighted avg       0.84      0.79      0.80     24742

Accuracy of the model is: 79.02352275482984 %
Balanced accuracy of the model is: 78.80609514271605 %
[[15272  4012]
 [ 1178  4280]]


In [54]:
# Random Forest pipeline, undersampled training split, evaluated on the with-outliers test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
rf_under_outliers_pred = training(pipe_RF, X1_trainu, y1_trainu, X1_test)
results['RF(undersampling, outliers)'] = report(
    rf_under_outliers_pred, y1_test, "Rain with outliers"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   6.6s
Total time: 7.141857091000929
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     19284
           1       0.53      0.79      0.63      5458

    accuracy                           0.80     24742
   macro avg       0.73      0.80      0.75     24742
weighted avg       0.84      0.80      0.81     24742

Accuracy of the model is: 79.8641985288174 %
Balanced accuracy of the model is: 79.68037246810685 %
[[15429  3855]
 [ 1127  4331]]


In [55]:
# Random Forest pipeline, undersampled training split, evaluated on the original-data test split.
# NOTE(review): training() is defined earlier in the notebook; presumably it fits the
# pipeline and returns test-set predictions - confirm against its definition.
rf_under_original_pred = training(pipe_RF, X2_trainu, y2_trainu, X2_test)
results['RF(undersampling, original)'] = report(
    rf_under_original_pred, y2_test, "original data"
)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   3.3s
Total time: 3.4563412530005735
original data
              precision    recall  f1-score   support

           0       0.94      0.80      0.87      8793
           1       0.54      0.82      0.65      2491

    accuracy                           0.81     11284
   macro avg       0.74      0.81      0.76     11284
weighted avg       0.85      0.81      0.82     11284

Accuracy of the model is: 80.68060971286778 %
Balanced accuracy of the model is: 81.10134046538882 %
[[7065 1728]
 [ 452 2039]]


## 7. Results - comparison

### 7.1 Data

In [56]:
# Three independent score tables, one per dataset variant. Each one is laid out as
#   {metric: {balancing strategy: {model tag: score}}}
# and starts with empty innermost dicts that the loop below fills in.
outliers, no_outliers, original = (
    {
        metric: {
            strategy: {}
            for strategy in ('simple', 'oversampling', 'undersampling', 'weights')
        }
        for metric in ('acc', 'balanced')
    }
    for _ in range(3)
)

# File every entry of `results` into the right table, using substrings of its key.
for run_name in results:
    run_scores = results[run_name]
    # Element 0 is stored under 'acc' and element 1 under 'balanced'.
    acc_score, balanced_score = run_scores[0], run_scores[1]

    # Model tag: the first matching marker wins; anything else is filed under 'rf'.
    model_tag = next(
        (tag for marker, tag in (('LGBM', 'lgbm'), ('XGB', 'xgb')) if marker in run_name),
        'rf',
    )

    # Balancing strategy: keys naming none of the three are filed under 'weights'.
    strategy_tag = next(
        (
            candidate
            for candidate in ('simple', 'oversampling', 'undersampling')
            if candidate in run_name
        ),
        'weights',
    )

    # 'no outliers' must be tested first because it contains the substring 'outliers'.
    if 'no outliers' in run_name:
        score_table = no_outliers
    elif 'outliers' in run_name:
        score_table = outliers
    else:
        score_table = original

    score_table['acc'][strategy_tag][model_tag] = acc_score
    score_table['balanced'][strategy_tag][model_tag] = balanced_score

In [57]:
def transform(dict):
    """Flatten a two-level nested mapping into a DataFrame.

    Each (outer key, inner key) pair becomes one row, and the mapping stored
    under that pair supplies the row's column values.

    Parameters
    ----------
    dict : mapping of mapping of mapping
        For example ``{'acc': {'simple': {'lgbm': 85.5, 'xgb': 85.8}}}``.
        The parameter name shadows the builtin; it is kept unchanged so that
        existing callers are not affected.

    Returns
    -------
    pandas.DataFrame
        Built with ``DataFrame.from_dict(..., orient='index')`` from the
        flattened ``{(outer, inner): innermost mapping}`` dictionary.
    """
    flattened = {}
    for outer_key, inner_mapping in dict.items():
        for inner_key, row_values in inner_mapping.items():
            flattened[(outer_key, inner_key)] = row_values
    return pd.DataFrame.from_dict(flattened, orient='index')

### 7.2 Accuracy and balanced accuracy

Rain dataset with outliers

In [58]:
# Accuracy and balanced accuracy per strategy and model, dataset with outliers.
outliers_table = transform(outliers)
display(outliers_table)

Unnamed: 0,Unnamed: 1,lgbm,xgb,rf
acc,simple,85.583219,85.797429,85.498343
acc,oversampling,85.542802,85.668095,85.478134
acc,undersampling,79.973325,79.682322,79.864199
acc,weights,80.692749,81.242422,85.251799
balanced,simple,74.075142,74.678893,72.707085
balanced,oversampling,74.443296,74.819236,74.920687
balanced,undersampling,79.88174,79.760737,79.680372
balanced,weights,80.179061,79.736952,71.833006


Rain dataset with outliers removed

In [59]:
# Accuracy and balanced accuracy per strategy and model, dataset with outliers removed.
no_outliers_table = transform(no_outliers)
display(no_outliers_table)

Unnamed: 0,Unnamed: 1,lgbm,xgb,rf
acc,simple,85.37305,85.336674,85.328591
acc,oversampling,85.219465,85.332633,85.130547
acc,undersampling,79.261984,79.431736,79.023523
acc,weights,80.094576,80.951419,85.033546
balanced,simple,73.546233,74.035205,72.269784
balanced,oversampling,73.684155,74.190245,74.205101
balanced,undersampling,79.30061,79.455485,78.806095
balanced,weights,79.565442,79.359795,71.377727


Original dataset

In [60]:
# Accuracy and balanced accuracy per strategy and model, original dataset.
original_table = transform(original)
display(original_table)

Unnamed: 0,Unnamed: 1,lgbm,xgb,rf
acc,simple,86.414392,86.192839,86.139667
acc,oversampling,86.175115,86.130805,85.625665
acc,undersampling,81.070542,80.804679,80.68061
acc,weights,82.054236,83.188586,86.130805
balanced,simple,75.817976,75.905992,74.116792
balanced,oversampling,76.757775,76.571098,78.433636
balanced,undersampling,81.610486,80.979546,81.10134
balanced,weights,81.666233,80.82602,73.521283
