# Machine learning case study: rain in Australia dataset
### Anna Przybyłowska, Gurbet Gungoren, Wojciech Tomczak, Witold Taisner

## 1. Used libraries

In [5]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

############ MODELS ############################

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#################################################

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter



## 2. Importing preprocessed data

We decided to use one-hot encoding, as it performed slightly better than label-encoding. Here we are using our preprocessed dataset, without outliers.

In [7]:
df = pd.read_csv("/data/rain_outliers_removed.csv")

# encoding RainTomorrow and RainToday as binary values
df.RainToday.replace(("Yes", "No"), (1,0), inplace = True)
df.RainTomorrow.replace(("Yes", "No"), (1,0), inplace = True)

#################### ONE-HOT ENCODING #########################################

# columns to be changed to one-hot encoding
categorical_columns = ["Season", "WindGustDir", "WindDir9am", "WindDir3pm"]

# creating one-hot encoding
df = pd.get_dummies(df, columns = categorical_columns)

#################### LABEL ENCODER ############################################

# le = LabelEncoder()

# df["Season"] = le.fit_transform(df["Season"])
# df["WindDir9am"]= le.fit_transform(df["WindDir9am"])
# df["WindDir3pm"]= le.fit_transform(df["WindDir3pm"])
# df["WindGustDir"] = le.fit_transform(df["WindGustDir"])

###############################################################################

df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,WindGustDir_E,WindGustDir_ENE,WindGustDir_ESE,WindGustDir_N,WindGustDir_NE,WindGustDir_NNE,WindGustDir_NNW,WindGustDir_NW,WindGustDir_S,WindGustDir_SE,WindGustDir_SSE,WindGustDir_SSW,WindGustDir_SW,WindGustDir_W,WindGustDir_WNW,WindGustDir_WSW,WindDir9am_E,WindDir9am_ENE,WindDir9am_ESE,WindDir9am_N,WindDir9am_NE,WindDir9am_NNE,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW,WindDir3pm_E,WindDir3pm_ENE,WindDir3pm_ESE,WindDir3pm_N,WindDir3pm_NE,WindDir3pm_NNE,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,7.0,10.5,44,20,24,71,22,1007.7,1007.1,8,7,16.9,21.8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,7.4,25.1,0.0,7.6,13.3,44,4,22,44,25,1010.6,1007.8,0,5,17.2,24.3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,12.9,25.7,0.0,11.4,10.0,46,19,26,38,30,1007.6,1008.7,3,2,21.0,23.2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,9.2,28.0,0.0,6.8,12.2,24,11,9,45,16,1017.6,1012.8,7,1,18.1,26.5,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.5,32.3,1.0,8.0,5.0,41,7,20,82,33,1010.8,1006.0,7,8,17.8,29.7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [67]:
df.shape

(123710, 70)

We split data to the training set (80%) and test set (20%).

In [9]:
y = df.RainTomorrow.to_numpy()
X = df.drop(columns=['RainTomorrow']).to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## 3. Used metrics:
Apart from standard accuracy, we decided to also evaluate our models based on [balanced accuracy](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html), which is better suited for inbalanced data, as well as F1, Precision and Recall.

## 4. Tested approaches:
We decided to test 4 approaches, as our data is quite imbalanced. All the results were obtained for the preprocessed dataset without outliers.
***
### 4.1 Training on preprocessed only dataset

- SVC:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%;
- KNN:
    - StandardScaler: accuracy: 80%, balanced accuracy: 64%;
- MLP Classifier:
    - StandardScaler: accuracy: 83%, balanced accuracy: 72%;
- Decision Tree Classifier:
    - StandardScaler: accuracy: 78%, balanced accuracy: 69%;
- **Random Forest Classifier**:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%; // 100 estimators
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%; // 300 estimators
- AdaBoost Classifier:
    - StandardScaler: accuracy: 84%, balanced accuracy: 71%; // 100 estimators
    - StandardScaler: accuracy: 84%, balanced accuracy: 72%; // 300 estimators
- **XGBoost Classifier**:
    - StandardScaler: accuracy: 85%, balanced accuracy: 74%;
- **Logistic Regression**:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72.7%;
- **LGBMClassifier**:
    - StandardScaler: accuracy: 85.3%, balanced accuracy: 73.5%;
    
We managed to determine four classifiers, written in bold case, which manged to get the best results on this data.
***
### 4.2 Oversampling

Second of tested approaches focused only on previously found classifiers: *XGBoost*, *Random Forest*, *LGBM* and some of the more promising: *AdaBoost*. 
Oversampling creates copies of minority class, so that there is even number of each class instance.

- **Random Forest Classifier**:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 74.3%; 
- LGBMClassifier:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 73.5%;
- XGBoost:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 73.8%;
- AdaBoost:
    - StandardScaler: accuracy: 81.4%, balanced accuracy: 75%;
    
Easy to notice, oversampling did not improve our results in a significant way.

***
### 4.3 Undersampling
Similarly to oversampling, we focused on *XGBoost*, *Random Forest*, *LGBM* and *AdaBoost*. Undersampling is a method of removing similar instances of majority class, so that its cardinality is the same as minority class.

- **Random Forest Classifier**:
    - StandardScaler: accuracy: 79%, balanced accuracy 79%;
- LGBMClassifier:
    - StandardScaler: accuracy: 79.1%, balanced accuracy: 78.9%;
- **XGBoost**:
    - StandardScaler: accuracy: 79%, balanced accuracy: 79%;
- AdaBoost:
    - StandardScaler: accuracy: 78.2%, balanced accuracy: 77.4%;
    
Undersampling decreases overall accuracy, but at the same time increases balanced accuracy (better prediction of minority class)

***
### 4.4 Feature selection, grid search, class weights
In addition we tested some other approaches:
        
#### 4.4.1 Grid search 
##### (only for Random Forest and XGBoost)
We tried to determine the best parameters for Random Forest and XGBoost with GridSerchCV method connected with K-best feature selection. It managed to determine best parameters and correspodning results:

- RandomForest: 
    - 'criterion': 'entropy',
    - 'max_depth': None,
    - 'min_samples_leaf': 4,
    - 'n_estimators': 100,
    - 'feature_selection k': 20
    - StandardScaler: **accuracy: 85%, balanced accuracy: 72%**;
- XGBoost Classifier:
    - 'colsample_bytree': 0.6,
    - 'gamma': 0,
    - 'max_depth': 8,
    - 'min_child_weight': 2,
    - 'subsample': 1.0,
    - 'feature_selection k': 40}
    - StandardScaler: **accuracy: 85.2%, balanced accuracy: 73.6%**;
    
Grid searches managed to get similar results as training on dataset only.
        
#### 4.4.2 Class weights
In this approach we assigned weights to each class, so that the model would maximize its objective function with minority class having bigger weight.
- **XGBoost Classifier**:
    - StandardScaler: accuracy: 81%, balanced accuracy 79.4%;
- Random Forest Classifier:
    - StandardScaler: accuracy: 85.1%, balanced accuracy 71.5%;
- LGBMClassifier:
    - StandardScaler: accuracy: 80.1%, balanced accuracy: 79.6%;
    
Assigning weights to classes seems to produce the best trade-off between overall accuracy and balanced accuracy.

##5. Choosing the best approches.
  We decided to choose 3 classifiers which gave us the best results and check how they perform on the original dataset and the dataset with outliers.

  - LGBM Classifier
  - Random Forest
  - XGBoost Classifier


We will test the results with 4 differetn approaches:
  - just simple training on training set
  - oversampling
  - undersampling
  - class weights

Since grid search is quite expensive when it comes to time, we decided not to check how it performs on the other datasets.


## 5. Implementation

dataset with outliers (one-hot encoding)

In [10]:
df1 = pd.read_csv("/data/rain_with_outliers.csv")

# encoding RainTomorrow and RainToday as binary values
df1.RainToday.replace(("Yes", "No"), (1,0), inplace = True)
df1.RainTomorrow.replace(("Yes", "No"), (1,0), inplace = True)

#################### ONE-HOT ENCODING #########################################

# columns to be changed to one-hot encoding
categorical_columns = ["Season", "WindGustDir", "WindDir9am", "WindDir3pm"]

# creating one-hot encoding
df1 = pd.get_dummies(df1, columns = categorical_columns)

#################### LABEL ENCODER ############################################

# le = LabelEncoder()

# df["Season"] = le.fit_transform(df["Season"])
# df["WindDir9am"]= le.fit_transform(df["WindDir9am"])
# df["WindDir3pm"]= le.fit_transform(df["WindDir3pm"])
# df["WindGustDir"] = le.fit_transform(df["WindGustDir"])

###############################################################################

df1.shape

(123710, 70)

In [11]:
y1 = df1.RainTomorrow.to_numpy()
X1 = df1.drop(columns=['RainTomorrow']).to_numpy()

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.20, random_state=42)

original dataset  - we have to remove all na value to make all the classifiers work, but since we don't want to perform a lot of processing we will use label encoding instead of one-hot encoding

In [12]:
df2 = pd.read_csv("/data/weatherAUS_original_data.csv")
df2 = df2.drop("Date",axis = 1)
df2.dropna(inplace = True)

# encoding RainTomorrow and RainToday as binary values
df2.RainToday.replace(("Yes", "No"), (1,0), inplace = True)
df2.RainTomorrow.replace(("Yes", "No"), (1,0), inplace = True)

#################### LABEL ENCODER ############################################

le = LabelEncoder()

df2["Location"] = le.fit_transform(df2["Location"])
df2["WindDir9am"]= le.fit_transform(df2["WindDir9am"])
df2["WindDir3pm"]= le.fit_transform(df2["WindDir3pm"])
df2["WindGustDir"] = le.fit_transform(df2["WindGustDir"])

###############################################################################

df2.shape

(56420, 22)

In [13]:
y2 = df2.RainTomorrow.to_numpy()
X2 = df2.drop(columns=['RainTomorrow']).to_numpy()

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.20, random_state=42)

LGBM Classifier

In [14]:
pipe_LGBM = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('classifier', LGBMClassifier())

    ], 
    verbose=True
    ) 

pipe_LGBM_weight = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('classifier', LGBMClassifier(class_weight='balanced'))

    ], 
    verbose=True
    ) 

Random Forest

In [15]:
pipe_RF = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())

    ], 
    verbose=True
    ) 

pipe_RF_weight = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(class_weight='balanced'))

    ], 
    verbose=True
    ) 

XGBoost Classifier

In [16]:
pipe_XGB = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier())

    ], 
    verbose=True
    ) 
def pipe_XGB_weight(y_train):
  pipe_XGB_weight = Pipeline(
      [
          ('scaler', StandardScaler()),
          ('classifier', XGBClassifier(use_label_encoder=False, scale_pos_weight = sum(y_train == 0)/sum(y_train == 1)))

      ], 
      verbose=True
      ) 
  return pipe_XGB_weight

Training the models - functions:

In [17]:
def training(pipe, X_train, y_train, X_test):
  %%time
  pipe.fit(X_train, y_train)
  y_predicted = pipe.predict(X_test)
  return y_predicted

In [18]:
def report(y_predicted, y_test, data):
  report = metrics.classification_report(y_test, y_predicted)
  print(data)
  print(report)
  p1=metrics.accuracy_score(y_test,y_predicted)*100
  p2=metrics.balanced_accuracy_score(y_test, y_predicted)*100
  print("Accuracy of the model is:",p1,"%")
  print("Balanced accuracy of the model is:",p2,"%")
  cm = metrics.confusion_matrix(y_test, y_predicted)
  print(cm)
  return ((p1,p2))


In [19]:
results = {}

### 5.1 Simple approach

LGBM (simple)

In [20]:
results['LGBM(simple, no outliers)']=report(training(pipe_LGBM, X_train, y_train, X_test), y_test, "Rain outliers removed")

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   4.8s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     19284
           1       0.74      0.52      0.61      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.73      0.76     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.3164659283809 %
Balanced accuracy of the model is: 73.32602820398633 %
[[18278  1006]
 [ 2627  2831]]


In [21]:
results['LGBM(simple, outliers)']=report(training(pipe_LGBM, X1_train, y1_train, X1_test), y1_test, "Rain with outliers")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   4.7s
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     19284
           1       0.74      0.53      0.62      5458

    accuracy                           0.86     24742
   macro avg       0.81      0.74      0.77     24742
weighted avg       0.85      0.86      0.85     24742

Accuracy of the model is: 85.58726052865573 %
Balanced accuracy of the model is: 73.99891852010286 %
[[18269  1015]
 [ 2551  2907]]


In [22]:
results['LGBM(simple, original)']=report(training(pipe_LGBM, X2_train, y2_train, X2_test), y2_test, "original data")

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.25 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.9s
original data
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      8799
           1       0.76      0.57      0.65      2485

    accuracy                           0.87     11284
   macro avg       0.82      0.76      0.78     11284
weighted avg       0.86      0.87      0.86     11284

Accuracy of the model is: 86.50301311591633 %
Balanced accuracy of the model is: 75.92554531644922 %
[[8344  455]
 [1068 1417]]


XGBoost (simple)

In [23]:
results['XGB(simple, no outliers)']=report(training(pipe_XGB, X_train, y_train, X_test), y_test, "Rain outliers removed")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  15.7s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     19284
           1       0.74      0.49      0.59      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.72      0.75     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 84.94462856680947 %
Balanced accuracy of the model is: 72.12855439083424 %
[[18332   952]
 [ 2773  2685]]


In [24]:
results['XGB(simple, outliers)']=report(training(pipe_XGB, X1_train, y1_train, X1_test), y1_test, "Rain with outliers")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  15.5s
Rain with outliers
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     19284
           1       0.74      0.50      0.60      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.73      0.75     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.13054724759517 %
Balanced accuracy of the model is: 72.51054592065418 %
[[18338   946]
 [ 2733  2725]]


In [25]:
results['XGB(simple, original)']=report(training(pipe_XGB, X2_train, y2_train, X2_test), y2_test, "original data")

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 27.4 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   3.0s
original data
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      8799
           1       0.76      0.51      0.61      2485

    accuracy                           0.86     11284
   macro avg       0.81      0.73      0.76     11284
weighted avg       0.85      0.86      0.85     11284

Accuracy of the model is: 85.64338886919532 %
Balanced accuracy of the model is: 73.38186637726119 %
[[8385  414]
 [1206 1279]]


Random Forest (simple)

In [26]:
results['RF(simple, no outliers)']=report(training(pipe_RF, X_train, y_train, X_test), y_test, "Rain outliers removed")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  24.6s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     19284
           1       0.76      0.49      0.59      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.72      0.75     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.26392369250667 %
Balanced accuracy of the model is: 72.16261832831186 %
[[18437   847]
 [ 2799  2659]]


In [27]:
results['RF(simple, outliers)']=report(training(pipe_RF, X1_train, y1_train, X1_test), y1_test, "Rain with outliers")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  24.7s
Rain with outliers
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     19284
           1       0.76      0.49      0.60      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.73      0.75     24742
weighted avg       0.85      0.85      0.84     24742

Accuracy of the model is: 85.40134184787001 %
Balanced accuracy of the model is: 72.52006402306266 %
[[18430   854]
 [ 2758  2700]]


In [28]:
results['RF(simple, original)']=report(training(pipe_RF, X2_train, y2_train, X2_test), y2_test, "original data")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   9.8s
original data
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      8799
           1       0.78      0.53      0.63      2485

    accuracy                           0.86     11284
   macro avg       0.83      0.74      0.77     11284
weighted avg       0.86      0.86      0.85     11284

Accuracy of the model is: 86.29032258064517 %
Balanced accuracy of the model is: 74.37421666034392 %
[[8418  381]
 [1166 1319]]


### 5.2 Class weight approach

LGBM (class weight)

In [29]:
results['LGBM(weights, no outliers)']=report(training(pipe_LGBM_weight, X_train, y_train, X_test), y_test, "Rain outliers removed")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   6.2s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     19284
           1       0.53      0.79      0.63      5458

    accuracy                           0.80     24742
   macro avg       0.73      0.80      0.75     24742
weighted avg       0.84      0.80      0.81     24742

Accuracy of the model is: 79.98544984237328 %
Balanced accuracy of the model is: 79.52170765816373 %
[[15495  3789]
 [ 1163  4295]]


In [30]:
results['LGBM(weights, outliers)']=report(training(pipe_LGBM_weight, X1_train, y1_train, X1_test), y1_test, "Rain with outliers")

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.87 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   6.3s
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.81      0.87     19284
           1       0.54      0.79      0.64      5458

    accuracy                           0.81     24742
   macro avg       0.74      0.80      0.75     24742
weighted avg       0.85      0.81      0.82     24742

Accuracy of the model is: 80.59574812060464 %
Balanced accuracy of the model is: 80.03801673377033 %
[[15627  3657]
 [ 1144  4314]]


In [31]:
results['LGBM(weights, original)']=report(training(pipe_LGBM_weight, X2_train, y2_train, X2_test), y2_test, "original data")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.2s
original data
              precision    recall  f1-score   support

           0       0.94      0.82      0.88      8799
           1       0.56      0.80      0.66      2485

    accuracy                           0.82     11284
   macro avg       0.75      0.81      0.77     11284
weighted avg       0.85      0.82      0.83     11284

Accuracy of the model is: 81.86813186813187 %
Balanced accuracy of the model is: 81.32780773743495 %
[[7241 1558]
 [ 488 1997]]


XGBoost (class weight)

In [32]:
results['XGB(weights, no outliers)']=report(training(pipe_XGB_weight(y_train), X_train, y_train, X_test), y_test, "Rain outliers removed")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  15.5s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.79      0.85     19284
           1       0.51      0.78      0.62      5458

    accuracy                           0.79     24742
   macro avg       0.72      0.78      0.74     24742
weighted avg       0.84      0.79      0.80     24742

Accuracy of the model is: 78.81739552178482 %
Balanced accuracy of the model is: 78.46368383132638 %
[[15253  4031]
 [ 1210  4248]]


In [33]:
results['XGB(weights, outliers)']=report(training(pipe_XGB_weight(y1_train), X1_train, y1_train, X1_test), y1_test, "Rain with outliers")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  15.5s
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     19284
           1       0.52      0.78      0.62      5458

    accuracy                           0.79     24742
   macro avg       0.72      0.79      0.74     24742
weighted avg       0.84      0.79      0.81     24742

Accuracy of the model is: 79.27410880284536 %
Balanced accuracy of the model is: 78.78951304635599 %
[[15361  3923]
 [ 1205  4253]]


In [34]:
results['XGB(weights, original)']=report(training(pipe_XGB_weight(y2_train), X2_train, y2_train, X2_test), y2_test, "original data")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   3.0s
original data
              precision    recall  f1-score   support

           0       0.94      0.80      0.86      8799
           1       0.53      0.81      0.64      2485

    accuracy                           0.80     11284
   macro avg       0.74      0.81      0.75     11284
weighted avg       0.85      0.80      0.82     11284

Accuracy of the model is: 80.27295285359801 %
Balanced accuracy of the model is: 80.56485292022622 %
[[7043 1756]
 [ 470 2015]]


Random Forest (class weight)

In [35]:
results['RF(weights, no outliers)']=report(training(pipe_RF_weight, X_train, y_train, X_test), y_test, "Rain outliers removed")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.82 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  24.0s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.86      0.96      0.91     19284
           1       0.76      0.46      0.58      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.71      0.74     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.00121251313556 %
Balanced accuracy of the model is: 71.17964765577251 %
[[18496   788]
 [ 2923  2535]]


In [36]:
results['RF(weights, outliers)']=report(training(pipe_RF_weight, X1_train, y1_train, X1_test), y1_test, "Rain with outliers")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  24.3s
Rain with outliers
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     19284
           1       0.76      0.48      0.59      5458

    accuracy                           0.85     24742
   macro avg       0.81      0.72      0.75     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.21542316708431 %
Balanced accuracy of the model is: 71.96730340852578 %
[[18450   834]
 [ 2824  2634]]


In [37]:
results['RF(weights, original)']=report(training(pipe_RF_weight, X2_train, y2_train, X2_test), y2_test, "original data")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   9.6s
original data
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      8799
           1       0.78      0.50      0.61      2485

    accuracy                           0.86     11284
   macro avg       0.83      0.73      0.76     11284
weighted avg       0.85      0.86      0.85     11284

Accuracy of the model is: 85.9269762495569 %
Balanced accuracy of the model is: 72.98617480539562 %
[[8457  342]
 [1246 1239]]


### 5.3 Oversampling

In [38]:
oversample = SMOTE() 
X_traino, y_traino = oversample.fit_resample(X_train, y_train)
X1_traino, y1_traino = oversample.fit_resample(X1_train, y1_train) 
X2_traino, y2_traino = oversample.fit_resample(X2_train, y2_train)  



LGBM (oversampling)

In [39]:
results['LGBM(oversampling, no outliers)']=report(training(pipe_LGBM, X_traino, y_traino, X_test), y_test, "Rain outliers removed")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  13.0s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.72      0.53      0.61      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.74      0.76     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.1386306684989 %
Balanced accuracy of the model is: 73.57975432540653 %
[[18178  1106]
 [ 2571  2887]]


In [40]:
results['LGBM(oversampling, outliers)']=report(training(pipe_LGBM, X1_traino, y1_traino, X1_test), y1_test, "Rain with outliers")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.11 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  12.8s
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.73      0.54      0.62      5458

    accuracy                           0.86     24742
   macro avg       0.81      0.74      0.77     24742
weighted avg       0.85      0.86      0.85     24742

Accuracy of the model is: 85.57513539730014 %
Balanced accuracy of the model is: 74.39835863753828 %
[[18204  1080]
 [ 2489  2969]]


In [41]:
results['LGBM(oversampling, original)']=report(training(pipe_LGBM, X2_traino, y2_traino, X2_test), y2_test, "original data")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   2.1s
original data
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      8799
           1       0.73      0.60      0.66      2485

    accuracy                           0.86     11284
   macro avg       0.81      0.77      0.78     11284
weighted avg       0.86      0.86      0.86     11284

Accuracy of the model is: 86.21942573555476 %
Balanced accuracy of the model is: 76.65331687819838 %
[[8249  550]
 [1005 1480]]


XGBoost (oversampling)

In [42]:
results['XGB(oversampling, no outliers)']=report(training(pipe_XGB, X_traino, y_traino, X_test), y_test, "Rain outliers removed")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  31.3s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.89      0.92      0.90     19284
           1       0.66      0.58      0.62      5458

    accuracy                           0.84     24742
   macro avg       0.77      0.75      0.76     24742
weighted avg       0.84      0.84      0.84     24742

Accuracy of the model is: 84.17266187050359 %
Balanced accuracy of the model is: 74.83852954457753 %
[[17653  1631]
 [ 2285  3173]]


In [43]:
results['XGB(oversampling, outliers)']=report(training(pipe_XGB, X1_traino, y1_traino, X1_test), y1_test, "Rain with outliers")

CPU times: user 0 ns, sys: 4 µs, total: 4 µs
Wall time: 7.63 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  31.0s
Rain with outliers
              precision    recall  f1-score   support

           0       0.89      0.91      0.90     19284
           1       0.65      0.60      0.63      5458

    accuracy                           0.84     24742
   macro avg       0.77      0.75      0.76     24742
weighted avg       0.84      0.84      0.84     24742

Accuracy of the model is: 84.14841160779241 %
Balanced accuracy of the model is: 75.4732087364513 %
[[17548  1736]
 [ 2186  3272]]


In [44]:
results['XGB(oversampling, original)']=report(training(pipe_XGB, X2_traino, y2_traino, X2_test), y2_test, "original data")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   6.5s
original data
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      8799
           1       0.62      0.68      0.65      2485

    accuracy                           0.84     11284
   macro avg       0.76      0.78      0.77     11284
weighted avg       0.84      0.84      0.84     11284

Accuracy of the model is: 83.79120879120879 %
Balanced accuracy of the model is: 78.18610949707794 %
[[7761 1038]
 [ 791 1694]]


Random Forest (oversampling)

In [45]:
results['RF(oversampling, no outliers)']=report(training(pipe_RF, X_traino, y_traino, X_test), y_test, "Rain outliers removed")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.72 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  51.9s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.71      0.55      0.62      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.74      0.76     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.106296984884 %
Balanced accuracy of the model is: 74.18954374598916 %
[[18074  1210]
 [ 2475  2983]]


In [46]:
results['RF(oversampling, outliers)']=report(training(pipe_RF, X1_traino, y1_traino, X1_test), y1_test, "Rain with outliers")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  51.3s
Rain with outliers
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.71      0.56      0.63      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.75      0.77     24742
weighted avg       0.85      0.85      0.85     24742

Accuracy of the model is: 85.33667448064021 %
Balanced accuracy of the model is: 74.79052953940896 %
[[18062  1222]
 [ 2406  3052]]


In [47]:
results['RF(oversampling, original)']=report(training(pipe_RF, X2_traino, y2_traino, X2_test), y2_test, "original data")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  21.6s
original data
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      8799
           1       0.68      0.64      0.66      2485

    accuracy                           0.86     11284
   macro avg       0.79      0.78      0.78     11284
weighted avg       0.85      0.86      0.85     11284

Accuracy of the model is: 85.51931939028712 %
Balanced accuracy of the model is: 77.92255521994336 %
[[8051  748]
 [ 886 1599]]


### 5.4 Undersampling

In [48]:
rus = RandomUnderSampler(random_state=0) 
X_trainu, y_trainu = rus.fit_resample(X_train, y_train) 
X1_trainu, y1_trainu = rus.fit_resample(X1_train, y1_train) 
X2_trainu, y2_trainu = rus.fit_resample(X2_train, y2_train) 



LGBM (undersampling)

In [49]:
results['LGBM(undersampling, no outliers)']=report(training(pipe_LGBM, X_trainu, y_trainu, X_test), y_test, "Rain outliers removed")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   2.8s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.79      0.86     19284
           1       0.52      0.79      0.63      5458

    accuracy                           0.79     24742
   macro avg       0.73      0.79      0.74     24742
weighted avg       0.84      0.79      0.81     24742

Accuracy of the model is: 79.28219222374909 %
Balanced accuracy of the model is: 79.35955028039734 %
[[15277  4007]
 [ 1119  4339]]


In [50]:
results['LGBM(undersampling, outliers)']=report(training(pipe_LGBM, X1_trainu, y1_trainu, X1_test), y1_test, "Rain with outliers")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   2.8s
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     19284
           1       0.53      0.80      0.64      5458

    accuracy                           0.80     24742
   macro avg       0.73      0.80      0.75     24742
weighted avg       0.84      0.80      0.81     24742

Accuracy of the model is: 79.94503273785466 %
Balanced accuracy of the model is: 79.92927018101838 %
[[15419  3865]
 [ 1097  4361]]


In [51]:
results['LGBM(undersampling, original)']=report(training(pipe_LGBM, X2_trainu, y2_trainu, X2_test), y2_test, "original data")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.6s
original data
              precision    recall  f1-score   support

           0       0.94      0.81      0.87      8799
           1       0.55      0.82      0.66      2485

    accuracy                           0.82     11284
   macro avg       0.75      0.82      0.77     11284
weighted avg       0.86      0.82      0.83     11284

Accuracy of the model is: 81.5225097483162 %
Balanced accuracy of the model is: 81.58265423887798 %
[[7169 1630]
 [ 455 2030]]


XGBoost (undersampling)

In [52]:
results['XGB(undersampling, no outliers)']=report(training(pipe_XGB, X_trainu, y_trainu, X_test), y_test, "Rain outliers removed")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   6.4s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.79      0.85     19284
           1       0.51      0.78      0.62      5458

    accuracy                           0.79     24742
   macro avg       0.72      0.78      0.73     24742
weighted avg       0.83      0.79      0.80     24742

Accuracy of the model is: 78.53043407970253 %
Balanced accuracy of the model is: 78.31900164397713 %
[[15176  4108]
 [ 1204  4254]]


In [53]:
results['XGB(undersampling, outliers)']=report(training(pipe_XGB, X1_trainu, y1_trainu, X1_test), y1_test, "Rain with outliers")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   6.4s
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.79      0.85     19284
           1       0.52      0.78      0.62      5458

    accuracy                           0.79     24742
   macro avg       0.72      0.79      0.74     24742
weighted avg       0.84      0.79      0.80     24742

Accuracy of the model is: 78.97098051895563 %
Balanced accuracy of the model is: 78.73298019254196 %
[[15265  4019]
 [ 1184  4274]]


In [54]:
results['XGB(undersampling, original)']=report(training(pipe_XGB, X2_trainu, y2_trainu, X2_test), y2_test, "original data")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.4s
original data
              precision    recall  f1-score   support

           0       0.94      0.80      0.86      8799
           1       0.53      0.81      0.64      2485

    accuracy                           0.80     11284
   macro avg       0.73      0.80      0.75     11284
weighted avg       0.85      0.80      0.81     11284

Accuracy of the model is: 80.18433179723502 %
Balanced accuracy of the model is: 80.36364567676544 %
[[7043 1756]
 [ 480 2005]]


Random Forest (undersampling)

In [55]:
results['RF(undersampling, no outliers)']=report(training(pipe_RF, X_trainu, y_trainu, X_test), y_test, "Rain outliers removed")

CPU times: user 0 ns, sys: 4 µs, total: 4 µs
Wall time: 7.63 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   9.2s
Rain outliers removed
              precision    recall  f1-score   support

           0       0.93      0.79      0.86     19284
           1       0.52      0.79      0.63      5458

    accuracy                           0.79     24742
   macro avg       0.73      0.79      0.74     24742
weighted avg       0.84      0.79      0.81     24742

Accuracy of the model is: 79.30644248646027 %
Balanced accuracy of the model is: 79.21747421751468 %
[[15307  3977]
 [ 1143  4315]]


In [56]:
results['RF(undersampling, outliers)']=report(training(pipe_RF, X1_trainu, y1_trainu, X1_test), y1_test, "Rain with outliers")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   9.3s
Rain with outliers
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     19284
           1       0.53      0.79      0.63      5458

    accuracy                           0.80     24742
   macro avg       0.73      0.79      0.75     24742
weighted avg       0.84      0.80      0.81     24742

Accuracy of the model is: 79.62573761215747 %
Balanced accuracy of the model is: 79.41573919798938 %
[[15387  3897]
 [ 1144  4314]]


In [57]:
results['RF(undersampling, original)']=report(training(pipe_RF, X2_trainu, y2_trainu, X2_test), y2_test, "original data")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.72 µs
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   4.1s
original data
              precision    recall  f1-score   support

           0       0.94      0.81      0.87      8799
           1       0.54      0.81      0.65      2485

    accuracy                           0.81     11284
   macro avg       0.74      0.81      0.76     11284
weighted avg       0.85      0.81      0.82     11284

Accuracy of the model is: 80.83126550868487 %
Balanced accuracy of the model is: 80.80734206351875 %
[[7114 1685]
 [ 478 2007]]


## 6. Results - comparison.

### 6.1 Data

In [59]:
outliers = {'acc':{'simple':{}, 'oversampling':{}, 'undersampling':{}, 'weights':{}}, 
            'balanced':{'simple':{}, 'oversampling':{}, 'undersampling':{}, 'weights':{}}}
no_outliers = {'acc':{'simple':{}, 'oversampling':{}, 'undersampling':{}, 'weights':{}}, 
            'balanced':{'simple':{}, 'oversampling':{}, 'undersampling':{}, 'weights':{}}}
original = {'acc':{'simple':{}, 'oversampling':{}, 'undersampling':{}, 'weights':{}}, 
            'balanced':{'simple':{}, 'oversampling':{}, 'undersampling':{}, 'weights':{}}}
for x in results:
  t1 = results[x][0]
  t2=results[x][1]
  if x.find('LGBM')!=-1:
    n='lgbm'
  elif x.find('XGB')!=-1:
    n='xgb'
  else:
    n='rf'
  if x.find('simple')!=-1:
    k='simple'
  elif x.find('oversampling')!=-1:
    k='oversampling'
  elif x.find('undersampling')!=-1:
    k='undersampling'
  else:
    k='weights'
  if x.find('no outliers')!=-1:
    no_outliers['acc'][k][n]=t1
    no_outliers['balanced'][k][n]=t2
  elif x.find('outliers')!=-1:
    outliers['acc'][k][n]=t1
    outliers['balanced'][k][n]=t2
  else:
    original['acc'][k][n]=t1
    original['balanced'][k][n]=t2

print(outliers)

{'acc': {'simple': {'lgbm': 85.58726052865573, 'xgb': 85.13054724759517, 'rf': 85.40134184787001}, 'oversampling': {'lgbm': 85.57513539730014, 'xgb': 84.14841160779241, 'rf': 85.33667448064021}, 'undersampling': {'lgbm': 79.94503273785466, 'xgb': 78.97098051895563, 'rf': 79.62573761215747}, 'weights': {'lgbm': 80.59574812060464, 'xgb': 79.27410880284536, 'rf': 85.21542316708431}}, 'balanced': {'simple': {'lgbm': 73.99891852010286, 'xgb': 72.51054592065418, 'rf': 72.52006402306266}, 'oversampling': {'lgbm': 74.39835863753828, 'xgb': 75.4732087364513, 'rf': 74.79052953940896}, 'undersampling': {'lgbm': 79.92927018101838, 'xgb': 78.73298019254196, 'rf': 79.41573919798938}, 'weights': {'lgbm': 80.03801673377033, 'xgb': 78.78951304635599, 'rf': 71.96730340852578}}}


In [60]:
def transform(dict):
  return pd.DataFrame.from_dict({(i,j): dict[i][j] 
                           for i in dict.keys() 
                           for j in dict[i].keys()},
                       orient='index')

### 6.2 Accuracy and balanced accuracy

In [68]:
print('\nrain with outliers')
display(transform(outliers))
print('\nrain outliers removed')
display(transform(no_outliers))
print('\noriginal dataset')
display(transform(original))


rain with outliers


Unnamed: 0,Unnamed: 1,lgbm,xgb,rf
acc,simple,85.587261,85.130547,85.401342
acc,oversampling,85.575135,84.148412,85.336674
acc,undersampling,79.945033,78.970981,79.625738
acc,weights,80.595748,79.274109,85.215423
balanced,simple,73.998919,72.510546,72.520064
balanced,oversampling,74.398359,75.473209,74.79053
balanced,undersampling,79.92927,78.73298,79.415739
balanced,weights,80.038017,78.789513,71.967303



rain outliers removed


Unnamed: 0,Unnamed: 1,lgbm,xgb,rf
acc,simple,85.316466,84.944629,85.263924
acc,oversampling,85.138631,84.172662,85.106297
acc,undersampling,79.282192,78.530434,79.306442
acc,weights,79.98545,78.817396,85.001213
balanced,simple,73.326028,72.128554,72.162618
balanced,oversampling,73.579754,74.83853,74.189544
balanced,undersampling,79.35955,78.319002,79.217474
balanced,weights,79.521708,78.463684,71.179648



original dataset


Unnamed: 0,Unnamed: 1,lgbm,xgb,rf
acc,simple,86.503013,85.643389,86.290323
acc,oversampling,86.219426,83.791209,85.519319
acc,undersampling,81.52251,80.184332,80.831266
acc,weights,81.868132,80.272953,85.926976
balanced,simple,75.925545,73.381866,74.374217
balanced,oversampling,76.653317,78.186109,77.922555
balanced,undersampling,81.582654,80.363646,80.807342
balanced,weights,81.327808,80.564853,72.986175
