In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn import metrics

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_classif

from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Load the pre-cleaned rain dataset (path is relative to the notebook).
df = pd.read_csv("data/rain_outliers_removed.csv")

# Encode the Yes/No columns RainToday and RainTomorrow as 1/0.
# The result is assigned back explicitly: calling `replace(..., inplace=True)`
# on an attribute-accessed column is a chained in-place update, which pandas
# deprecates because it no longer modifies `df` under Copy-on-Write.
yes_no_to_int = {"Yes": 1, "No": 0}
for binary_column in ("RainToday", "RainTomorrow"):
    df[binary_column] = df[binary_column].map(yes_no_to_int)

# columns to be changed to one-hot encoding
categorical_columns = ["Season", "WindGustDir", "WindDir9am", "WindDir3pm"]

# One-hot encode the categorical columns. The dtype is pinned to uint8 (the
# pre-2.0 pandas default) so that a later `df.to_numpy()` produces a numeric
# array; pandas >= 2.0 defaults to bool, which turns a mixed float/bool frame
# into an object array.
df = pd.get_dummies(df, columns=categorical_columns, dtype=np.uint8)

In [3]:
# Target vector: the RainTomorrow column.
y = df["RainTomorrow"].to_numpy()
# Feature matrix: every column except the target.
X = df.drop(columns="RainTomorrow").to_numpy()

# Hold out 20% of the rows as the test split; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from collections import Counter

# Oversample the training split only, so the test split stays untouched.
# SMOTE generates synthetic samples at random; the seed is fixed so the
# resampled training set (and every metric computed from it) is reproducible.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# Show the class counts after resampling.
count = Counter(y_train)
print(count)

Counter({0: 77034, 1: 77034})


## So far tested: 
### if you're testing some method, please add it here
- If not specified, default values of classifiers used
### Basic split (20%)
- SVC:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%;
- KNN:
    - StandardScaler: accuracy: 80%, balanced accuracy: 64%;
- MLP Classifier:
    - StandardScaler: accuracy: 83%, balanced accuracy: 72%;
- Decision Tree Classifier:
    - StandardScaler: accuracy: 78%, balanced accuracy: 69%;
- Random Forest Classifier:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%; // 100 estimators
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%; // 300 estimators
- AdaBoost Classifier:
    - StandardScaler: accuracy: 84%, balanced accuracy: 71%; // 100 estimators
    - StandardScaler: accuracy: 84%, balanced accuracy: 72%; // 300 estimators
- Gaussian NB:
    - StandardScaler: accuracy: 72%, balanced accuracy: 70%;
- QuadraticDiscriminantAnalysis:
    - StandardScaler: accuracy: 68%, balanced accuracy: 69%;
- XGBoost Classifier:
    - StandardScaler: accuracy: 85%, balanced accuracy: 74%;
- Logistic Regression:
    - StandardScaler: accuracy: 85%, balanced accuracy: 72.7%;
- LGBMClassifier:
    - StandardScaler: accuracy: 85.3%, balanced accuracy: 73.5%;
### Feature selection:
- K best (=20):
    - XGBoost Classifier:
        - StandardScaler: accuracy: 85%, balanced accuracy: 73%;
### Grid search
- RandomForest: 
    - 'criterion': 'entropy',
    - 'max_depth': None,
    - 'min_samples_leaf': 4,
    - 'n_estimators': 100,
    - 'feature_selection__k': 20
    - StandardScaler: accuracy: 85%, balanced accuracy: 72%; 
- XGBoost Classifier
    - 'classifier__colsample_bytree': 0.6,
    - 'classifier__gamma': 0,
    - 'classifier__max_depth': 8,
    - 'classifier__min_child_weight': 2,
    - 'classifier__subsample': 1.0,
    - 'feature_selection__k': 40
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 73.6%;
### Oversampling:
- Random Forest Classifier:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 74.3%; 
- LGBMClassifier:
    - StandardScaler: accuracy: 85.2%, balanced accuracy: 73.5%;


## Still to test:
- over and under sampling
- PCA <- rather useless
- fervent prayer


In [5]:
# Two-stage pipeline: standardise the features, then classify with LightGBM
# using its default settings. The commented-out stages are optional steps that
# can be switched back on for feature-selection / PCA experiments.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        # ("feature_selection", SelectKBest(f_classif, k=20)),
        # ("pca", PCA(0.95)),
        ("classifier", LGBMClassifier()),
    ],
    verbose=True,
)
    

# Hyper-parameter grid for GridSearchCV. Every key is
# `<pipeline step name>__<parameter>`, so each prefix must name a step that is
# actually present in `pipe`; otherwise the search fails with an
# invalid-parameter error as soon as it is fitted.
params = {
    # Enable only together with the 'feature_selection' step of `pipe`
    # (that step is currently commented out, so the key must stay disabled).
    # 'feature_selection__k': [10, 20, 40],
    # 'pca__n_components': [.8, .85, .9, .95],
    #### for Random Forest
    # 'classifier__n_estimators': [50, 100, 200, 300, 500],
    'classifier__max_depth': [2, 4, 8, None],  # also for XGBoost
    # 'classifier__min_samples_leaf': [4, 8, 16],
    # 'classifier__criterion': ['gini', 'entropy'],
    #### for MLP Classifier
    # 'classifier__hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
    # 'classifier__activation': ['tanh', 'relu'],
    # 'classifier__solver': ['sgd', 'adam'],
    # 'classifier__alpha': [0.0001, 0.05],
    # 'classifier__learning_rate': ['constant', 'adaptive'],
    #### for XGBoost
    # 'classifier__nthread': [6],  ## CHANGE NUMBER OF THREADS YOU WANT TO USE
    'classifier__min_child_weight': [1, 2],
    # NOTE(review): `gamma` is an XGBoost parameter name; for LGBMClassifier the
    # closest equivalent appears to be `min_split_gain` - confirm before
    # relying on this axis of the grid.
    'classifier__gamma': [0, 0.5, 1],
    'classifier__subsample': [0.6, 1.0],
    'classifier__colsample_bytree': [0.6, 1.0],
}

# Exhaustive search over every combination in `params`, ranking candidate
# pipelines by their cross-validated balanced accuracy.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="balanced_accuracy",
    # n_jobs=4,
    # verbose=4,
)

%%time
grid.fit(X_train, y_train)
grid.best_score_
grid.best_params_

# Take the best pipeline found by the grid search and score it on the
# held-out test split.
best = grid.best_estimator_
y_predicted = best.predict(X_test)
metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)

In [6]:
%%time
pipe.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   3.6s
CPU times: user 34.3 s, sys: 827 ms, total: 35.1 s
Wall time: 3.98 s


Pipeline(steps=[('scaler', StandardScaler()), ('classifier', LGBMClassifier())],
         verbose=True)

In [7]:
# Predict the held-out test split with the fitted pipeline and report the
# balanced accuracy.
y_predicted = pipe.predict(X_test)
metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)

0.735871888583818

In [8]:
# Confusion matrix of the test-set predictions (rows: true class, columns:
# predicted class). It is computed first and displayed as the cell's last
# expression.
cm = metrics.confusion_matrix(y_test, y_predicted)

# Per-class precision / recall / F1 table, followed by the overall accuracy
# as a percentage.
report = metrics.classification_report(y_test, y_predicted)
print(report)
print(
    "Accuracy of the model is:",
    metrics.accuracy_score(y_test, y_predicted) * 100,
    "%",
)
cm

              precision    recall  f1-score   support

           0       0.88      0.94      0.91     19284
           1       0.73      0.53      0.61      5458

    accuracy                           0.85     24742
   macro avg       0.80      0.74      0.76     24742
weighted avg       0.84      0.85      0.84     24742

Accuracy of the model is: 85.19117290437313 %


array([[18195,  1089],
       [ 2575,  2883]])