# Analyzing classification models

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')



In [46]:
class BaselineModel():
    """Fit and score a family of baseline scikit-learn classifiers on one DataFrame.

    The frame is split once, at construction time, into train and test
    partitions. Every model subsequently run through this instance is fitted
    and evaluated on that same split.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column named by ``target_col``.
    classifier : str, default 'Knn'
        Name of the model used by :meth:`run`; must be one of ``model_types``.
    oversample : bool, default False
        When true, SMOTE is applied to the training partition only.
    target_col : str, default 'Rain_Tomorrow_Num'
        Column of ``df`` holding the labels.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        behaviour (a different split on every construction); pass an integer
        to make the split reproducible.
    """

    def __init__(self, df, classifier = 'Knn', oversample = False,
                 target_col = 'Rain_Tomorrow_Num', test_size = 0.3, random_state = None) -> None:
        self.classifier = classifier
        self.df = df
        self.oversample = oversample
        self.target_col = target_col
        self.test_size = test_size
        self.random_state = random_state
        self.model_types = ['Knn', 'SVM', 'LogisticRegression', 'DecisionTreeClassifier', 'GaussianNB', 'RandomForestClassifier']
        if self.oversample:
            self.X_train, self.X_test, self.y_train, self.y_test = self.oversampled_split_df()
        else:
            self.X_train, self.X_test, self.y_train, self.y_test = self.split_df()

    # NOTE: the model_* builders accept X_test / y_test only so that they share
    # one call signature; none of them reads the test data.

    def model_knn(self, X_train, y_train, X_test, y_test):
        """Return a fitted 5-nearest-neighbours classifier."""
        print('Model Knn')
        knn = KNeighborsClassifier(n_neighbors=5)
        model = knn.fit(X_train, y_train)
        return model

    def model_svm(self, X_train, y_train, X_test, y_test):
        """Return a fitted support-vector classifier with default settings."""
        print('Model SVM')
        svc = SVC()
        model = svc.fit(X_train, y_train)
        return model

    def model_logistic_regression(self, X_train, y_train, X_test, y_test):
        """Return a fitted logistic-regression classifier with default settings."""
        print('Model LogisticRegression')
        lr = LogisticRegression()
        model = lr.fit(X_train, y_train)
        return model

    def model_decision_tree(self, X_train, y_train, X_test, y_test):
        """Return a fitted decision-tree classifier with default settings."""
        print('Model DecisionTreeClassifier')
        dt = DecisionTreeClassifier()
        model = dt.fit(X_train, y_train)
        return model

    def model_gaussian_nb(self, X_train, y_train, X_test, y_test):
        """Return a fitted Gaussian naive-Bayes classifier."""
        print('Model GaussianNB')
        gnb = GaussianNB()
        model = gnb.fit(X_train, y_train)
        return model

    def model_random_forest(self, X_train, y_train, X_test, y_test):
        """Return a fitted, class-weight-balanced random forest (10 trees, depth 2)."""
        print('Model RandomForestClassifier')
        rf = RandomForestClassifier(n_estimators=10, random_state=0, max_depth=2, class_weight='balanced')
        model = rf.fit(X_train, y_train)
        return model

    def select_model(self, X_train, y_train, X_test, y_test):
        """Fit and return the model named by ``self.classifier``.

        Raises
        ------
        ValueError
            If ``self.classifier`` is not a recognised model name. (Previously
            this path printed a message and then failed with an
            ``UnboundLocalError`` on the return statement.)
        """
        builders = {
            'Knn': self.model_knn,
            'SVM': self.model_svm,
            'LogisticRegression': self.model_logistic_regression,
            'DecisionTreeClassifier': self.model_decision_tree,
            'GaussianNB': self.model_gaussian_nb,
            'RandomForestClassifier': self.model_random_forest,
        }
        try:
            build = builders[self.classifier]
        except (KeyError, TypeError):
            # TypeError covers unhashable values such as a list passed by mistake.
            raise ValueError(
                f'Model not found: {self.classifier!r}. Expected one of {list(builders)}'
            ) from None
        return build(X_train, y_train, X_test, y_test)

    def predict(self, model, X_test):
        """Return ``model``'s predictions for ``X_test``."""
        y_pred = model.predict(X_test)
        return y_pred

    def evaluate(self, y_test, y_pred):
        """Print the confusion matrix, classification report and accuracy.

        Returns
        -------
        float
            The accuracy score of ``y_pred`` against ``y_test``.
        """
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        print(f'Confusion Matrix: \n {conf_matrix} \n')
        print(f'Classification Report: \n {class_report} \n')
        print(f'Accuracy: {accuracy}')

        return accuracy

    def split_df(self):
        """Split ``self.df`` into ``(X_train, X_test, y_train, y_test)``."""
        y = self.df[self.target_col]
        X = self.df.drop([self.target_col], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )
        return X_train, X_test, y_train, y_test

    def oversampled_split_df(self):
        """Split like :meth:`split_df`, then SMOTE-resample the training part.

        Only the training partition is resampled; the test partition is
        returned exactly as produced by the split.
        """
        X_train, X_test, y_train, y_test = self.split_df()
        sm = SMOTE(sampling_strategy = 'minority', random_state = 34)
        X_train, y_train = sm.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test

    def run(self):
        """Fit the selected classifier, predict on the test split and print metrics."""
        model = self.select_model(self.X_train, self.y_train, self.X_test, self.y_test)
        y_pred = self.predict(model, self.X_test)
        self.evaluate(self.y_test, y_pred)

    def run_all_models(self):
        """Run every entry of ``self.model_types`` on the stored split.

        ``self.classifier`` is switched to each model name in turn and put
        back to its initial value afterwards, even if one of the runs raises.
        """
        chosen = self.classifier
        try:
            for model in self.model_types:
                self.classifier = model
                self.run()
                print('---------------------------')
        finally:
            self.classifier = chosen



In [47]:
class BalanceClasses:
    """Hold a DataFrame together with a class count.

    Only stores its two arguments; no balancing logic is implemented here.
    """

    def __init__(self, df, n_classes = 2) -> None:
        # n_classes is presumably the number of target classes -- TODO confirm once used.
        self.df, self.n_classes = df, n_classes


In [48]:
# Load the weather data (cleaned and preprocessed, per the file name).
# BaselineModel reads its labels from the 'Rain_Tomorrow_Num' column of this frame.
df = pd.read_csv('../data/weather_cleaned_preprocessed.csv')

In [49]:
# Baseline: fit and evaluate every classifier on the original (non-oversampled) split.
# run_all_models() cycles through every entry in model_types, so the
# classifier argument does not restrict which models are run.
predictor = BaselineModel(df, classifier='Knn')
predictor.run_all_models()

Model Knn
Confusion Matrix: 
 [[28954   587]
 [ 3649   578]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.98      0.93     29541
           1       0.50      0.14      0.21      4227

    accuracy                           0.87     33768
   macro avg       0.69      0.56      0.57     33768
weighted avg       0.84      0.87      0.84     33768
 

Accuracy: 0.8745557924662403
---------------------------
Model SVM
Confusion Matrix: 
 [[29541     0]
 [ 4227     0]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      1.00      0.93     29541
           1       0.00      0.00      0.00      4227

    accuracy                           0.87     33768
   macro avg       0.44      0.50      0.47     33768
weighted avg       0.77      0.87      0.82     33768
 

Accuracy: 0.874822316986496
---------------------------
Model LogisticRegression
Confusion Matrix: 
 [[29215 

In [50]:
# Same model sweep, but with oversample=True: SMOTE resamples the training
# partition only, while the test partition is left as produced by the split.
oversapled_predictor = BaselineModel(df, classifier='Knn', oversample=True)
oversapled_predictor.run_all_models()

Model Knn
Confusion Matrix: 
 [[22824  6729]
 [ 1779  2436]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.77      0.84     29553
           1       0.27      0.58      0.36      4215

    accuracy                           0.75     33768
   macro avg       0.60      0.68      0.60     33768
weighted avg       0.85      0.75      0.78     33768
 

Accuracy: 0.748045486851457
---------------------------
Model SVM
Confusion Matrix: 
 [[ 5343 24210]
 [  447  3768]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.18      0.30     29553
           1       0.13      0.89      0.23      4215

    accuracy                           0.27     33768
   macro avg       0.53      0.54      0.27     33768
weighted avg       0.82      0.27      0.29     33768
 

Accuracy: 0.26981165600568585
---------------------------
Model LogisticRegression
Confusion Matrix: 
 [[21160