In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from sklearn.utils.class_weight import compute_class_weight 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from sklearn.utils.class_weight import compute_class_weight
df=pd.read_csv("weatherAUS.csv")

In [2]:
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [3]:
df=df.dropna()

In [4]:
df=df.drop(columns=['Date'])

In [5]:
label_encoder = LabelEncoder()
df['Location'] = label_encoder.fit_transform(df['Location'])
df['WindGustDir'] = label_encoder.fit_transform(df['WindGustDir'])
df['WindDir9am'] = label_encoder.fit_transform(df['WindDir9am'])
df['WindDir3pm'] = label_encoder.fit_transform(df['WindDir3pm'])
df['RainToday'] = label_encoder.fit_transform(df['RainToday'])
df['RainTomorrow'] = label_encoder.fit_transform(df['RainTomorrow'])
df['Evaporation'] = label_encoder.fit_transform(df['Evaporation'])

In [6]:
df=df.drop(columns=['Sunshine', 'Cloud9am', 'Cloud3pm'])

In [10]:
from collections import Counter

In [8]:
X = df.drop('RainTomorrow', axis=1)
y = df['RainTomorrow']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [13]:
# Random Under Sampling
print("Original class distribution:", Counter(y_train))
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train) 
print("Original class distribution:", Counter(y_rus))

Original class distribution: Counter({0: 35194, 1: 9942})
Original class distribution: Counter({0: 9942, 1: 9942})


In [12]:
# Random Over Sampling
from imblearn.over_sampling import RandomOverSampler
print("Original class distribution:", Counter(y_train))
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train) 
print("Original class distribution:", Counter(y_ros))

Original class distribution: Counter({0: 35194, 1: 9942})
Original class distribution: Counter({0: 35194, 1: 35194})


In [14]:
# SMOTE(Synthetic Minority Oversampling Technique)
print("Original class distribution:", Counter(y_train))
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_smote))

Original class distribution: Counter({0: 35194, 1: 9942})
Original class distribution: Counter({0: 35194, 1: 35194})


In [15]:
# Tomek Links
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X_train, y_train) 

In [16]:
# Class Weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights)) 
class_weights

array([0.64124567, 2.2699658 ])

In [17]:
# Function to evaluate model performance
def evaluate_model(X_train, y_train, X_test, y_test, class_weights=None):
    model = RandomForestClassifier(class_weight=class_weights, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    print(classification_report(y_test, y_pred))
    print("AUC:", roc_auc_score(y_test, y_prob)) 

In [18]:
# Evaluate each sampling technique
print("Random Undersampling:")
evaluate_model(X_rus, y_rus, X_test, y_test)

print("\nRandom Oversampling:")
evaluate_model(X_ros, y_ros, X_test, y_test)

print("\nSMOTE:")
evaluate_model(X_smote, y_smote, X_test, y_test)

#print("\nTomek Links:")
#evaluate_model(X_tl, y_tl, X_test, y_test)

print("\nClass Weights:")
evaluate_model(X_train, y_train, X_test, y_test, class_weights_dict) 

Random Undersampling:
              precision    recall  f1-score   support

           0       0.93      0.80      0.86      8799
           1       0.53      0.79      0.63      2485

    accuracy                           0.80     11284
   macro avg       0.73      0.79      0.75     11284
weighted avg       0.84      0.80      0.81     11284

AUC: 0.8785745956589635

Random Oversampling:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      8799
           1       0.70      0.58      0.64      2485

    accuracy                           0.85     11284
   macro avg       0.80      0.76      0.77     11284
weighted avg       0.85      0.85      0.85     11284

AUC: 0.8854549732764128

SMOTE:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      8799
           1       0.64      0.64      0.64      2485

    accuracy                           0.84     11284
   macro avg       0.77     