In [15]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight


In [17]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from sklearn.utils.class_weight import compute_class_weight 

In [23]:
# Location of the weatherAUS CSV. Set the WEATHER_AUS_CSV environment variable to
# point at your own copy; the original hard-coded local path is kept as the
# fallback so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "WEATHER_AUS_CSV",
    "C:/Users/VANISHA CHOUDHARY/Downloads/weatherAUS.csv",
)
data = pd.read_csv(DATA_PATH)

In [25]:
# Print the frame's summary: row count, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [27]:
# Count the missing values in each column
data.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [29]:
# Remove the 'Date' column. errors='ignore' keeps this cell safe to re-run once
# 'Date' has already been dropped from `data` (a plain drop would raise KeyError).
data = data.drop(columns=['Date'], errors='ignore')
# Keep only fully observed rows: a row with a missing value in any remaining
# column is discarded.
data = data.dropna()


In [31]:
# Integer-encode the categorical columns (the RainTomorrow target included).
# A single encoder is refit for each column in turn, so once the loop ends it
# only holds the fit for the last column processed.
categorical_columns = [
    'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow',
]
label_encoder = LabelEncoder()
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

# Separate the label from the predictors
y = data['RainTomorrow']
X = data.drop(columns=['RainTomorrow'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Initial Class Distribution:", Counter(y_train))

Initial Class Distribution: Counter({0: 35194, 1: 9942})


In [47]:
# Define function for evaluating model performance
def evaluate_model(X_train, y_train, X_test, y_test, class_weights=None, random_state=40):
    """Fit a random forest on the training data and score it on the test data.

    Prints the classification report followed by a one-line summary of the
    three returned metrics.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features and labels the fitted classifier is scored on.
    class_weights : optional
        Passed through unchanged as ``class_weight`` to
        ``RandomForestClassifier``. Defaults to ``None``.
    random_state : int, default 40
        Seed passed to ``RandomForestClassifier``. The default reproduces the
        value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(accuracy, f1, auc)`` computed on the test data.
    """
    model = RandomForestClassifier(class_weight=class_weights, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the AUC computation
    y_prob = model.predict_proba(X_test)[:, 1]

    # Performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC: {auc:.4f}")
    return accuracy, f1, auc

# Start with one empty list per reported field; each technique later appends one entry to every list
results = {column: [] for column in ('Technique', 'Accuracy', 'F1 Score', 'AUC')}


In [35]:
# Random undersampling: resample the training split only, then score on the untouched test split
print("\nApplying Random Undersampling...")
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
print("After Undersampling:", Counter(y_rus))
accuracy, f1, auc = evaluate_model(X_rus, y_rus, X_test, y_test)
# Record this technique's row in the shared results table
for metric_name, metric_value in (
    ('Technique', 'Random Undersampling'),
    ('Accuracy', accuracy),
    ('F1 Score', f1),
    ('AUC', auc),
):
    results[metric_name].append(metric_value)


Applying Random Undersampling...
After Undersampling: Counter({0: 9942, 1: 9942})
              precision    recall  f1-score   support

           0       0.94      0.81      0.87      8799
           1       0.54      0.81      0.65      2485

    accuracy                           0.81     11284
   macro avg       0.74      0.81      0.76     11284
weighted avg       0.85      0.81      0.82     11284

Accuracy: 0.8066, F1 Score: 0.6487, AUC: 0.8925


In [None]:
# Random oversampling: resample the training split only, then score on the untouched test split
print("\nApplying Random Oversampling...")
ros = RandomOverSampler(random_state=40)
X_ros, y_ros = ros.fit_resample(X_train, y_train)
print("After Oversampling:", Counter(y_ros))
accuracy, f1, auc = evaluate_model(X_ros, y_ros, X_test, y_test)
# Record this technique's row in the shared results table
for metric_name, metric_value in (
    ('Technique', 'Random Oversampling'),
    ('Accuracy', accuracy),
    ('F1 Score', f1),
    ('AUC', auc),
):
    results[metric_name].append(metric_value)




Applying Random Oversampling...
After Oversampling: Counter({0: 35194, 1: 35194})


In [38]:
# SMOTE (Synthetic Minority Oversampling Technique): resample the training split only
print("\nApplying SMOTE...")
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_smote))
accuracy, f1, auc = evaluate_model(X_smote, y_smote, X_test, y_test)
# Record this technique's row in the shared results table
for metric_name, metric_value in (
    ('Technique', 'SMOTE'),
    ('Accuracy', accuracy),
    ('F1 Score', f1),
    ('AUC', auc),
):
    results[metric_name].append(metric_value)



Applying SMOTE...
After SMOTE: Counter({0: 35194, 1: 35194})
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      8799
           1       0.66      0.66      0.66      2485

    accuracy                           0.85     11284
   macro avg       0.78      0.78      0.78     11284
weighted avg       0.85      0.85      0.85     11284

Accuracy: 0.8503, F1 Score: 0.6589, AUC: 0.8939


In [40]:
# Tomek Links: clean the training split only, then score on the untouched test split
print("\nApplying Tomek Links...")
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X_train, y_train)
print("After Tomek Links:", Counter(y_tl))
accuracy, f1, auc = evaluate_model(X_tl, y_tl, X_test, y_test)
# Record this technique's row in the shared results table
for metric_name, metric_value in (
    ('Technique', 'Tomek Links'),
    ('Accuracy', accuracy),
    ('F1 Score', f1),
    ('AUC', auc),
):
    results[metric_name].append(metric_value)



Applying Tomek Links...
After Tomek Links: Counter({0: 33538, 1: 9942})
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      8799
           1       0.74      0.57      0.64      2485

    accuracy                           0.86     11284
   macro avg       0.81      0.76      0.78     11284
weighted avg       0.85      0.86      0.85     11284

Accuracy: 0.8612, F1 Score: 0.6431, AUC: 0.8948


In [42]:
# Class Weighting: no resampling; the classifier is given per-class weights instead
print("\nApplying Class Weighting...")
# The weights are requested for exactly the labels in `classes`, so each weight is
# paired with its actual label rather than with its position. Keying by position
# (enumerate) is only correct when the labels happen to be 0..k-1.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# .tolist() turns the numpy labels into plain Python scalars for use as dict keys
class_weights_dict = dict(zip(classes.tolist(), class_weights))
accuracy, f1, auc = evaluate_model(X_train, y_train, X_test, y_test, class_weights=class_weights_dict)
results['Technique'].append('Class Weighting')
results['Accuracy'].append(accuracy)
results['F1 Score'].append(f1)
results['AUC'].append(auc)

# Gather every recorded technique into a single table and print it under a heading
results_df = pd.DataFrame(data=results)
print("\nPerformance Metrics for Different Resampling Techniques:", results_df, sep="\n")


Applying Class Weighting...
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      8799
           1       0.78      0.50      0.61      2485

    accuracy                           0.86     11284
   macro avg       0.83      0.73      0.76     11284
weighted avg       0.85      0.86      0.85     11284

Accuracy: 0.8596, F1 Score: 0.6108, AUC: 0.8952

Performance Metrics for Different Resampling Techniques:
              Technique  Accuracy  F1 Score       AUC
0  Random Undersampling  0.806629  0.648744  0.892489
1   Random Oversampling  0.859713  0.653989  0.896746
2                 SMOTE  0.850319  0.658857  0.893938
3           Tomek Links  0.861219  0.643118  0.894781
4       Class Weighting  0.859624  0.610811  0.895162
