In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import pickle

In [2]:
data = pd.read_csv("../data/weatherAUS.csv")

data.dropna(inplace=True)

X = data[['MinTemp','MaxTemp','Rainfall','Humidity9am','Humidity3pm']]
y = data['RainTomorrow']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [4]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
models = {
    "Logistic": LogisticRegression(class_weight='balanced'),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier()
}

best_acc = 0
best_model = None

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, pred)
    
    print(name, acc)
    
    if acc > best_acc:
        best_acc = acc
        best_model = model


Logistic 0.7426444523218717
DecisionTree 0.7543424317617866
RandomForest 0.8331265508684863


In [6]:
param_grid = {
    'n_estimators':[50,100,150],
    'max_depth':[3,5,8,None]
}

grid = GridSearchCV(
    RandomForestClassifier(),
    param_grid,
    cv=3
)

grid.fit(X_train_scaled, y_train)

best_model = grid.best_estimator_


In [7]:
pred = best_model.predict(X_test_scaled)

print("Final Accuracy:", accuracy_score(y_test,pred))
print(classification_report(y_test,pred))


Final Accuracy: 0.837025877348458
              precision    recall  f1-score   support

          No       0.85      0.96      0.90      8799
         Yes       0.73      0.41      0.53      2485

    accuracy                           0.84     11284
   macro avg       0.79      0.68      0.71     11284
weighted avg       0.83      0.84      0.82     11284



In [9]:
pickle.dump(best_model, open("../model/best_model.pkl","wb"))
pickle.dump(scaler, open("../model/scaler.pkl","wb"))
pickle.dump(list(X.columns), open("../model/feature_columns.pkl","wb"))