In [1]:
# 📌 Cell 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Algorithms for comparison
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb  # If you have xgboost installed
from xgboost import XGBClassifier
import joblib


In [6]:
# 📌 Cell 2: Define constants
# Location of the weather CSV. This is a relative path, so it is resolved
# against the notebook's current working directory. It is used as the default
# `path` argument of load_and_clean().
DATA_PATH = "Weather.csv" 

In [7]:
# 📌 Cell 3: Load and clean data
def load_and_clean(path=DATA_PATH):
    """Read the weather CSV and derive calendar features from ``Date``.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file handed to ``pandas.read_csv``. Defaults to the notebook-level
        ``DATA_PATH`` (bound when this function is defined).

    Returns
    -------
    pandas.DataFrame
        The loaded frame without the ``@dropdown`` column (when the file has
        one), with ``Date`` parsed to datetime and two extra columns, ``Day``
        and ``Month``, taken from the parsed dates.

    Raises
    ------
    KeyError
        If the file has no ``Date`` column.
    """
    # errors='ignore' makes the drop a no-op when the column is absent.
    frame = pd.read_csv(path).drop(columns=['@dropdown'], errors='ignore')

    # Unparseable dates are coerced to NaT instead of raising, which leaves
    # NaN in the derived Day/Month values for those rows.
    parsed_dates = pd.to_datetime(frame['Date'], errors='coerce')
    frame['Date'] = parsed_dates
    frame['Day'] = parsed_dates.dt.day
    frame['Month'] = parsed_dates.dt.month
    # Year is intentionally not extracted (considered too high-cardinality).
    return frame

# Load the cleaned frame from the default path (DATA_PATH) and preview the
# first rows as the cell's rich output.
df = load_and_clean()
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Day,Month
0,2008-12-01,Delhi,13.4,22.9,0.6,,,W,44.0,W,...,1007.7,1007.1,8.0,,16.9,21.8,No,No,1,12
1,2008-12-02,Delhi,7.4,25.1,0.0,,,WNW,44.0,NNW,...,1010.6,1007.8,,,17.2,24.3,No,No,2,12
2,2008-12-03,Delhi,12.9,25.7,0.0,,,WSW,46.0,W,...,1007.6,1008.7,,2.0,21.0,23.2,No,No,3,12
3,2008-12-04,Delhi,9.2,28.0,0.0,,,NE,24.0,SE,...,1017.6,1012.8,,,18.1,26.5,No,No,4,12
4,2008-12-05,Delhi,17.5,32.3,1.0,,,W,41.0,ENE,...,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,5,12


In [8]:
# 📌 Cell 4: Prepare target & features
def prepare_target_and_features(df):
    """Build the model inputs ``X`` and the binary target ``y``.

    ``RainTomorrow`` and ``RainToday`` are encoded as ``'No' -> 0`` and
    ``'Yes' -> 1``. Rows whose target is missing *or* is any label other than
    ``'No'``/``'Yes'`` are dropped, so ``y`` never contains NaN. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``RainTomorrow`` column plus any subset of the
        candidate feature columns listed in the body.

    Returns
    -------
    X : pandas.DataFrame
        The candidate feature columns that are present in ``df``, in the
        order they are listed below, keeping the index of the retained rows.
    y : pandas.Series
        int64 target: 0 for ``'No'``, 1 for ``'Yes'``.
    feature_cols : list of str
        Names of the columns in ``X``.

    Raises
    ------
    KeyError
        If ``df`` has no ``RainTomorrow`` column.
    """
    yes_no = {'No': 0, 'Yes': 1}

    # Encode first, then filter. Labels outside the mapping become NaN under
    # .map(), so filtering on the *encoded* target removes both missing and
    # unrecognised labels. Dropping NaN before mapping (as was done before)
    # would let unrecognised labels reach y as NaN.
    target = df['RainTomorrow'].map(yes_no)
    keep = target.notna()

    df = df.loc[keep].copy()
    df['RainTomorrow'] = target[keep].astype('int64')

    # RainToday is treated as optional, like every other feature column.
    # Missing or unrecognised values stay NaN here.
    if 'RainToday' in df.columns:
        df['RainToday'] = df['RainToday'].map(yes_no)

    feature_cols = [
        'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
        'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
        'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
        'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
        'Temp9am', 'Temp3pm', 'RainToday', 'Day', 'Month'
    ]
    # Keep only the candidates that actually exist in this frame.
    feature_cols = [c for c in feature_cols if c in df.columns]
    X = df[feature_cols]
    y = df['RainTomorrow']
    return X, y, feature_cols

# Build the feature matrix X, the encoded RainTomorrow target y, and the list
# of feature column names, then preview the first rows of X.
X, y, feature_cols = prepare_target_and_features(df)
X.head()


Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Day,Month
0,Delhi,13.4,22.9,0.6,,,W,44.0,W,WNW,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,1,12
1,Delhi,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,25.0,1010.6,1007.8,,,17.2,24.3,0.0,2,12
2,Delhi,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,3,12
3,Delhi,9.2,28.0,0.0,,,NE,24.0,SE,E,...,16.0,1017.6,1012.8,,,18.1,26.5,0.0,4,12
4,Delhi,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,5,12


In [9]:
# 📌 Cell 5: Define Preprocessor
def get_preprocessor(numeric_features, categorical_features):
    """Build the (unfitted) column-wise preprocessing step.

    Parameters
    ----------
    numeric_features : list
        Columns sent through median imputation followed by standard scaling.
    categorical_features : list
        Columns sent through most-frequent imputation followed by one-hot
        encoding (dense output; categories unseen during fit are ignored).

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Transformer with a ``'num'`` branch and a ``'cat'`` branch.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])


In [10]:
# 📌 Cell 6: Train/Test split
# Partition the feature columns by dtype: numeric dtypes feed the numeric
# branch of the preprocessor, every other column is treated as categorical.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = [col for col in X.columns if col not in numeric_features]

# Hold out 20% of the rows, stratified on the target, with a fixed seed so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

preprocessor = get_preprocessor(numeric_features, categorical_features)


In [None]:
#  Evaluation function
def evaluate_model(pipeline, X_train, y_train, X_test, y_test, model_name):
    """Fit one pipeline, print train/test metrics, and return a summary.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place.
        ``predict_proba`` (preferred) or ``decision_function`` is used to
        score the test set for ROC AUC when the object exposes one of them.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for the reported test metrics.
    model_name : str
        Label used in the printed output and in the returned dict.

    Returns
    -------
    dict
        ``{'model_name': ..., 'pipeline': ..., 'roc_auc': ...}`` where
        ``pipeline`` is the fitted object that was passed in. ``roc_auc`` is
        the sentinel ``0.0`` (not a measured value) when the model exposes
        neither ``predict_proba`` nor ``decision_function``.
    """
    print(f"\n--- Training {model_name} ---")
    pipeline.fit(X_train, y_train)

    # Hard class predictions for the accuracy figures and the reports.
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)

    # Continuous scores for ROC AUC. roc_auc_score accepts either probability
    # estimates or non-thresholded decision values, so models without
    # predict_proba are still ranked properly via decision_function.
    if hasattr(pipeline, 'predict_proba'):
        # Column 1 is taken as the positive-class probability (binary target).
        y_score_test = pipeline.predict_proba(X_test)[:, 1]
    elif hasattr(pipeline, 'decision_function'):
        y_score_test = pipeline.decision_function(X_test)
    else:
        y_score_test = None

    # Evaluation
    train_acc = (y_pred_train == y_train).mean()
    test_acc = (y_pred_test == y_test).mean()
    # 0.0 is kept as the "no score available" sentinel for backward
    # compatibility with callers that compare roc_auc values numerically.
    roc_auc = roc_auc_score(y_test, y_score_test) if y_score_test is not None else 0.0

    print(f"{model_name} Accuracy (Train): {train_acc:.4f}")
    print(f"{model_name} Accuracy (Test): {test_acc:.4f}")
    if y_score_test is not None:
        print(f"{model_name} ROC AUC: {roc_auc:.4f}")
    else:
        # Do not present the sentinel as if it were a measured AUC.
        print(f"{model_name} ROC AUC: n/a (model exposes no probability or decision scores)")

    # Detailed test metrics
    print("\nTest Classification Report:")
    print(classification_report(y_test, y_pred_test))
    print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

    return {'model_name': model_name, 'pipeline': pipeline, 'roc_auc': roc_auc}


In [12]:
#  Define Models
# Candidate classifiers keyed by display name. Iteration follows the order in
# which they are listed here.
models = {
    'Logistic Regression': LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        random_state=42,
    ),
    'Decision Tree Classifier': DecisionTreeClassifier(
        random_state=42,
    ),
    'xgboost Classifier': XGBClassifier(
        eval_metric='logloss',
        random_state=42,
    ),
    'Random Forest Classifier': RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    ),
    'KNeighbors Classifier': KNeighborsClassifier(
        n_neighbors=5,
    ),
    # Optional SVM candidate, disabled by default:
    # 'SVC': SVC(probability=True),
}

