In [None]:
import json
import os
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import RandomizedSearchCV
import joblib
from scipy.stats import randint
from scipy.stats import loguniform
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Set up the root path for the project.
# Path().resolve() is the current working directory, so root_path is its parent.
# NOTE(review): assumes the kernel is started from a direct subdirectory of the
# project root (e.g. a notebooks folder) — confirm, otherwise the config and
# data paths built from root_path will not resolve.
root_path: Path = Path().resolve().parent

In [None]:
# Load the configuration file.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding (relevant if the config ever holds non-ASCII text).
# The 'locations' key of this config is used later to filter the dataset.
config_path: str = os.path.join(root_path, 'config.json')
with open(config_path, 'r', encoding='utf-8') as config_file:
    config: Dict[str, Any] = json.load(config_file)

In [None]:
# Load the weather data from <root_path>/data/csv/weatherAUS.csv into a DataFrame.
# Column names are still the CSV's original headers at this point.
file_path: str = os.path.join(root_path, 'data', 'csv', 'weatherAUS.csv')
df: pd.DataFrame = pd.read_csv(file_path)

In [None]:
# Map the CSV's CamelCase headers to the snake_case names used by every later
# cell (feature selection, plots and the example prediction input).
column_rename_dict: Dict[str, str] = {
    'Date': 'date',
    'Location': 'location',
    'MinTemp': 'min_temp',
    'MaxTemp': 'max_temp',
    'Rainfall': 'rainfall',
    'Evaporation': 'evaporation',
    'Sunshine': 'sunshine',
    'WindGustDir': 'wind_gust_dir',
    'WindGustSpeed': 'wind_gust_speed',
    'WindDir9am': 'wind_dir_9am',
    'WindDir3pm': 'wind_dir_3pm',
    'WindSpeed9am': 'wind_speed_9am',
    'WindSpeed3pm': 'wind_speed_3pm',
    'Humidity9am': 'humidity_9am',
    'Humidity3pm': 'humidity_3pm',
    'Pressure9am': 'pressure_9am',
    'Pressure3pm': 'pressure_3pm',
    'Cloud9am': 'cloud_9am',
    'Cloud3pm': 'cloud_3pm',
    'Temp9am': 'temp_9am',
    'Temp3pm': 'temp_3pm',
    'RainToday': 'rain_today',
    'RainTomorrow': 'rain_tomorrow'
}

# Rename columns (headers not listed in the mapping are left unchanged)
df = df.rename(columns=column_rename_dict)

In [None]:
# Filter the DataFrame to include only the locations listed in the config.
# .copy() makes the filtered frame independent of the original, so the column
# assignments done in later cells (e.g. parsing 'date') write to real data and
# do not raise pandas' SettingWithCopyWarning (which the global warnings filter
# would silently hide).
locations_of_interest: List[str] = config['locations']
df = df[df['location'].isin(locations_of_interest)].copy()

In [None]:
# Convert the 'date' column to datetime so the rainfall plot gets a real
# time axis; the column is dropped again before modelling.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Display the first few rows of the filtered, renamed DataFrame as a sanity check
df.head()

In [None]:
# Display the shape (rows, columns) of the DataFrame after the location filter
df.shape

In [None]:
# Data info: column dtypes and non-null counts
df.info()

In [None]:
# Descriptive statistics of the filtered data
df.describe()

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Drop rows where the target ('rain_tomorrow') or the 'rain_today' flag is missing.
# Note that 'rain_today' is a feature, not a target: it stays in X below.
# Missing values in the remaining features are imputed by the preprocessor.
df = df.dropna(subset=['rain_today', 'rain_tomorrow'])

In [None]:
# Rainfall over Time
# df can hold several locations (it is filtered by a list of locations) and is
# never sorted by date, so a single line through all rows may jump backwards in
# time. Draw one date-sorted series per location instead.
plt.figure(figsize=(12, 6))
for location_name, location_df in df.sort_values('date').groupby('location'):
    plt.plot(location_df['date'], location_df['rainfall'], label=location_name, alpha=0.7)
plt.title('Rainfall over Time')
plt.xlabel('Date')
plt.ylabel('Rainfall (mm)')
plt.xticks(rotation=45)
plt.legend(title='Location')
plt.tight_layout()
plt.show()

In [None]:
# Distribution of Min and Max Temperatures
# Both histograms (with KDE overlay) are drawn on the same axes for comparison.
plt.figure(figsize=(10, 6))
for column, color, label in (
    ('min_temp', 'blue', 'Min Temp'),
    ('max_temp', 'red', 'Max Temp'),
):
    sns.histplot(df[column], kde=True, color=color, alpha=0.5, label=label)
plt.title('Distribution of Min and Max Temperatures')
plt.xlabel('Temperature (°C)')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Wind Gust Speed by Location
# One box per location shows the spread of the daily gust speeds.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='location', y='wind_gust_speed')
plt.title('Wind Gust Speed by Location')
plt.xlabel('Location')
plt.ylabel('Wind Gust Speed (km/h)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Correlation Heatmap of Numerical Features
# Pairwise correlations are computed over the int64/float64 columns only.
corr = df.select_dtypes(include=['int64', 'float64']).corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Heatmap of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
def create_preprocessor(numerical_columns: List[str],
                        categorical_columns: List[str]) -> ColumnTransformer:
    """Build the (unfitted) feature transformer used by every model pipeline.

    Args:
        numerical_columns: Columns that are median-imputed and then standardised.
        categorical_columns: Columns that are imputed with the most frequent
            value and then one-hot encoded (first category dropped).

    Returns:
        A ColumnTransformer with a 'num' and a 'cat' branch. Columns not listed
        in either argument are not passed to any branch.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
    ]
    # NOTE(review): drop='first' combined with handle_unknown='ignore' appears to
    # encode an unseen category the same way as the dropped one (all zeros) —
    # confirm against the scikit-learn version in use.
    categorical_steps = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numerical_columns),
            ('cat', Pipeline(steps=categorical_steps), categorical_columns),
        ])

In [None]:
def create_model_pipeline(model: BaseEstimator, preprocessor: ColumnTransformer, 
                          oversample: bool = False) -> ImbPipeline:
    """Chain preprocessing, optional random oversampling and the estimator.

    Args:
        model: Estimator placed last, under the step name 'model'.
        preprocessor: Feature transformer placed first, under 'preprocessor'.
        oversample: When True, a RandomOverSampler(random_state=42) step named
            'sampler' is inserted between the preprocessor and the model.

    Returns:
        An unfitted imblearn Pipeline with the steps in the order above.
    """
    sampler_steps = [('sampler', RandomOverSampler(random_state=42))] if oversample else []
    return ImbPipeline([('preprocessor', preprocessor), *sampler_steps, ('model', model)])

In [None]:
def compare_models(models: Dict[str, BaseEstimator], X: pd.DataFrame, y: pd.Series, 
                   preprocessor: ColumnTransformer, 
                   param_grids: Dict[str, Dict[str, Any]],
                   n_iter: int = 10,
                   cv: int = 5) -> Dict[str, Dict[str, Any]]:
    """Tune every model with and without oversampling and score it on a hold-out set.

    The data is split 80/20 (stratified, random_state=42). For each entry of
    ``models`` two randomized searches (scoring='f1') are fitted on the training
    part: one on the plain pipeline, one with random oversampling.

    Args:
        models: Display name -> estimator.
        X: Feature frame.
        y: Target values.
        preprocessor: Transformer used as the first step of every pipeline.
        param_grids: Display name -> search space keyed by pipeline parameter name.
        n_iter: Number of parameter settings sampled per search.
        cv: Number of cross-validation folds.

    Returns:
        A dict keyed "<name> no oversampling" / "<name> with oversampling", each
        holding the metrics from ``evaluate_model`` on the test part plus the
        search's ``best_params`` under the key "best_params".
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    results = {}
    for name, model in models.items():
        # Fit both searches first, then evaluate them in the same order.
        searches = {}
        for variant, oversample in (("no oversampling", False), ("with oversampling", True)):
            search = RandomizedSearchCV(
                create_model_pipeline(model, preprocessor, oversample=oversample),
                param_grids[name],
                n_iter=n_iter,
                cv=cv,
                n_jobs=-1,
                random_state=42,
                scoring='f1',
            )
            search.fit(X_train, y_train)
            searches[f"{name} {variant}"] = search

        for key, search in searches.items():
            results[key] = evaluate_model(search.best_estimator_, X_test, y_test)
            results[key]["best_params"] = search.best_params_

    return results

In [None]:
def evaluate_model(model: BaseEstimator, X_test: pd.DataFrame, y_test: np.ndarray) -> Dict[str, float]:
    """Score a fitted binary classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``, ``predict_proba`` and ``score``.
        X_test: Held-out features.
        y_test: Held-out true labels.

    Returns:
        Dict with 'accuracy' (``model.score``), the support-weighted average
        'precision', 'recall' and 'f1-score' from ``classification_report``, and
        'auc-roc' computed from the predicted probability of the second class.
        Note that the weighted F1 reported here is not the same quantity as the
        ``scoring='f1'`` used for the search in ``compare_models``.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Build the report once; all three averaged metrics come from the same row.
    weighted_avg = classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )['weighted avg']

    return {
        'accuracy': model.score(X_test, y_test),
        'precision': weighted_avg['precision'],
        'recall': weighted_avg['recall'],
        'f1-score': weighted_avg['f1-score'],
        'auc-roc': roc_auc_score(y_test, y_pred_proba)
    }

In [None]:
# Prepare data for modeling: the target is 'rain_tomorrow'; 'date' is excluded
# from the features, every other column is kept.
y: pd.Series = df['rain_tomorrow']
X: pd.DataFrame = df.drop(columns=['rain_tomorrow', 'date'])

In [None]:
# Define column types for preprocessing: int64/float64 columns go to the
# numeric branch, object/category columns to the one-hot branch. Columns of any
# other dtype end up in neither list.
numerical_columns: List[str] = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns: List[str] = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
# Display class distribution as fractions; an uneven split is the motivation
# for also trying the oversampled pipelines below.
y.value_counts(normalize=True)

In [None]:
# Encode target variable as integers.
# The encoder is fitted on the original column rather than on `y` itself so
# that re-running this cell is idempotent: fitting on an already-encoded `y`
# would make `le` learn the integers and inverse_transform would no longer
# return the original labels.
le = LabelEncoder()
y = le.fit_transform(df['rain_tomorrow'])

In [None]:
# Define models to compare. The dict keys are display names; they must match
# the keys of `param_grids` and are used to build the result labels.
# random_state is fixed so repeated runs give the same fits.
models: Dict[str, BaseEstimator] = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

In [None]:
def _tree_search_space() -> Dict[str, Any]:
    """Return a fresh search space for the tree-growth settings shared by the tree models."""
    return {
        'model__max_depth': [3, 5, 7, 9, None],
        'model__min_samples_split': randint(2, 20),
        'model__min_samples_leaf': randint(1, 20),
    }


# Randomized-search spaces, keyed by the same display names as `models`.
# Parameter names carry the 'model__' prefix because they address the 'model'
# step of the pipeline. Lists are sampled from directly; scipy distributions
# are drawn from.
param_grids = {
    'Logistic Regression': {
        'model__C': loguniform(1e-5, 100),
        'model__penalty': ['l2'],
        'model__solver': ['lbfgs'],
        'model__max_iter': [1000, 2000, 5000],
    },
    'Decision Tree': _tree_search_space(),
    'Random Forest': {
        'model__n_estimators': randint(10, 200),
        **_tree_search_space(),
    },
}

In [None]:
# Create the (unfitted) preprocessor; it is fitted only inside the pipelines
preprocessor = create_preprocessor(numerical_columns, categorical_columns)

In [None]:
# Run the comparison. This is the expensive cell: 3 models x 2 variants x
# 20 sampled settings x 5 folds = 600 cross-validation fits, plus one refit
# per search.
results = compare_models(models, X, y, preprocessor, param_grids, n_iter=20, cv=5)

In [None]:
# Print the hold-out metrics and the selected hyper-parameters of every candidate
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        line = (
            f"\tBest parameters: {value}"
            if metric == 'best_params'
            else f"\t\t{metric}: {value:.4f}"
        )
        print(line)

In [None]:
# Visualize model comparison
def plot_model_comparison(results: Dict[str, Dict[str, Any]], metric: str = 'f1-score') -> None:
    """Draw one bar per entry of ``results`` showing its value for ``metric``.

    Args:
        results: Candidate label -> metrics dict.
        metric: Key of the metric to plot; must be present in every metrics dict.
    """
    labels = list(results)
    heights = [results[label][metric] for label in labels]

    plt.figure(figsize=(12, 6))
    sns.barplot(x=labels, y=heights)
    plt.title(f'Model Comparison - {metric}')
    plt.ylabel(metric.capitalize())
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


plot_model_comparison(results)

In [None]:
# Pick the candidate with the highest weighted F1 on the hold-out split.
# NOTE(review): the winner is chosen on the same hold-out scores that are
# reported, so the printed F1 is an optimistic estimate for the chosen model.
best_model_name = max(results, key=lambda x: results[x]['f1-score'])
best_f1_score = results[best_model_name]['f1-score']
best_params = results[best_model_name]['best_params']

print(f"Best overall model: {best_model_name}")
print(f"Best F1-score: {best_f1_score:.4f}")
print(f"Best parameters: {best_params}")

In [None]:
def train_best_model(best_model_name: str, X: pd.DataFrame, y: pd.Series) -> BaseEstimator:
    """Refit the winning configuration on all of ``X``/``y`` and return the pipeline.

    ``best_model_name`` is a key of ``results`` as written by ``compare_models``,
    i.e. ``"<model name> no oversampling"`` or ``"<model name> with oversampling"``
    (the underscore-separated spellings are accepted as well). The tuned
    hyper-parameters are read from the notebook-level ``best_params``, the
    estimator template from ``models`` and the transformer from ``preprocessor``.

    Args:
        best_model_name: Label of the candidate to retrain.
        X: Feature frame to fit on.
        y: Target values to fit on.

    Returns:
        The fitted pipeline (with the oversampling step only if the label says so).

    Raises:
        KeyError: If the label does not resolve to an entry of ``models``.
    """
    # Split "<model name><suffix>" into the model name and the oversampling flag.
    base_name, oversample = best_model_name, False
    for suffix, flag in (
        (' with oversampling', True),
        (' no oversampling', False),
        ('_with_oversampling', True),
        ('_no_oversampling', False),
    ):
        if best_model_name.endswith(suffix):
            base_name, oversample = best_model_name[:-len(suffix)], flag
            break

    if base_name not in models:
        raise KeyError(
            f"Cannot resolve model {base_name!r} from {best_model_name!r}; "
            f"known models: {list(models)}"
        )
    template = models[base_name]

    # Keep only the parameters addressed to the 'model' step and strip the prefix.
    prefix = 'model__'
    tuned_params = {
        key[len(prefix):]: value
        for key, value in best_params.items()
        if key.startswith(prefix)
    }

    # Start from the template's own settings (e.g. random_state) so they are not
    # lost, then override them with the tuned values.
    best_model = type(template)(**{**template.get_params(deep=False), **tuned_params})

    pipeline = create_model_pipeline(best_model, preprocessor, oversample=oversample)
    pipeline.fit(X, y)
    return pipeline

# Re-derive the winner and its parameters from `results` (same selection rule
# as the summary cell) and refit it on the complete dataset.
# train_best_model reads `best_params` from the notebook namespace, so it must
# be assigned before the call.
best_model_name = max(results, key=lambda x: results[x]['f1-score'])
best_params = results[best_model_name]['best_params']
best_model = train_best_model(best_model_name, X, y)

In [None]:
# Persist the fitted pipeline under <root_path>/model/.
# The directory is created first because joblib.dump does not create missing
# parent directories.
# NOTE(review): only the pipeline is saved; the LabelEncoder `le` needed to
# turn predictions back into labels is not persisted.
model_dir = os.path.join(root_path, "model")
os.makedirs(model_dir, exist_ok=True)
model_filename = os.path.join(model_dir, "best_model.joblib")
joblib.dump(best_model, model_filename)
print(f"Best model saved to {model_filename}")

In [None]:
def predict_rain_tomorrow(model: BaseEstimator, input_data: Dict[str, Any], label_encoder: LabelEncoder) -> str:
    """Predict the rain-tomorrow label for a single observation.

    Args:
        model: Fitted pipeline whose ``predict`` accepts a one-row DataFrame.
        input_data: Feature name -> value for one day.
        label_encoder: Fitted encoder used to map the prediction back to a label.

    Returns:
        The decoded label for the single input row.
    """
    encoded_prediction = model.predict(pd.DataFrame([input_data]))
    decoded_labels = label_encoder.inverse_transform(encoded_prediction)
    return decoded_labels[0]

In [None]:
# Example input data: one day of observations, keyed by the renamed feature
# columns (everything in X, i.e. without 'date' and 'rain_tomorrow').
# NOTE(review): 'Sydney' is only a location the model has seen if it is listed
# in config['locations'] — confirm.
example_input = {
    'location': 'Sydney',
    'min_temp': 15.0,
    'max_temp': 25.0,
    'rainfall': 0.0,
    'evaporation': 4.8,
    'sunshine': 8.5,
    'wind_gust_dir': 'SE',
    'wind_gust_speed': 30,
    'wind_dir_9am': 'E',
    'wind_dir_3pm': 'SE',
    'wind_speed_9am': 10,
    'wind_speed_3pm': 15,
    'humidity_9am': 70,
    'humidity_3pm': 55,
    'pressure_9am': 1015.0,
    'pressure_3pm': 1013.0,
    'cloud_9am': 3,
    'cloud_3pm': 4,
    'temp_9am': 18.0,
    'temp_3pm': 23.5,
    'rain_today': 'No'
}

# Run the refitted pipeline on the example and decode the result with `le`
prediction = predict_rain_tomorrow(best_model, example_input, le)
print(f"Rain Tomorrow Prediction: {prediction}")