In [1]:
import os

def set_project_rood_dir():
    """Change the working directory to the parent of the nearest notebook folder.

    Starting at the current working directory, walk up the path one component
    at a time until a component named ``notebook`` or ``notebooks`` is found,
    then ``os.chdir`` into that component's parent and print the result.

    Path handling goes through ``os.path`` so that it works with the platform's
    own separator and so that a notebook folder sitting directly under the
    filesystem root resolves to the root itself rather than to an empty string.

    Raises:
        ValueError: if no ``notebook``/``notebooks`` component exists anywhere
            in the current working directory's path.
    """
    # Notebooks have no usable __file__, so the kernel's working directory is
    # the starting point of the search.
    notebook_dir = os.getcwd()

    probe = notebook_dir
    while True:
        parent, name = os.path.split(probe)
        if name in ("notebook", "notebooks"):
            root = parent
            break
        if parent == probe:
            # os.path.split stops making progress at the filesystem root:
            # nothing left to inspect, so the folder was not found.
            print("Current directory: ", notebook_dir)
            raise ValueError("Unable to find notebook/s directory in path")
        probe = parent

    os.chdir(root)
    print("Successfully changed working directory: ", root)
    print("Current working directory: ", os.getcwd())


# Move to the project root so the relative paths used by later cells
# (e.g. "data/raw/train.csv") resolve from there.
set_project_rood_dir()

Successfully changed working directory:  /Users/suraj/vscode/aiml/kaggle/binary_prediction_rainfall_dataset
Current working directory:  /Users/suraj/vscode/aiml/kaggle/binary_prediction_rainfall_dataset


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# CSV locations, relative to the working directory set by the first cell
DATA_PATH = "data/raw/train.csv"
TEST_DATA_PATH = "data/raw/test.csv"

# Both files are read the same way: the first CSV column becomes the index
df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (DATA_PATH, TEST_DATA_PATH)
)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np

In [4]:
# Derived features: one difference and two ratios built from the raw columns

# Spread between the daily maximum and minimum temperature
df['temp_range'] = df['maxtemp'].sub(df['mintemp'])

# NOTE(review): a dewpoint of exactly 0 would produce inf here - confirm the data never contains it
df['humidity_dewpoint_ratio'] = df['humidity'].div(df['dewpoint'])

# The tiny offset keeps the denominator non-zero when sunshine is 0
df['cloud_sunshine_ratio'] = df['cloud'].div(df['sunshine'].add(0.0000001))

In [5]:
# Convert wind direction to sine and cosine components.
# The angle is multiplied by pi / 180 (degrees -> radians) before the trig
# functions are applied.
#
# The raw column is dropped at the end, so the whole step is guarded: running
# this cell a second time is a no-op instead of a KeyError on 'winddirection'.
if 'winddirection' in df.columns:
    df['wind_direction_sin'] = np.sin(df['winddirection'] * np.pi / 180)
    df['wind_direction_cos'] = np.cos(df['winddirection'] * np.pi / 180)

    # Drop original wind direction
    df.drop('winddirection', axis=1, inplace=True)

In [6]:
# Coarse calendar buckets derived from 'day' by integer division.
# Column name -> bucket width in days; 31 and 92 are fixed-width approximations
# of a month and a quarter.
# NOTE(review): presumably 'day' is a running day counter - confirm against the dataset.
for bucket_name, bucket_days in {'week': 7, 'month': 31, 'quater': 92}.items():
    df[bucket_name] = df['day'] // bucket_days

In [7]:
# Target is the 'rainfall' column; every remaining column is a predictor
y = df['rainfall']
X = df.drop(columns=['rainfall'])


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ between splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Preprocessing shared by every fit: median imputation, Yeo-Johnson power
# transform, then standardisation. No feature-selection step is included here.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('power_transform', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),
]

# Random forest (seeded) as the final estimator of the pipeline
pipeline = Pipeline(
    steps=preprocessing_steps + [('classifier', RandomForestClassifier(random_state=42))]
)

In [10]:
# Search space for the 'classifier' step of the pipeline:
# 3 forest sizes x 3 depth limits (None = unlimited depth)
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 5, 10],
)

# 5-fold cross-validated search, ranked by ROC-AUC
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [18]:
# Score the tuned search object on the held-out split.
# Column 1 of predict_proba is used as the score passed to ROC-AUC.
holdout_probabilities = grid_search.predict_proba(X_test)
y_pred = holdout_probabilities[:, 1]
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred)
print(f"ROC-AUC Score: {auc_score}")


ROC-AUC Score: 0.8643081056874161


In [16]:
# Use the best model for predictions
# `grid_search` is re-assigned in several cells of this notebook, so this picks
# up the best estimator of whichever search was fitted most recently.
best_model = grid_search.best_estimator_

In [17]:
# Show feature importances of the tuned classifier, most important first.
importances = best_model.named_steps['classifier'].feature_importances_

# Label the importances with the columns the pipeline was actually fitted on.
# `X.columns` is only correct when the model saw every column of X; after a fit
# on a reduced or unnamed feature matrix the lengths differ and building the
# DataFrame would fail (or mislabel rows), so fall back step by step.
fitted_names = getattr(best_model.named_steps.get('imputer'), 'feature_names_in_', None)
if fitted_names is not None and len(fitted_names) == len(importances):
    features = fitted_names
elif len(X.columns) == len(importances):
    features = X.columns
else:
    # Names are unknown (e.g. the model was fitted on a plain array): use positions
    features = [f"feature_{i}" for i in range(len(importances))]

importances_df = pd.DataFrame({'feature': features, 'importance': importances})
importances_df = importances_df.sort_values('importance', ascending=False)
print(importances_df)

                    feature  importance
7                     cloud    0.324043
12     cloud_sunshine_ratio    0.216575
8                  sunshine    0.146682
6                  humidity    0.113595
11  humidity_dewpoint_ratio    0.030373
5                  dewpoint    0.027834
1                  pressure    0.019449
2                   maxtemp    0.018127
9                 windspeed    0.015785
3               temparature    0.015738
10               temp_range    0.015135
4                   mintemp    0.015010
0                       day    0.012294
15                     week    0.009372
14       wind_direction_cos    0.008420
13       wind_direction_sin    0.006624
16                    month    0.003566
17                   quater    0.001378


In [55]:
# Sanity check: number of importance values produced by the fitted classifier
len(importances)

15

In [54]:
# Sanity check: number of columns in X; this must equal len(importances) for
# the importances to be labelled with X.columns
len(X.columns)

18

In [15]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced per-class weights for the training labels.
# These values are not passed to the search below, which uses the string
# option 'balanced' instead.
label_values = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=label_values, y=y_train)

# Same forest search space as before, with class weighting switched on
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 5, 10],
    classifier__class_weight=['balanced'],
)

# 5-fold cross-validated search, ranked by ROC-AUC
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)


In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Candidate estimators, each seeded; they are fitted on the raw feature matrix
models = [
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    LogisticRegression(max_iter=1000, random_state=42),
]

# Fit every candidate on the training split and collect column 1 of its
# predicted probabilities for the held-out split
y_pred_probas = []
for model in models:
    model.fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    y_pred_proba = class_probabilities[:, 1]
    y_pred_probas.append(y_pred_proba)

# "Stacking" here is a plain unweighted average across the models
y_pred_stacked = np.asarray(y_pred_probas).mean(axis=0)

# ROC-AUC of the averaged probabilities
auc_score_stacked = roc_auc_score(y_true=y_test, y_score=y_pred_stacked)
print(f"Stacked Models ROC-AUC Score: {auc_score_stacked}")


Stacked Models ROC-AUC Score: 0.8659150180448355


In [20]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest

from functools import partial

# mutual_info_classif is randomized, so without a fixed seed the set of
# selected features (and every score below) can change from run to run.
# Pin the seed once and use the same scorer everywhere in this cell.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Calculate mutual information between each feature and the target
mutual_info = seeded_mutual_info(X_train, y_train)

# Select top K features (fitted on the training split only, then applied to the test split)
# NOTE(review): the selector is fitted before cross-validation, so every CV fold
# inside the grid search has already influenced which features were kept.
selector = SelectKBest(seeded_mutual_info, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Re-train model with selected features
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 5, 10],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train_selected, y_train)

# Evaluate with selected features
y_pred_selected = grid_search.predict_proba(X_test_selected)[:, 1]
auc_score_selected = roc_auc_score(y_test, y_pred_selected)
print(f"Selected Features ROC-AUC Score: {auc_score_selected}")


Selected Features ROC-AUC Score: 0.86420273438529


In [23]:
# Map the selector's boolean support mask back onto the column names of X
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

In [24]:
# Display the names of the columns kept by the selector
selected_features

Index(['day', 'temparature', 'dewpoint', 'humidity', 'cloud', 'sunshine',
       'temp_range', 'humidity_dewpoint_ratio', 'cloud_sunshine_ratio',
       'week'],
      dtype='object')

In [25]:
# Class balance of the target: number of rows per 'rainfall' value
df['rainfall'].value_counts()

rainfall
1    1650
0     540
Name: count, dtype: int64

In [31]:
from imblearn.ensemble import EasyEnsembleClassifier

from functools import partial

# Create pipeline: same preprocessing as before (median imputation, Yeo-Johnson
# power transform, standardisation) with imblearn's EasyEnsembleClassifier as
# the final estimator.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power_transform', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),
    ('classifier', EasyEnsembleClassifier(random_state=42))  # Using EasyEnsembleClassifier from imblearn
])

# mutual_info_classif is randomized, so without a fixed seed the set of
# selected features (and every score below) can change from run to run.
# Pin the seed once and use the same scorer everywhere in this cell.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Calculate mutual information between each feature and the target
mutual_info = seeded_mutual_info(X_train, y_train)

# Select top K features (fitted on the training split only, then applied to the test split)
# NOTE(review): the selector is fitted before cross-validation, so every CV fold
# inside the grid search has already influenced which features were kept.
selector = SelectKBest(seeded_mutual_info, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Re-train model with selected features
param_grid = {
    'classifier__n_estimators': [100,125, 150, 200],  # Adjusted for EasyEnsembleClassifier
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train_selected, y_train)

# Evaluate with selected features
y_pred_selected = grid_search.predict_proba(X_test_selected)[:, 1]
auc_score_selected = roc_auc_score(y_test, y_pred_selected)
print(f"Selected Features ROC-AUC Score: {auc_score_selected}")


Selected Features ROC-AUC Score: 0.8678380443086324


In [None]:
# Pull the winning pipeline out of the latest search and show how many
# estimators its 'classifier' step ended up with
best_model = grid_search.best_estimator_
winning_classifier = best_model.named_steps['classifier']
winning_classifier.n_estimators

100