In [1]:
import os

def set_project_rood_dir():
    notebook_dir = os.path.dirname(os.path.abspath("__file__"))
    paths = notebook_dir.split("/")

    # remove directory unless notebook/s directory is found
    while len(paths) > 0:
        if paths[-1] == 'notebook' or paths[-1] == 'notebooks':
            paths.pop()
            break
        paths.pop()

    # show error if paths is empty
    if len(paths) == 0:
        print("Current directory: ", notebook_dir)
        raise ValueError("Unable to find notebook/s directory in path")

    root = "/".join(paths)
    os.chdir(root)
    print("Successfully changed working directory: ", root)
    print("Current working directory: ", os.getcwd())


set_project_rood_dir()

Successfully changed working directory:  /Users/suraj/vscode/aiml/kaggle/binary_prediction_rainfall_dataset
Current working directory:  /Users/suraj/vscode/aiml/kaggle/binary_prediction_rainfall_dataset


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

DATA_PATH = "data/raw/train.csv"
TEST_DATA_PATH = "data/raw/test.csv"

df = pd.read_csv(DATA_PATH, index_col=0)
test_df = pd.read_csv(TEST_DATA_PATH, index_col=0)

In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Split data into features and target
X = df.drop('rainfall', axis=1)
y = df['rainfall']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE oversampling
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train a model with the resampled data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_res, y_train_res)

# Predict and evaluate
y_pred = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred)
print(f"ROC-AUC Score with SMOTE: {auc_score}")


ROC-AUC Score with SMOTE: 0.8730802665893943


In [4]:
from sklearn.ensemble import RandomForestClassifier

# Train a model with class weighting
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred)
print(f"ROC-AUC Score with Class Weighting: {auc_score}")


ROC-AUC Score with Class Weighting: 0.8594873686151576


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Assuming df is your DataFrame and 'rainfall' is your target variable

# Split data into features and target
X = df.drop('rainfall', axis=1)
y = df['rainfall']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify majority and minority classes
majority_class = y_train[y_train == 0]
minority_class = y_train[y_train == 1]

# Split majority class into subsets
num_subsets = 3  # Example: Split into 3 subsets
subset_size = len(majority_class) // num_subsets

# Initialize lists to hold predictions
predictions = []

for i in range(num_subsets):
    # Select subset of majority class
    start_idx = i * subset_size
    end_idx = (i + 1) * subset_size if i < num_subsets - 1 else len(majority_class)
    subset_majority = majority_class.iloc[start_idx:end_idx]
    
    # Combine subset with minority class
    X_subset = pd.concat([X_train[y_train == 0].iloc[start_idx:end_idx], X_train[y_train == 1]], ignore_index=True)
    y_subset = pd.concat([subset_majority, minority_class], ignore_index=True)
    
    # Train model on subset
    model = RandomForestClassifier(random_state=42)
    model.fit(X_subset, y_subset)
    
    # Predict on test set
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    predictions.append(y_pred_proba)

# Ensemble predictions (simple average)
y_pred_ensemble = np.mean(predictions, axis=0)

# Evaluate ensemble
auc_score_ensemble = roc_auc_score(y_test, y_pred_ensemble)
print(f"Ensemble ROC-AUC Score: {auc_score_ensemble}")


Ensemble ROC-AUC Score: 0.8579989989726298


In [6]:
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import roc_auc_score

# Initialize EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=42, n_estimators=10)  # n_estimators is the number of balanced subsets

# Fit the model
eec.fit(X_train, y_train)

# Predict probabilities
y_pred_proba = eec.predict_proba(X_test)[:, 1]

# Evaluate ROC-AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"EasyEnsembleClassifier ROC-AUC Score: {auc_score}")


EasyEnsembleClassifier ROC-AUC Score: 0.8717631253128211


In [None]:
from imblearn.ensemble import BalanceCascade
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Initialize BalanceCascade
bc = BalanceCascade(random_state=42, estimator=RandomForestClassifier(random_state=42))

# Fit and resample the data
X_resampled, y_resampled = bc.fit_resample(X_train, y_train)

# Train a classifier on the resampled data (example: RandomForest)
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)

# Predict probabilities
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate ROC-AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"BalanceCascade ROC-AUC Score: {auc_score}")


ImportError: cannot import name 'BalanceCascade' from 'imblearn.ensemble' (/Users/suraj/vscode/aiml/kaggle/binary_prediction_rainfall_dataset/.venv/lib/python3.12/site-packages/imblearn/ensemble/__init__.py)

In [11]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Initialize BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(),
    sampling_strategy='auto',
    random_state=42,
    n_estimators=10  # Number of balanced subsets
)

# Fit the model
bbc.fit(X_train, y_train)

# Predict probabilities
y_pred_proba = bbc.predict_proba(X_test)[:, 1]

# Evaluate ROC-AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"BalancedBaggingClassifier ROC-AUC Score: {auc_score}")


BalancedBaggingClassifier ROC-AUC Score: 0.8445114723005189
