# Importing Libraries


# Memory managing

In [None]:
import sys

# Shallow size in bytes (as reported by sys.getsizeof) of every global name
vars_with_sizes = dict(zip(globals().keys(), map(sys.getsizeof, globals().values())))

# Largest objects first; list.sort is stable, so ties keep definition order
sorted_vars = list(vars_with_sizes.items())
sorted_vars.sort(key=lambda entry: entry[1], reverse=True)

# Report one "name: size" line per global
for name, size in sorted_vars:
    print(f"{name}: {size} bytes")


# del variable_name

In [None]:
# Standard Libraries
import warnings
import json
import logging
import re
from datetime import datetime

# Numerical and Statistical Libraries
import numpy as np
import pandas as pd
from scipy import stats

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Models
# Regression Models
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Classification Models
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Neural Networks (NN)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers, callbacks
import tensorflow_addons as tfa

# FastAI
from fastai.tabular.all import *

# Feature Engineering
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, mutual_info_classif, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import PowerTransformer, PolynomialFeatures
from geopy.distance import geodesic, great_circle

# Data Preprocessing and Encoding
# Scalers
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    MultiLabelBinarizer,
)

# Encoders
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from category_encoders import BinaryEncoder

# Imputation

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer


# Resampling
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# Model Tuning and Evaluation
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)

# Metrics
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    log_loss,
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    explained_variance_score,
    classification_report,
    roc_curve,
    make_scorer,
)

# AutoML
from autogluon.tabular import TabularPredictor

# Optuna for Hyperparameter Tuning
import optuna

# Importing Dataset

## input files

In [9]:
# Input data files are available in the read-only "../input/" directory

import os

# Walk ../input recursively and print the joined path of every file found
for folder, _, files in os.walk("../input"):
    for file_name in files:
        print(os.path.join(folder, file_name))

## read files

In [None]:
# paths (placeholders: replace "../" with the actual train/test CSV locations)
train_path = "../"
test_path = "../"

# importing data

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Optional untouched copies of the raw data:
# train_ = pd.read_csv(train_path)  # Copies
# test_ = pd.read_csv(test_path)  # Copies

In [None]:
def convert_to_datetime(df, columns):
    """Parse the selected columns of ``df`` as datetimes, in place.

    Args:
        df: DataFrame to modify.
        columns: Column label(s) used to select ``df[columns]``.

    Returns:
        The same ``df`` object, with the selected columns replaced by the
        result of applying ``pd.to_datetime`` to them.
    """
    parsed = df[columns].apply(pd.to_datetime)
    df[columns] = parsed
    return df


# df = convert_to_datetime(df, ['col1', 'col2'])

## setting cols

In [None]:
# Name of the target column (placeholder: must be filled in before running)
target_col = ""
# Split the target off so that train only holds features
y = train[target_col]
train = train.drop(target_col, axis=1)

# Feature column groups, selected by dtype
num_cols = train.select_dtypes(include=["number"]).columns
cat_cols = train.select_dtypes(include=["object", "category"]).columns
date_cols = train.select_dtypes(include=["datetime"]).columns

## some data info

In [None]:
# Show the first five rows of the training features
train.head(5)

In [None]:
# Report the (rows, columns) shape of both datasets
print("shape of train", train.shape)
print("shape of test", test.shape)

In [None]:
# Print pandas' per-column summary of the training features
train.info()

In [None]:
# Descriptive statistics, transposed so that each feature is one row
train.describe().T

## reduce data size if it is possible

In [None]:
# Stack train on top of test so both can be processed in a single pass
df = pd.concat([train, test], axis=0)


def reduce_memory_usage(df):
    """Downcast float64/int64 columns of ``df`` to smaller integer dtypes, in place.

    A float64 column is converted to int64 when every value is a whole number
    that fits in int64 (columns containing NaN or inf are left untouched).
    Every int64 column, including one that was just converted from float64,
    is then stored in the smallest of int8/int16/int32 that holds its range.

    Args:
        df: DataFrame whose columns are rewritten in place.

    Returns:
        The same ``df`` object.
    """
    for col in df.columns:
        col_type = df[col].dtype

        # Handle float64 type
        if col_type == "float64":
            values = df[col]
            # NaN % 1 and inf % 1 are NaN, so such columns never pass this check
            is_whole = (values % 1 == 0).all()
            # Strict float bounds: 2.0**63 itself is not representable in int64
            fits_int64 = values.empty or (
                values.min() >= -(2.0**63) and values.max() < 2.0**63
            )
            if is_whole and fits_int64:
                print(f"Converting {col} from {col_type} to int64")
                df[col] = values.astype(np.int64)
                # Refresh the dtype so the integer downcast below also applies
                col_type = df[col].dtype

        # Handle int64 type
        if col_type == "int64":
            col_min, col_max = df[col].min(), df[col].max()
            # Try the candidate types from smallest to largest
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if col_min >= limits.min and col_max <= limits.max:
                    print(
                        f"Converting {col} from int64 to {np.dtype(candidate).name}"
                    )
                    df[col] = df[col].astype(candidate)
                    break

    return df


# Uncomment to downcast the numeric columns of the combined frame:
# df = reduce_memory_usage(df)
# Split the combined frame back by position: the first len(train) rows are train
train = df.iloc[: len(train)]
test = df.iloc[len(train) :]

# Some initial processing, such as JSON, Regex & str_split

## JSON

In [None]:
# Extracting JSON features
import json
import ast


def extract_features(json_str):
    """Turn a JSON array of ``{"name", "description"}`` objects into a dict.

    Args:
        json_str: JSON text encoding a list of objects that each have a
            "name" and a "description" key. Missing cells (None, float NaN
            or a blank string) are accepted.

    Returns:
        A dict mapping each item's "name" to its "description"; an empty dict
        for missing cells, so the function can be applied to a whole column.

    Raises:
        json.JSONDecodeError: If a non-blank string is not valid JSON.
        KeyError: If an item lacks "name" or "description".
    """
    # NaN is the only float that is not equal to itself
    if json_str is None or (isinstance(json_str, float) and json_str != json_str):
        return {}
    if isinstance(json_str, str) and not json_str.strip():
        return {}

    features_list = json.loads(json_str)  # Use json.loads() to parse the JSON string
    return {item["name"]: item["description"] for item in features_list}


# Assuming df is already defined and contains the 'features' column
# features_df = df['features'].apply(extract_features).apply(pd.Series)

# df = pd.concat([df, features_df], axis=1)
# df

## Regex

In [None]:
def extract_data_from_engine(df):
    """Add numeric engine features parsed from the "engine" text column.

    Writes three float columns into ``df``: "horsepower" (decimal number
    directly before "HP"), "engine_size" (decimal number directly before "L")
    and "cylinders" (integer followed by whitespace and one of Cylinder,
    V<digit> or Straight). Rows without a match get NaN.

    Returns:
        The same ``df`` object.
    """
    engine_text = df["engine"]

    decimal_patterns = (
        ("horsepower", r"(\d+\.\d+)(?=HP)"),
        ("engine_size", r"(\d+\.\d+)(?=L)"),
    )
    for new_col, pattern in decimal_patterns:
        # expand=False yields the single capture group directly as a Series
        df[new_col] = engine_text.str.extract(pattern, expand=False).astype(float)

    # Two capture groups: only the first (the count) is kept
    cylinder_match = engine_text.str.extract(r"(\d+)\s(Cylinder|V\d|Straight)")
    df["cylinders"] = cylinder_match[0].astype(float)
    return df

## str_split

In [None]:
def str_split(df):
    """Split the "Policy" column into new "Type" and "Level" columns, in place.

    The text is split on the first space only, so a value with more than two
    words still yields exactly two parts (the remainder goes to "Level"), and
    a value without a space gets a missing "Level".

    Args:
        df: DataFrame with a string "Policy" column.

    Returns:
        The same ``df`` object with "Type" and "Level" added; preview with
        ``df[["Policy", "Type", "Level"]].head(10)``.
    """
    parts = (
        df["Policy"]  # from the Policy feature
        .str.split(" ", n=1, expand=True)  # split once on " " into separate columns
        .reindex(columns=[0, 1])  # always two columns, even when nothing was split
    )
    df["Type"] = parts[0]
    df["Level"] = parts[1]
    return df

## make other columns into yours!

In [None]:
def make_new_rows(row):
    """Expand one record into one row per non-missing crime code.

    The codes are read from "Crm Cd" and "Crm Cd 1" to "Crm Cd 4". Each
    returned row is an independent copy of ``row`` without the four numbered
    columns, with "Crm Cd" set to one of the codes. Repeated codes are not
    de-duplicated.

    Args:
        row: A pandas Series holding the five crime-code fields.

    Returns:
        A list of Series, one per non-missing code, in column order.
    """
    extra_cols = ["Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4"]
    crimes = [row["Crm Cd"]] + [row[col] for col in extra_cols]
    base = row.drop(extra_cols)

    result = []
    for crime in crimes:
        # pd.isna also handles None and non-float codes, unlike np.isnan
        if pd.isna(crime):
            continue
        # Copy per code: appending the same Series would alias every entry
        new_row = base.copy()
        new_row["Crm Cd"] = crime
        result.append(new_row)
    return result


# expanded_train = train.apply(make_new_rows, axis=1)
# train = pd.DataFrame([item for sublist in expanded_train for item in sublist])

In [None]:
# Show the first five rows of train after the initial processing
train.head(5)

# EDA

In [None]:
# Show the first five rows of train at the start of the EDA
train.head(5)

In [None]:
# Re-attach the target so duplicates are counted over features and target together
train_target_combined = pd.concat([train, y], axis=1)
print("Duplicated Rows:", train_target_combined.duplicated().sum())
# Uncomment to actually drop the duplicated rows:
# train_target_combined = train_target_combined.drop_duplicates()

# Separate the train and target back
# (assumes y is a single column, so it is the last column after the concat)
train = train_target_combined.iloc[:, :-1]
y = train_target_combined.iloc[:, -1]

In [None]:
# Report the (rows, columns) shape of both datasets after the duplicate check
print("shape of train", train.shape)
print("shape of test", test.shape)

## basic Data info

In [None]:
def aggregate_info(df):
    """Summarise every column of ``df``.

    Args:
        df: DataFrame to describe.

    Returns:
        A DataFrame indexed by column name with:
        "nunique values" (count of distinct non-missing values),
        "unique" (array of distinct values),
        "missing_count" and "missing_percentage" (rounded to 2 decimals),
        "dtypes" (column dtype).
    """
    nunique = df.nunique()
    # Built as an explicit object Series: DataFrame.apply would expand the
    # arrays into a DataFrame whenever they all have the same length.
    unique = pd.Series(
        [column.unique() for _, column in df.items()],
        index=df.columns,
        dtype=object,
    )
    missing_count = df.isna().sum()
    missing_percentage = round((missing_count / len(df)) * 100, 2)
    dtypes = df.dtypes

    # combine metrics into a single DataFrame
    agg_df = pd.DataFrame(
        {
            "nunique values": nunique,
            "unique": unique,
            "missing_count": missing_count,
            "missing_percentage": missing_percentage,
            "dtypes": dtypes,
        }
    )

    return agg_df


# Display the per-column summary of the training features
aggregate_info(train)

In [None]:
def numeric_summary(df):
    """Summarise the numeric columns of ``df``.

    Args:
        df: DataFrame to describe; non-numeric columns are ignored.

    Returns:
        A DataFrame indexed by numeric column name with dtype, non-null count,
        null count, null percentage (relative to all rows of ``df``), mean,
        min, max, number of distinct values (NaN counted as one value) and,
        for columns with at most 10 distinct non-missing values, the list of
        distinct values (otherwise "-").
    """
    obs = df.shape[0]

    numeric_df = df.select_dtypes(include="number")
    null_counts = numeric_df.isnull().sum()

    # Built as an explicit object Series: DataFrame.apply would expand the
    # lists into a DataFrame whenever they all have the same length.
    unique_values = pd.Series(
        [
            list(column.unique()) if column.nunique() <= 10 else "-"
            for _, column in numeric_df.items()
        ],
        index=numeric_df.columns,
        dtype=object,
    )

    summary_df = pd.DataFrame(
        {
            "Dtype": numeric_df.dtypes,
            "Counts": numeric_df.count(),
            "Nulls": null_counts,
            "NullPercent": (null_counts / obs) * 100,
            "Mean": numeric_df.mean(),
            "Min": numeric_df.min(),
            "Max": numeric_df.max(),
            # dropna=False matches len(unique()), which includes NaN
            "Uniques": numeric_df.nunique(dropna=False),
            "UniqueValues": unique_values,
        }
    )

    return summary_df


# Display the numeric-column summary of the training features
numeric_summary(train)

## Missing values

In [None]:
# Stack train on top of test so missing values are handled on both at once
df = pd.concat([train, test], axis=0)

In [None]:
def fill_na_values(df, fill_value):
    """Return a copy of ``df`` with missing values replaced by ``fill_value``."""
    filled = df.fillna(fill_value)
    return filled


# df = fill_na_values(df, -999)

In [1]:
def drop_na(df, drop="cols"):
    """Drop the columns or the rows of ``df`` that contain missing values.

    Args:
        df: DataFrame to clean.
        drop: "cols" removes every column with at least one missing value;
            "rows" removes every row with at least one missing value.

    Returns:
        A new DataFrame without the affected columns or rows.

    Raises:
        ValueError: If ``drop`` is neither "rows" nor "cols".
    """
    if drop == "rows":
        return df.dropna(how="any")
    if drop != "cols":
        raise ValueError("Parameter 'drop' must be either 'rows' or 'cols'.")

    incomplete = [name for name in df.columns if df[name].isnull().any()]
    return df.drop(incomplete, axis=1)


# df = drop_na(df, "rows")
# df = drop_na(df, "cols")

In [None]:
def impute_train_test(df, strategy="mean"):  # can also use median or most_frequent
    """Impute missing values of ``df`` with scikit-learn's ``SimpleImputer``.

    Args:
        df: DataFrame to impute; the imputer is fitted on this same data.
        strategy: ``SimpleImputer`` strategy, e.g. "mean", "median" or
            "most_frequent".

    Returns:
        A new DataFrame holding the imputed values, with the columns and the
        index of ``df`` so that rows stay aligned with the target.
    """
    imputer = SimpleImputer(strategy=strategy)
    imputed_values = imputer.fit_transform(df)

    # Keep the original index: a fresh RangeIndex would break alignment with
    # any Series (such as the target) that still carries the old labels.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)


# df = impute_train_test(df)

# df = impute_train_test(df,"median")

# df = impute_train_test(df,"most_frequent")

In [None]:
# # for time series
# df = df.fillna(method="ffill")
# df = df.fillna(method="bfill")

In [None]:
# interpolate
# df = df.interpolate(method="linear")

In [None]:
# KNN imputer using 5 neighbours; uncomment the next line to apply it to df
imputer = KNNImputer(n_neighbors=5)
# df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [None]:
# Iterative imputer with default settings; uncomment the next line to apply it to df
imputer = IterativeImputer()
# df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [None]:
# Split the combined frame back by position: the first len(train) rows are train
train = df.iloc[: len(train)]
test = df.iloc[len(train) :]

## Visualization + balance + outliers

### Target

In [1]:
def plot_target_distribution(target):
    """Plot the distribution of the target.

    A numeric target is drawn as a 30-bin histogram with a KDE curve; any
    other dtype is drawn as a count plot.

    Args:
        target: Series holding the target; its ``name`` labels the x axis.
    """
    is_numeric = pd.api.types.is_numeric_dtype(target)

    plt.figure(figsize=(8, 6))
    if is_numeric:
        sns.histplot(target, kde=True, bins=30, color="blue")
        title, y_label = "Distribution of Numerical Target", "Frequency"
    else:
        sns.countplot(x=target, palette="Set2")
        title, y_label = "Distribution of Categorical Target", "Count"

    plt.title(title, fontsize=14)
    plt.xlabel(target.name, fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.show()


# plot_target_distribution(y)

### balance the target


In [12]:
# Parameters fitted by the most recent forward call, keyed by method name, so
# that the matching reverse call can invert the transformation.
_TARGET_TRANSFORM_STATE = {}


def transform_target(target, method="log", reverse=False, c=0):
    """Apply a transformation to the target, or undo it, and plot the result.

    Forward transformations:
        "log": ``log1p(target + c)``; "sqrt": ``sqrt(target + c)``;
        "boxcox": Box-Cox on ``target + 1e-9``; "reciprocal":
        ``1 / (target + 1e-9)``; "yeo-johnson": ``PowerTransformer``.

    The reverse direction inverts exactly those formulas, including the
    offsets. For "boxcox" and "yeo-johnson" it reuses the parameters fitted by
    the most recent forward call with the same method.

    Args:
        target: Series to transform (or transformed values to invert).
        method: One of "log", "sqrt", "boxcox", "reciprocal", "yeo-johnson".
        reverse: False to transform, True to invert.
        c: Constant added before "log"/"sqrt" (and subtracted again on reverse).

    Returns:
        A Series with the index of ``target``.

    Raises:
        ValueError: If ``method`` is unknown, or if a "boxcox"/"yeo-johnson"
            reverse is requested before the corresponding forward call.
    """
    target_transformed = target.copy()

    if not reverse:
        if method == "log":
            target_transformed = np.log1p(target_transformed + c)
        elif method == "sqrt":
            target_transformed = np.sqrt(target_transformed + c)
        elif method == "boxcox":
            target_transformed, fitted_lambda = stats.boxcox(
                target_transformed + 1e-9
            )  # Add small value to handle 0
            _TARGET_TRANSFORM_STATE["boxcox"] = fitted_lambda
        elif method == "reciprocal":
            target_transformed = 1 / (target_transformed + 1e-9)
        elif method == "yeo-johnson":
            pt = PowerTransformer(method="yeo-johnson")
            target_transformed = pt.fit_transform(
                target_transformed.values.reshape(-1, 1)
            ).flatten()
            _TARGET_TRANSFORM_STATE["yeo-johnson"] = pt
        else:
            raise ValueError(
                "Invalid method. Choose from 'log', 'sqrt', 'boxcox', 'reciprocal', 'yeo-johnson'."
            )

        title = f"{method.capitalize()} Transformation of Target"
        color = "green"

    else:
        if method == "log":
            target_transformed = np.expm1(target_transformed) - c
        elif method == "sqrt":
            target_transformed = target_transformed**2 - c
        elif method == "boxcox":
            # The inverse lives in scipy.special, not scipy.stats
            from scipy.special import inv_boxcox

            if "boxcox" not in _TARGET_TRANSFORM_STATE:
                raise ValueError(
                    "Cannot reverse 'boxcox' before a forward 'boxcox' call has fitted lambda."
                )
            target_transformed = (
                inv_boxcox(target_transformed, _TARGET_TRANSFORM_STATE["boxcox"])
                - 1e-9
            )
        elif method == "reciprocal":
            target_transformed = 1 / target_transformed - 1e-9
        elif method == "yeo-johnson":
            pt = _TARGET_TRANSFORM_STATE.get("yeo-johnson")
            if pt is None:
                raise ValueError(
                    "Cannot reverse 'yeo-johnson' before a forward 'yeo-johnson' call has fitted the transformer."
                )
            target_transformed = pt.inverse_transform(
                np.asarray(target_transformed).reshape(-1, 1)
            ).flatten()
        else:
            raise ValueError(
                "Invalid method for reversing. Choose from 'log', 'sqrt', 'boxcox', 'reciprocal', 'yeo-johnson'."
            )

        title = f"Reverse {method.capitalize()} Transformation of Target"
        color = "blue"

    plt.figure(figsize=(8, 6))
    sns.histplot(target_transformed, kde=True, bins=30, color=color)
    plt.title(title, fontsize=14)
    plt.show()

    return pd.Series(target_transformed, index=target.index)


# y = transform_target(y, method="log", reverse=False)

#### resampling / undersampling /both 

In [None]:
# RESAMPLING

# SMOTE over-sampler with a fixed random seed
smote = SMOTE(random_state=42)


# Positional indices of the categorical columns within train, passed to SMOTENC
cat_cols_idx = [train.columns.get_loc(col) for col in cat_cols]
smote_nc = SMOTENC(categorical_features=cat_cols_idx, random_state=42)

# Uncomment one of these to resample train and y:
# train, y = smote.fit_resample(train, y)
# train, y = smote_nc.fit_resample(train, y)

In [None]:
# Under Sampling

# Random under-sampler with a fixed random seed; uncomment the next line to apply it
undersample = RandomUnderSampler(random_state=42)
# train, y = undersample.fit_resample(train, y)

In [None]:
# Combination


# SMOTEENN combined resampler with a fixed random seed; uncomment the next line to apply it
smote_enn = SMOTEENN(random_state=42)
# train, y = smote_enn.fit_resample(train, y)

### Numerical columns

In [None]:
def compare_numerical_columns(train, test, num_cols):
    """Plot train and test histograms side by side for each numeric column.

    Each row of the figure shows one column: train on the left (blue), test
    on the right (red), both with a KDE curve, 20 bins and a shared x range.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        num_cols: Column names to compare.
    """
    n_rows = len(num_cols)
    # squeeze=False keeps the axes grid two-dimensional even for a single column
    fig, axs = plt.subplots(n_rows, 2, figsize=(12, n_rows * 4), squeeze=False)

    panels = (("Train", train, "blue"), ("Test", test, "red"))
    for row_axes, col in zip(axs, num_cols):
        # Common x limits so that both panels are directly comparable
        lowest = min(train[col].min(), test[col].min())
        highest = max(train[col].max(), test[col].max())

        for ax, (label, frame, color) in zip(row_axes, panels):
            sns.histplot(
                frame[col], ax=ax, color=color, kde=True, label=label, bins=20
            )
            ax.set_title(f"{label} - {col}")
            ax.set_xlim(lowest, highest)
            ax.set_xlabel("Value")
            ax.set_ylabel("Frequency")
            ax.legend()

    plt.tight_layout()
    plt.show()


# compare_numerical_columns(train, test, num_cols)

#### Numerical transformations

In [None]:
def apply_transformation(train, test, num_cols, transformation=None, c=0):
    """Transform numeric columns of copies of train/test and plot the result.

    Args:
        train: Training DataFrame (not modified).
        test: Test DataFrame (not modified).
        num_cols: Columns to transform and plot.
        transformation: "log" (``log1p``), "sqrt" or "square"; any other value
            leaves the data unchanged.
        c: Constant added to the values before the transformation.

    Returns:
        Tuple ``(train_transformed, test_transformed)`` of the copied frames.
    """
    train_transformed, test_transformed = train.copy(), test.copy()

    # Pick the element-wise function once instead of once per column
    if transformation == "log":
        func = np.log1p
    elif transformation == "sqrt":
        func = np.sqrt
    elif transformation == "square":
        func = np.square
    else:
        func = None

    if func is not None:
        for col in num_cols:
            train_transformed[col] = func(train_transformed[col] + c)
            test_transformed[col] = func(test_transformed[col] + c)

    n_rows = len(num_cols)
    # squeeze=False keeps the axes grid two-dimensional even for a single column
    fig, axs = plt.subplots(n_rows, 2, figsize=(12, n_rows * 4), squeeze=False)

    panels = (
        ("Train", train_transformed, "blue"),
        ("Test", test_transformed, "red"),
    )
    for row_axes, col in zip(axs, num_cols):
        # Common x limits so that both panels are directly comparable
        lowest = min(train_transformed[col].min(), test_transformed[col].min())
        highest = max(train_transformed[col].max(), test_transformed[col].max())

        for ax, (label, frame, color) in zip(row_axes, panels):
            sns.histplot(
                frame[col],
                ax=ax,
                color=color,
                kde=True,
                label=label,
                bins=20,
            )
            ax.set_title(f"{label} - {col} ({transformation})")
            ax.set_xlim(lowest, highest)
            ax.set_xlabel("Value")
            ax.set_ylabel("Frequency")
            ax.legend()

    plt.tight_layout()
    plt.show()

    return train_transformed, test_transformed

#### more plot on numericals

##### Scatter-like plots

In [None]:
def plot_histogram_kde(df, num_cols):
    """Draw a 30-bin histogram with a KDE curve for each column, stacked vertically."""
    total = len(num_cols)
    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.histplot(df[col], kde=True, bins=30, color="blue", ax=ax)
        ax.set_title(f"Histogram and KDE of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Frequency")
    plt.tight_layout()
    plt.show()


# plot_histogram_kde(train, num_cols)


def plot_scatter_with_numerical_target(df, num_cols, target):
    """Draw one scatter plot per column (x) against the target (y), stacked vertically."""
    total = len(num_cols)
    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.scatterplot(x=df[col], y=target, color="blue", ax=ax)
        ax.set_title(f"Scatter Plot: {col} vs Target")
        ax.set_xlabel(col)
        ax.set_ylabel("Target")
    plt.tight_layout()
    plt.show()


# plot_scatter_with_numerical_target(train, num_cols, y)


def plot_lmplot_with_numerical_target(df, num_cols, target):
    """Draw one regression plot per column against the target, stacked vertically.

    Points are blue and the fitted regression line is red.
    """
    total = len(num_cols)
    point_style = {"color": "blue"}
    line_style = {"color": "red"}

    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.regplot(
            x=df[col],
            y=target,
            scatter_kws=point_style,
            line_kws=line_style,
            ax=ax,
        )
        ax.set_title(f"Scatter Plot with Linear Regression: {col} vs Target")
        ax.set_xlabel(col)
        ax.set_ylabel("Target")

    plt.tight_layout()
    plt.show()


# plot_lmplot_with_numerical_target(train, num_cols, y)

##### Boxplot

In [None]:
def plot_boxplot(df, num_cols):
    """Draw one horizontal boxplot per column, stacked vertically."""
    total = len(num_cols)
    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.boxplot(x=df[col], color="green", ax=ax)
        ax.set_title(f"Boxplot of {col}")
        ax.set_xlabel(col)
    plt.tight_layout()
    plt.show()


# plot_boxplot(train, num_cols)


def plot_boxplot_with_numerical_target(df, num_cols, target):
    """Draw one boxplot per column (y) grouped by the target (x), stacked vertically."""
    total = len(num_cols)
    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.boxplot(y=df[col], x=target, color="lightblue", ax=ax)
        ax.set_title(f"Boxplot of {col} by Target")
        ax.set_ylabel(col)
        ax.set_xlabel("Target")
    plt.tight_layout()
    plt.show()


# plot_boxplot_with_numerical_target(train, num_cols, y)


def plot_boxplot_with_categorical_target(df, num_cols, target):
    """Draw one boxplot per column (y) for each target category (x), stacked vertically."""
    total = len(num_cols)
    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.boxplot(x=target, y=df[col], color="lightgreen", ax=ax)
        ax.set_title(f"Boxplot of {col} by Categorical Target")
        ax.set_xlabel("Target")
        ax.set_ylabel(col)
    plt.tight_layout()
    plt.show()


# plot_boxplot_with_categorical_target(train, num_cols, y)

##### violinplot

In [None]:
def plot_violinplot(df, num_cols):
    """Draw one horizontal violin plot per column, stacked vertically."""
    total = len(num_cols)
    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.violinplot(x=df[col], color="lightblue", ax=ax)
        ax.set_title(f"Violin Plot of {col}")
        ax.set_xlabel(col)
    plt.tight_layout()
    plt.show()


# plot_violinplot(train, num_cols)


def plot_violin_with_categorical_target(df, num_cols, target):
    """Draw one violin plot per column (y) for each target category (x), stacked vertically."""
    total = len(num_cols)
    plt.figure(figsize=(12, total * 4))
    for position, col in enumerate(num_cols, start=1):
        ax = plt.subplot(total, 1, position)
        sns.violinplot(x=target, y=df[col], color="lightblue", ax=ax)
        ax.set_title(f"Violin Plot of {col} by Categorical Target")
        ax.set_xlabel("Target")
        ax.set_ylabel(col)
    plt.tight_layout()
    plt.show()


# plot_violin_with_categorical_target(train, num_cols, y)

#### outliers

In [None]:
def handle_outliers_iqr(df, target, num_cols, method="remove"):
    """Remove or replace IQR outliers column by column.

    A value is an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of its column. Columns are processed
    in order, so with "remove" the quartiles of later columns are computed on
    the rows that survived the earlier ones. Missing values do not satisfy
    the bounds and are therefore treated like outliers.

    Args:
        df: Feature DataFrame (not modified).
        target: Target values with one entry per row of ``df``, in the same
            order.
        num_cols: Numeric columns to check.
        method: "remove" drops outlier rows from both ``df`` and ``target``;
            "replace" overwrites outliers with the column mean.

    Returns:
        Tuple ``(df_out, target)`` of the processed frame and aligned target.

    Raises:
        ValueError: If ``method`` is neither "remove" nor "replace".
    """
    if method not in ("remove", "replace"):
        raise ValueError("Invalid method. Choose from 'remove' or 'replace'.")

    df_out = df.copy()

    for col in num_cols:
        q1 = df_out[col].quantile(0.25)
        q3 = df_out[col].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # True for values inside the accepted range (both bounds inclusive)
        within = (df_out[col] >= lower_bound) & (df_out[col] <= upper_bound)

        if method == "remove":
            # Filter by position so the target stays in step with the frame
            # even when its index labels differ from those of df.
            keep = within.to_numpy()
            df_out = df_out[keep]
            target = target[keep]
        else:
            # Replace with the mean of the column (computed including outliers)
            df_out[col] = df_out[col].where(within, other=df_out[col].mean())

    return df_out, target  # Return the modified DataFrame and target


# Apply the default "remove" method: outlier rows are dropped from train and y
train, y = handle_outliers_iqr(train, y, num_cols)

#### Pairplot

In [None]:
def plot_pairplot(df, num_cols):
    """Show a seaborn pair plot of the selected numeric columns."""
    numeric_view = df[num_cols]
    sns.pairplot(numeric_view)
    # y slightly above 1 places the title above the grid of panels
    plt.suptitle("Pair Plot of Numerical Features", y=1.02)
    plt.show()


# plot_pairplot(train, num_cols)

#### correlation heatmap

In [None]:
def plot_lower_triangle_heatmap(df, cols):
    """Plot the lower triangle of the correlation matrix of ``df[cols]``.

    The upper triangle, including the diagonal, is masked out; cells are
    annotated with the correlation rounded to two decimals.
    """
    corr = df[cols].corr()

    # True marks the cells to hide: the diagonal and everything above it
    hidden = np.triu(np.ones(corr.shape, dtype=bool))

    heatmap_options = {
        "mask": hidden,
        "cmap": "coolwarm",
        "annot": True,
        "fmt": ".2f",
        "square": True,
        "cbar_kws": {"shrink": 0.8},
    }

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, **heatmap_options)
    plt.title("Correlation Heatmap (Lower Triangle)", fontsize=16)
    plt.show()


# plot_lower_triangle_heatmap(train, num_cols)


#### Drop highly correlated features

In [None]:
# Drop highly correlated features

# Columns to remove from both datasets (empty placeholder: fill in after inspecting the heatmap)
drop_cols = []
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)
# Recompute the numeric column list so it no longer contains dropped columns
num_cols = train.select_dtypes(include=np.number).columns

### Categorical columns

#### Compare target and cats distribution

In [None]:
def compare_categorical_columns(train, test, cat_cols):
    """Plot train and test category counts side by side for each column.

    Both panels use the same sorted category order, built from the
    non-missing values found in either dataset.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        cat_cols: Categorical column names to compare.
    """
    for cat_col in cat_cols:
        # Get all unique categories from both train and test. Missing values
        # are excluded: NaN cannot be ordered against strings in sorted().
        categories = sorted(
            set(train[cat_col].dropna().unique())
            | set(test[cat_col].dropna().unique())
        )

        fig, axs = plt.subplots(1, 2, figsize=(14, 6))

        # Same plot for each dataset, with a consistent category order
        for ax, label, frame in ((axs[0], "Train", train), (axs[1], "Test", test)):
            sns.countplot(
                data=frame, x=cat_col, palette="Set2", ax=ax, order=categories
            )
            ax.set_title(f"{label} - {cat_col}", fontsize=16)
            ax.set_xlabel(cat_col, fontsize=12)
            ax.set_ylabel("Count", fontsize=12)
            ax.tick_params(axis="x", rotation=45)

        plt.tight_layout()
        plt.show()


# compare_categorical_columns(train, test, cat_cols)

##### count + bar

In [None]:
def plot_countplot(df, cat_cols):
    """Show one count plot (in its own figure) per categorical column."""
    for cat_col in cat_cols:
        plt.figure(figsize=(10, 6))
        ax = sns.countplot(data=df, x=cat_col, palette="Set2")
        ax.set_title(f"Count Plot of {cat_col}", fontsize=16)
        ax.set_xlabel(cat_col, fontsize=12)
        ax.set_ylabel("Count", fontsize=12)
        plt.xticks(rotation=45)
        plt.show()


# plot_countplot(train, cat_cols)


def plot_barplot(df, cat_cols, target):  # numerical target
    """Show one bar plot of the target per categorical column.

    Args:
        df: DataFrame holding the categorical columns.
        cat_cols: Columns to put on the x axis, one plot each.
        target: Series plotted on the y axis; its ``name`` is used in labels.
    """
    for cat_col in cat_cols:
        ax = sns.barplot(data=df, x=cat_col, y=target, palette="Set2")
        ax.set_title(f"Mean of {target.name} by {cat_col}", fontsize=16)
        ax.set_xlabel(cat_col, fontsize=12)
        ax.set_ylabel(f"Mean of {target.name}", fontsize=12)
        plt.xticks(rotation=45)
        plt.show()


# plot_barplot(train, cat_cols, y)


def plot_pie_chart(df, cat_cols):
    """Show one pie chart of value counts per categorical column."""
    for cat_col in cat_cols:
        counts = df[cat_col].value_counts()
        counts.plot(kind="pie", autopct="%1.1f%%", startangle=90, cmap="Set2")
        plt.title(f"Pie Chart of {cat_col}", fontsize=16)
        # Remove the default y label
        plt.ylabel("")
        plt.show()


# plot_pie_chart(train, cat_cols)


def plot_boxplot(df, cat_cols, target=None):  # cat target
    """Box plots of the target per categorical column, or of the columns alone.

    This definition replaces the earlier two-argument ``plot_boxplot(df,
    num_cols)`` in the namespace, so it also accepts that call form.

    Args:
        df: DataFrame holding the columns.
        cat_cols: Columns to plot. With a target they go on the x axis, one
            figure each; without one, each column gets its own boxplot in a
            single stacked figure.
        target: Series plotted on the y axis; its ``name`` is used in labels.
            When None, the behaviour of the earlier two-argument definition
            is reproduced.
    """
    if target is None:
        # Two-argument call: one stacked boxplot per column
        total = len(cat_cols)
        plt.figure(figsize=(12, total * 4))
        for position, col in enumerate(cat_cols, start=1):
            plt.subplot(total, 1, position)
            sns.boxplot(x=df[col], color="green")
            plt.title(f"Boxplot of {col}")
            plt.xlabel(col)
        plt.tight_layout()
        plt.show()
        return

    for cat_col in cat_cols:
        sns.boxplot(data=df, x=cat_col, y=target, palette="Set2")
        plt.title(f"Box Plot of {target.name} by {cat_col}", fontsize=16)
        plt.xlabel(cat_col, fontsize=12)
        plt.ylabel(target.name, fontsize=12)
        plt.xticks(rotation=45)
        plt.show()


# plot_boxplot(train, cat_cols, y)


def plot_violinplot(df, cat_cols, target=None):  # num target
    """Violin plots of the target per categorical column, or of the columns alone.

    This definition replaces the earlier two-argument ``plot_violinplot(df,
    num_cols)`` in the namespace, so it also accepts that call form.

    Args:
        df: DataFrame holding the columns.
        cat_cols: Columns to plot. With a target they go on the x axis, one
            figure each; without one, each column gets its own violin plot in
            a single stacked figure.
        target: Series plotted on the y axis; its ``name`` is used in labels.
            When None, the behaviour of the earlier two-argument definition
            is reproduced.
    """
    if target is None:
        # Two-argument call: one stacked violin plot per column
        total = len(cat_cols)
        plt.figure(figsize=(12, total * 4))
        for position, col in enumerate(cat_cols, start=1):
            plt.subplot(total, 1, position)
            sns.violinplot(x=df[col], color="lightblue")
            plt.title(f"Violin Plot of {col}")
            plt.xlabel(col)
        plt.tight_layout()
        plt.show()
        return

    for cat_col in cat_cols:
        sns.violinplot(data=df, x=cat_col, y=target, palette="Set2")
        plt.title(f"Violin Plot of {target.name} by {cat_col}", fontsize=16)
        plt.xlabel(cat_col, fontsize=12)
        plt.ylabel(target.name, fontsize=12)
        plt.xticks(rotation=45)
        plt.show()


# plot_violinplot(train, cat_cols, y)


def plot_pointplot(df, cat_cols, target):  # num target --> use to compare with num_cols
    """Show one point plot of the mean target per categorical column.

    Args:
        df: DataFrame holding the categorical columns.
        cat_cols: Columns to put on the x axis, one figure each.
        target: Series plotted on the y axis; its ``name`` is used in labels.
    """
    for cat_col in cat_cols:
        plt.figure(figsize=(10, 6))
        ax = sns.pointplot(
            data=df, x=cat_col, y=target, palette="Set2", estimator="mean"
        )
        ax.set_title(f"Mean of {target.name} by {cat_col}", fontsize=16)
        ax.set_xlabel(cat_col, fontsize=12)
        ax.set_ylabel(f"Mean of {target.name}", fontsize=12)
        plt.xticks(rotation=45)
        plt.show()


# plot_pointplot(train, cat_cols, y)

#### examine unique values

In [None]:
def check_unique_values(df, cat_cols):
    """Print the distinct values of each listed column, one line per column."""
    for column in cat_cols:
        distinct = df[column].unique()
        print(f"Unique values in {column}: {distinct}")


# check_unique_values(train, cat_cols)

### You can do cat_cols vs num_cols by charts in cat

In [None]:
# code here

### Geo Data

In [None]:
def plot_lat_long_scatter(df, lat_col="geo.lat", long_col="geo.lng"):
    """Scatter plot with longitude on the x axis and latitude on the y axis."""
    longitudes, latitudes = df[long_col], df[lat_col]

    plt.figure(figsize=(12, 8))
    plt.scatter(longitudes, latitudes, alpha=0.6, color="blue", edgecolors="k")

    plt.title("Scatter Plot of Latitude vs Longitude", fontsize=16)
    plt.xlabel("Longitude", fontsize=12)
    plt.ylabel("Latitude", fontsize=12)
    plt.grid()
    plt.show()


# plot_lat_long_scatter(train)


def plot_hexbin(df, lat_col="geo.lat", long_col="geo.lng", gridsize=50):
    """Hexbin density plot with longitude on the x axis and latitude on the y axis.

    Args:
        df: DataFrame holding the coordinate columns.
        lat_col: Name of the latitude column.
        long_col: Name of the longitude column.
        gridsize: ``gridsize`` passed to ``plt.hexbin``.
    """
    longitudes, latitudes = df[long_col], df[lat_col]

    plt.figure(figsize=(12, 8))
    # mincnt=1 is passed through so that only bins with at least one point are drawn
    plt.hexbin(longitudes, latitudes, gridsize=gridsize, cmap="Blues", mincnt=1)
    plt.colorbar(label="Count in Hexbin")

    plt.title("Hexbin Plot of Latitude vs Longitude", fontsize=16)
    plt.xlabel("Longitude", fontsize=12)
    plt.ylabel("Latitude", fontsize=12)
    plt.grid()
    plt.show()


# plot_hexbin(train)


def plot_heatmap(df, lat_col="geo.lat", long_col="geo.lng", cmap="viridis"):
    """Filled 2-D KDE plot with longitude on the x axis and latitude on the y axis.

    Args:
        df: DataFrame holding the coordinate columns.
        lat_col: Name of the latitude column.
        long_col: Name of the longitude column.
        cmap: Colormap passed to ``sns.kdeplot``.
    """
    kde_options = {"cmap": cmap, "fill": True, "thresh": 0, "levels": 100}

    plt.figure(figsize=(12, 8))
    sns.kdeplot(data=df, x=long_col, y=lat_col, **kde_options)

    plt.title("Heatmap of Latitude vs Longitude", fontsize=16)
    plt.xlabel("Longitude", fontsize=12)
    plt.ylabel("Latitude", fontsize=12)
    plt.grid()
    plt.show()


# plot_heatmap(train)


def plot_lat_long_scatter2(df, lat_col="geo.lat", long_col="geo.lng", target=None):
    """Scatter plot of the coordinates, optionally coloured by ``target``.

    Rows where either coordinate equals 0 are excluded. Note that latitude is
    plotted on the x axis and longitude on the y axis.
    """
    # Keep only rows where both coordinates are non-zero
    non_zero = (df[lat_col] != 0) & (df[long_col] != 0)
    kept = df[non_zero]
    x = kept[lat_col]  # Use lat_col
    y = kept[long_col]  # Use long_col

    plt.figure(figsize=(14, 6))
    sns.scatterplot(x=x, y=y, hue=target)
    sns.set(style="darkgrid")


# plot_lat_long_scatter2(train, lat_col="LAT", long_col="LON", target=y)

### DateTimes

In [None]:
def plot_time_series(df, date_col, target_series):
    """Plot the mean of the target for each distinct value of ``date_col``.

    Args:
        df: DataFrame holding the date column.
        date_col: Name of the column to group by.
        target_series: Series to average; its ``name`` is used in labels.
    """
    target_name = target_series.name

    plt.figure(figsize=(14, 7))
    combined_df = pd.concat([df[date_col], target_series], axis=1)
    mean_per_date = combined_df.groupby(date_col)[target_name].mean()
    mean_per_date.plot()

    plt.title(f"Time Series Plot of {target_name} Over Time", fontsize=16)
    plt.xlabel("Date", fontsize=12)
    plt.ylabel(target_name, fontsize=12)
    plt.grid()
    plt.show()


# plot_time_series(train, 'date', y)


def plot_datetime_histogram(df, date_col, granularity="hour"):
    """Plot a 30-bin histogram of one component of a datetime column.

    The component is computed into a local Series; ``df`` is not modified.

    Args:
        df: DataFrame holding the datetime column.
        date_col: Name of a datetime column (accessed through ``.dt``).
        granularity: "hour", "day" (day of month), "month" or "week"
            (ISO week of year).

    Raises:
        ValueError: If ``granularity`` is not one of the supported values.
    """
    if granularity not in ("hour", "day", "month", "week"):
        raise ValueError(
            "Invalid granularity. Choose from 'hour', 'day', 'month', or 'week'."
        )

    dates = df[date_col].dt
    if granularity == "hour":
        time_component, xlabel = dates.hour, "Hour of Day"
    elif granularity == "day":
        time_component, xlabel = dates.day, "Day of Month"
    elif granularity == "month":
        time_component, xlabel = dates.month, "Month"
    else:
        time_component, xlabel = dates.isocalendar().week, "Week of Year"

    plt.figure(figsize=(12, 6))
    time_component.hist(bins=30, color="blue", alpha=0.7)
    plt.title(f"Distribution of {xlabel}", fontsize=16)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel("Frequency", fontsize=12)
    plt.grid()
    plt.show()


# plot_datetime_histogram(train, 'date', 'hour')


def plot_heatmap_by_day_hour(df, date_col, granularity="hour"):
    """Plot a heatmap of row counts by day of week and a time component.

    The helper columns are built in a temporary frame; ``df`` is not modified.

    Args:
        df: DataFrame holding the datetime column.
        date_col: Name of a datetime column (accessed through ``.dt``).
        granularity: "hour", "day" (day of month), "month" or "year"; selects
            the component shown on the x axis.

    Raises:
        ValueError: If ``granularity`` is not one of the supported values.
    """
    # Handle granularity options
    if granularity not in ("hour", "day", "month", "year"):
        raise ValueError(
            "Invalid granularity. Choose from 'hour', 'day', 'month', or 'year'."
        )

    dates = df[date_col].dt
    if granularity == "hour":
        time_component, xlabel = dates.hour, "Hour of Day"
    elif granularity == "day":
        time_component, xlabel = dates.day, "Day of Month"
    elif granularity == "month":
        time_component, xlabel = dates.month, "Month"
    else:
        time_component, xlabel = dates.year, "Year"

    parts = pd.DataFrame(
        {"day_of_week": dates.day_name(), "time_component": time_component}
    )
    heatmap_data = (
        parts.groupby(["day_of_week", "time_component"]).size().unstack(fill_value=0)
    )

    ordered_days = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    # Reorder index, not columns. Days absent from the data get a count of 0,
    # which keeps the table integer-typed as required by fmt="d" below.
    heatmap_data = heatmap_data.reindex(ordered_days, fill_value=0)

    # Plot heatmap
    plt.figure(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap="YlGnBu", annot=True, fmt="d")
    plt.title(f"Heatmap of Counts by Day of Week and {xlabel}", fontsize=16)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel("Day of Week", fontsize=12)
    plt.show()


# plot_heatmap_by_day_hour(train, 'date', 'hour')

## Feature engineering

#### Scaler

In [None]:
def scale_df(df, num_cols, scaler_type="standard"):
    """Return a copy of ``df`` with ``num_cols`` rescaled.

    Args:
        df: Input DataFrame; left untouched.
        num_cols: Columns to fit the scaler on and overwrite in the copy.
        scaler_type: ``'standard'``, ``'minmax'`` or ``'robust'``.

    Raises:
        ValueError: If ``scaler_type`` is not one of the supported names.
    """
    scaled = df.copy()

    if scaler_type not in ("standard", "minmax", "robust"):
        raise ValueError(
            "Invalid scaler_type. Choose from 'standard', 'minmax', or 'robust'."
        )

    # Factories are wrapped in lambdas so only the requested scaler class is
    # looked up and instantiated.
    factories = {
        "standard": lambda: StandardScaler(),
        "minmax": lambda: MinMaxScaler(),
        "robust": lambda: RobustScaler(),
    }
    scaler = factories[scaler_type]()

    scaled[num_cols] = scaler.fit_transform(scaled[num_cols])
    return scaled

#### group transformations

In [None]:
def group_transform(df, group_by, target_col, method="mean"):
    """Broadcast a per-group aggregate of ``target_col`` back onto every row.

    Args:
        df: Input DataFrame.
        group_by: Column name, or list of column names, to group on.
        target_col: Column to aggregate.
        method: One of ``mean``, ``sum``, ``count``, ``median``, ``min``,
            ``max`` or ``mode``. For ``mode`` the first mode of each group is
            used (NaN when the group has no mode).

    Returns:
        A Series with one value per row of ``df``.

    Raises:
        ValueError: If ``method`` is not supported.
    """
    valid_methods = ["mean", "sum", "count", "median", "min", "max", "mode"]
    if method not in valid_methods:
        raise ValueError(f"Invalid method. Choose one of {valid_methods}")

    def _first_mode(values):
        # Compute the mode once; an all-NaN group yields an empty result.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    if method == "mode":
        if isinstance(group_by, (list, tuple)):
            # Series.map only works for a single key column, so multi-column
            # grouping goes through transform like the other methods do.
            grouped = df.groupby(list(group_by), sort=False)[target_col].transform(
                _first_mode
            )
            return pd.Series(grouped.values, index=df.index)
        per_group = df.groupby(group_by)[target_col].agg(_first_mode)
        return df[group_by].map(per_group)

    # Rebuild the Series from raw values so the result carries df's index
    # without any re-alignment.
    grouped = df.groupby(group_by, sort=False)[target_col].transform(method)
    return pd.Series(grouped.values, index=df.index)

#### KNN + DBscan + kmeans

In [None]:
def create_features_knn_dbscan_kmeans(
    df,
    cols,
    method="knn",
    n_neighbors=5,
    eps=0.5,
    min_samples=5,
    n_clusters=3,
    elbow=False,
    geo_cols=None,
    datetime_cols=None,
):
    """Add neighbour-distance or cluster-label features to ``df``.

    Args:
        df: DataFrame to augment; new columns are written onto it.
        cols: Feature columns fed to the estimator. The list itself is not
            modified.
        method: ``'knn'`` (adds ``knn_mean_distance`` / ``knn_max_distance``),
            ``'dbscan'`` (adds ``dbscan_cluster``) or ``'kmeans'`` (adds
            ``kmeans_cluster``).
        n_neighbors: Neighbour count for the KNN features.
        eps: DBSCAN neighbourhood radius used for the final clustering.
        min_samples: DBSCAN ``min_samples``.
        n_clusters: Number of KMeans clusters.
        elbow: When True, draw a diagnostic elbow plot before fitting.
        geo_cols: Optional columns standardised in place before fitting.
        datetime_cols: Optional datetime columns; year/month/day/hour/
            dayofweek parts are added to ``df`` and used as extra features.

    Returns:
        ``df`` with the new feature columns.

    Raises:
        ValueError: If ``method`` is not supported.
    """
    # Validate before touching df so a bad call leaves the inputs unchanged.
    if method not in ["knn", "dbscan", "kmeans"]:
        raise ValueError("Invalid method. Choose either 'knn', 'dbscan', or 'kmeans'.")

    # Private copy: extending the caller's list would make it grow on every
    # call when the same `cols` object is reused across cells.
    cols = list(cols)

    if geo_cols:
        df[geo_cols] = StandardScaler().fit_transform(df[geo_cols])

    if datetime_cols:
        parts = ("year", "month", "day", "hour", "dayofweek")
        for col in datetime_cols:
            for part in parts:
                df[f"{col}_{part}"] = getattr(df[col].dt, part)
        for part in parts:
            for col in datetime_cols:
                derived = f"{col}_{part}"
                if derived not in cols:
                    cols.append(derived)

    features = df[cols]

    if method == "knn":
        if elbow:
            mean_distances = []
            neighbors_range = range(1, 15)
            for k in neighbors_range:
                knn = NearestNeighbors(n_neighbors=k)
                knn.fit(features)
                distances, _ = knn.kneighbors(features)
                mean_distances.append(distances.mean(axis=1).mean())

            plt.figure(figsize=(8, 6))
            plt.plot(neighbors_range, mean_distances, marker="o")
            plt.title("Elbow Method for Optimal K (KNN)")
            plt.xlabel("Number of Neighbors (K)")
            plt.ylabel("Mean Distance to Nearest Neighbors")
            plt.xticks(neighbors_range)
            plt.grid()
            plt.show()

        knn = NearestNeighbors(n_neighbors=n_neighbors)
        knn.fit(features)
        distances, _ = knn.kneighbors(features)

        df["knn_mean_distance"] = distances.mean(axis=1)
        df["knn_max_distance"] = distances.max(axis=1)

    elif method == "dbscan":
        if elbow:
            eps_values = [i / 100 for i in range(1, 200)]
            num_clusters = []
            # The loop variable must not be called `eps`: that would replace
            # the caller's value and the final fit would always use 1.99.
            for candidate_eps in eps_values:
                dbscan = DBSCAN(eps=candidate_eps, min_samples=min_samples)
                clusters = dbscan.fit_predict(features)
                # Label -1 marks noise and is not counted as a cluster.
                num_clusters.append(len(set(clusters)) - (1 if -1 in clusters else 0))

            plt.figure(figsize=(8, 6))
            plt.plot(eps_values, num_clusters, marker="o")
            plt.title("Elbow Method for Optimal Epsilon (DBSCAN)")
            plt.xlabel("Epsilon")
            plt.ylabel("Number of Clusters")
            plt.grid()
            plt.show()

        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        df["dbscan_cluster"] = dbscan.fit_predict(features)

    elif method == "kmeans":
        if elbow:
            inertias = []
            k_values = range(1, 15)
            for k in k_values:
                kmeans = KMeans(n_clusters=k, random_state=42)
                kmeans.fit(features)
                inertias.append(kmeans.inertia_)

            plt.figure(figsize=(8, 6))
            plt.plot(k_values, inertias, marker="o")
            plt.title("Elbow Method for Optimal K (KMeans)")
            plt.xlabel("Number of Clusters (K)")
            plt.ylabel("Inertia")
            plt.xticks(k_values)
            plt.grid()
            plt.show()

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        df["kmeans_cluster"] = kmeans.fit_predict(features)

    return df

#### date time extraction

In [None]:
def preprocess_datetime(df, datetime_col):
    """Parse ``datetime_col`` and add its calendar parts as new columns.

    The column is converted with ``pd.to_datetime`` and ``Year``, ``Month``,
    ``Day``, ``Hour``, ``Minute`` and ``Day_of_Week`` are written onto ``df``
    itself, which is then returned.
    """
    df[datetime_col] = pd.to_datetime(df[datetime_col])

    # (new column name, attribute of the .dt accessor), in output order
    components = (
        ("Year", "year"),
        ("Month", "month"),
        ("Day", "day"),
        ("Hour", "hour"),
        ("Minute", "minute"),
        ("Day_of_Week", "dayofweek"),
    )
    for column_name, attribute in components:
        df[column_name] = getattr(df[datetime_col].dt, attribute)

    return df

In [None]:
# Time Delta
# df["Rptd - date occ"] = (df["Date Rptd"] - df["DATE OCC"]).dt.total_seconds() / 3600

#### PCA

In [None]:
def apply_pca(
    df,
    cols=None,
    n_components=2,
    datetime_col=None,
    lat_col=None,
    long_col=None,
    scaler_type="standard",
):
    """Append PCA components computed from numeric columns of ``df``.

    Args:
        df: Input DataFrame; a copy is modified and returned.
        cols: Columns to feed into PCA. When falsy, every numeric column is
            used.
        n_components: Number of principal components to keep.
        datetime_col: Optional column expanded with ``preprocess_datetime``
            before the numeric columns are selected.
        lat_col: Optional latitude column, scaled together with ``long_col``.
        long_col: Optional longitude column.
        scaler_type: Scaler name forwarded to ``scale_df``.

    Returns:
        A DataFrame with ``PCA_Component_1`` ... ``PCA_Component_n`` added.
    """
    # Always work on a copy: whether the caller's frame was modified used to
    # depend on which optional steps happened to run.
    df = df.copy()

    if datetime_col and datetime_col in df.columns:
        df = preprocess_datetime(df, datetime_col)

    if (
        lat_col is not None
        and long_col is not None
        and lat_col in df.columns
        and long_col in df.columns
    ):
        df = scale_df(df, [lat_col, long_col], scaler_type)

    if cols:
        df_numerical = df[cols]
    else:
        # "number" instead of float64/int64 only: the datetime parts are
        # int32 on pandas 2.x and would otherwise be silently left out.
        df_numerical = df.select_dtypes(include="number")

    df_numerical = scale_df(df_numerical, df_numerical.columns, scaler_type)

    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(df_numerical)

    for i in range(pca_features.shape[1]):
        df[f"PCA_Component_{i+1}"] = pca_features[:, i]

    return df

### numerical

#### scale

In [2]:
# Scale here: stack train and test, apply the scaler, then split them back.
df = pd.concat([train, test], axis=0)

# df = scale_df(df, num_cols)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

#### mathematical things

In [1]:
# Do your arithmetic feature engineering here, on train and test together.
df = pd.concat([train, test], axis=0)

# Option 1: keep NaN in the product whenever either input is missing.
# df['new_feature'] = np.where(df['feature1'].isna() | df['feature2'].isna(), np.nan, df['feature1'] * df['feature2'])

# Option 2: add missing-value indicator columns alongside the product.
# df['feature1_isna'] = df['feature1'].isna().astype(int)
# df['feature2_isna'] = df['feature2'].isna().astype(int)
# df['new_feature'] = df['feature1'] * df['feature2']

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

#### make bins out of numerical

In [None]:
# Stack train and test row-wise so the binning below sees both splits.
df = pd.concat([train, test], axis=0)


def bin_numerical_feature(df, num_col, bins=None, new_col_name=None, bin_label="range"):
    """Bin a numeric column and store a label per bin in a new column.

    Args:
        df: DataFrame to augment; the new column is written onto it.
        num_col: Numeric column to bin.
        bins: Passed to ``pd.cut``. When None, ``ceil(log2(n)) + 1`` bins are
            used, where ``n`` is the number of rows.
        new_col_name: Name of the output column; defaults to
            ``"<num_col>_binned"``.
        bin_label: How each bin is labelled: ``"mean"`` (midpoint of the bin
            edges), ``"median"`` (median of the values that fall in the bin),
            ``"min"`` (left edge), ``"max"`` (right edge), a list of custom
            labels (one per bin), or anything else to keep the interval.

    Returns:
        ``df`` with ``new_col_name`` added.

    Raises:
        ValueError: If a custom label list does not match the bin count.
    """
    if new_col_name is None:
        new_col_name = f"{num_col}_binned"

    if bins is None:
        bins = int(np.ceil(np.log2(len(df[num_col]))) + 1)

    # Cut once and reuse both the per-row assignment and the bin intervals.
    binned_feature = pd.cut(df[num_col], bins=bins, include_lowest=True)
    bin_edges = binned_feature.cat.categories

    # Define how to calculate the labels based on the bin_label input
    if bin_label == "mean":
        bin_values = [(edge.left + edge.right) / 2 for edge in bin_edges]
    elif bin_label == "median":
        # Use the actual bin membership from pd.cut. A manual
        # `left <= x <= right` test would also pick up values sitting exactly
        # on the left edge, which belong to the previous (right-closed) bin.
        codes = binned_feature.cat.codes.to_numpy()
        values = df[num_col].to_numpy()
        bin_values = [
            pd.Series(values[codes == position], dtype="float64").median()
            for position in range(len(bin_edges))
        ]
    elif bin_label == "min":
        bin_values = [edge.left for edge in bin_edges]
    elif bin_label == "max":
        bin_values = [edge.right for edge in bin_edges]
    elif isinstance(bin_label, list):
        # Use custom bin labels if provided
        if len(bin_label) != len(bin_edges):
            raise ValueError("The number of bin labels must match the number of bins.")
        bin_values = bin_label
    else:
        # Default to range if no specific method or list is provided
        bin_values = bin_edges

    # Map each interval to its label; missing values stay missing.
    df[new_col_name] = binned_feature.map(
        lambda x: bin_values[bin_edges.get_loc(x)] if pd.notna(x) else x
    )

    return df


# Example usage: five explicit bins on the first numeric column, labelled 0-4.
bins = [0, 12, 18, 30, 50, 100]
custom_labels = [0, 1, 2, 3, 4]
df = bin_numerical_feature(
    df,
    num_cols[0],
    bins=bins,
    new_col_name="Age Groups",
    bin_label=custom_labels,
)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

#### polynomial

In [None]:
# Stack train and test row-wise so polynomial features cover both splits.
df = pd.concat([train, test], axis=0)


def create_polynomial_features(df, num_cols, degree=2):
    """Expand ``num_cols`` into polynomial terms up to ``degree``.

    Returns:
        ``(df_combined, df_new)``: the original frame with the polynomial
        columns appended (both re-indexed from 0), and the polynomial
        columns on their own.
    """
    transformer = PolynomialFeatures(degree=degree, include_bias=False)
    expanded = transformer.fit_transform(df[num_cols])

    # Polynomial terms as a frame, named by the transformer.
    df_new = pd.DataFrame(
        expanded, columns=transformer.get_feature_names_out(num_cols)
    )

    # Drop both indexes before the column-wise concat so rows pair up by
    # position rather than by label.
    original = df.reset_index(drop=True)
    polynomial = df_new.reset_index(drop=True)
    df_combined = pd.concat([original, polynomial], axis=1)

    return df_combined, df_new


# df_combined, df_new = create_polynomial_features(df, num_cols, degree=2)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
# Group transform: per-group aggregate of a column, on train and test together.
df = pd.concat([train, test], axis=0)

# df["new_feature"] = group_transform(df, "feature", "target_col", "mean")

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
# KMeans / KNN / DBSCAN features on the numerical columns.
df = pd.concat([train, test], axis=0)

# df = create_features_knn_dbscan_kmeans(df, cols,  method="kmeans", elbow=True, n_clusters=5)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
# PCA on the numerical columns of train and test together.
df = pd.concat([train, test], axis=0)

# df = apply_pca(df, n_components=2)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

#### RFE feature selection

In [None]:
# Stack train and test row-wise before feature selection.
df = pd.concat([train, test], axis=0)


def select_features_rfe(df, target, num_cols):
    """Keep the five columns of ``num_cols`` chosen by recursive feature elimination.

    A ``LinearRegression`` is used as the ranking estimator. Returns ``df``
    restricted to the selected columns.
    """
    selector = RFE(LinearRegression(), n_features_to_select=5)
    fitted = selector.fit(df[num_cols], target)

    # support_ is a boolean mask aligned with num_cols.
    kept = [name for name, keep in zip(num_cols, fitted.support_) if keep]

    return df[kept]


# df = select_features_rfe(df, y, num_cols)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

### date time

##### extract date time

In [None]:
# Extract datetime parts on train and test together.
df = pd.concat([train, test], axis=0)

# df = preprocess_datetime(df, datetime_col)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

#### you can do pca with date time


In [1]:
# PCA including the datetime-derived columns.
df = pd.concat([train, test], axis=0)

# df = apply_pca(df, n_components=2)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
# KMeans / KNN / DBSCAN features on the datetime-derived columns.
df = pd.concat([train, test], axis=0)

# df = create_features_knn_dbscan_kmeans(df, cols, method="kmeans", elbow=True, n_clusters=5)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

### categorical

#### Encoders

In [None]:
# Stack train and test row-wise so encoders see the categories of both splits.
df = pd.concat([train, test], axis=0)


# Label Encoding
def label_encode(df, cat_cols):
    """Replace each column in ``cat_cols`` with integer labels, in place.

    One ``LabelEncoder`` instance is re-fitted for every column. Returns
    ``df``.
    """
    encoder = LabelEncoder()
    for column in cat_cols:
        encoded = encoder.fit_transform(df[column])
        df[column] = encoded
    return df


# One-Hot Encoding
def one_hot_encode(df, cat_cols):
    """Return a new frame with ``cat_cols`` expanded into dummy columns.

    The first level of every column is dropped (``drop_first=True``).
    """
    return pd.get_dummies(df, columns=cat_cols, drop_first=True)


# Frequency Encoding
def frequency_encode(df, cat_cols):
    """Replace each category with its relative frequency, in place.

    The value written is the share of rows holding that category (between 0
    and 1). Raw occurrence counts are what ``count_encode`` produces; this
    function previously duplicated it.

    Returns:
        ``df`` with ``cat_cols`` overwritten.
    """
    for col in cat_cols:
        # normalize=True turns counts into proportions of non-missing rows.
        freq = df[col].value_counts(normalize=True)
        df[col] = df[col].map(freq)
    return df


# Target Encoding
def target_encode(df, cat_cols, target):
    """Replace each category with the mean of ``target`` for that category.

    ``target`` is used to select a column of ``df`` after grouping. Columns
    are overwritten in place and ``df`` is returned.
    """
    for column in cat_cols:
        df[column] = df[column].map(df.groupby(column)[target].mean())
    return df


# Binary Encoding
def binary_encode(df, cat_cols):
    """Return ``df`` transformed by a ``BinaryEncoder`` fitted on ``cat_cols``."""
    return BinaryEncoder(cols=cat_cols).fit_transform(df)


# Count Encoding
def count_encode(df, cat_cols):
    """Replace each category with the number of rows holding it, in place.

    Returns ``df``.
    """
    for column in cat_cols:
        df[column] = df[column].map(df[column].value_counts())
    return df


# Group Rare Categories
def group_rare_categories(df, cat_cols, threshold=0.05):
    """Collapse categories rarer than ``threshold`` into ``"Other"``, in place.

    A category is rare when its share of the column's non-missing rows is
    strictly below ``threshold``. Returns ``df``.
    """
    for column in cat_cols:
        shares = df[column].value_counts(normalize=True)
        is_rare = (shares < threshold).to_numpy()
        rare_labels = shares.index[is_rare]
        df[column] = df[column].replace(rare_labels, "Other")
    return df


# Create Interaction Feature
def create_interaction_feature(df, col1, col2, new_col_name):
    """Add ``new_col_name`` holding ``"<col1 value>_<col2 value>"`` per row.

    Both inputs are cast to ``str`` first. The column is written onto ``df``,
    which is returned.
    """
    left = df[col1].astype(str)
    right = df[col2].astype(str)
    df[new_col_name] = left + "_" + right
    return df


# Ordinal Encoding
def ordinal_encode(df, cat_cols, mapping_dict):
    """Map each column in ``cat_cols`` through ``mapping_dict[column]``, in place.

    Values absent from a column's mapping become NaN. Returns ``df``.
    """
    for column in cat_cols:
        lookup = mapping_dict[column]
        df[column] = df[column].map(lookup)
    return df


# Pick the encoder(s) you need and uncomment the matching line.
# df = label_encode(df, cat_cols)
# df = one_hot_encode(df, cat_cols)
# df = frequency_encode(df, cat_cols)
# df = target_encode(df, cat_cols, y)
# df = binary_encode(df, cat_cols)
# df = count_encode(df, cat_cols)
# df = group_rare_categories(df, cat_cols)
# df = create_interaction_feature(df, col1, col2, new_col_name)
# df = ordinal_encode(df, cat_col, mapping)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
# KMeans / KNN / DBSCAN features on the encoded categorical columns.
df = pd.concat([train, test], axis=0)

# df = create_features_knn_dbscan_kmeans(df, cols,  method="kmeans", elbow=True, n_clusters=5)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

### Geo

##### Creating new features

In [None]:
# Stack train and test row-wise so geo features are computed for both splits.
df = pd.concat([train, test], axis=0)


def create_geo_distance_features(df, lat_col, lon_col, reference_point=None):
    """Add ``distance_from_reference``: geodesic distance in km per row.

    Args:
        df: DataFrame with latitude/longitude columns; the new column is
            written onto it.
        lat_col: Latitude column name.
        lon_col: Longitude column name.
        reference_point: ``(lat, lon)`` to measure from. Defaults to the mean
            latitude and mean longitude of ``df``.

    Returns:
        ``df`` with the distance column added.
    """
    coords = df[[lat_col, lon_col]].copy()

    if reference_point is None:
        # Fall back to the centre of the data (mean latitude, mean longitude).
        reference_point = (coords[lat_col].mean(), coords[lon_col].mean())

    def _km_to_reference(row):
        point = (row[lat_col], row[lon_col])
        return geodesic(point, reference_point).kilometers

    df["distance_from_reference"] = coords.apply(_km_to_reference, axis=1)

    return df


# df = create_geo_distance_features(df, "lat_col", "lon_col", reference_point=None)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
# KMeans / KNN / DBSCAN features on the geographic columns.
df = pd.concat([train, test], axis=0)

# df = create_features_knn_dbscan_kmeans(df, cols,  method="kmeans", elbow=True, n_clusters=5)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
# PCA including the geographic columns.
df = pd.concat([train, test], axis=0)

# df = apply_pca(df, n_components=2)

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

In [None]:
def calculate_geometric_features(df, lat_col, lon_col, reference_point=None):
    """Add bearing and quadrant features relative to a reference point.

    Two columns are written onto ``df``:

    * ``bearing``: initial bearing in degrees, in ``[0, 360)``, from each
      row's point towards the reference point.
    * ``relative_position``: ``"<N|S>_<E|W>"``; ``N`` when the row's latitude
      is strictly greater than the reference latitude and ``E`` when its
      longitude is strictly greater than the reference longitude.

    Args:
        df: DataFrame with latitude/longitude columns in degrees.
        lat_col: Latitude column name.
        lon_col: Longitude column name.
        reference_point: Sequence ``(lat, lon)`` in degrees. Defaults to the
            mean latitude and mean longitude of ``df``.

    Returns:
        ``df`` with the two columns added.
    """
    if reference_point is None:
        reference_point = [df[lat_col].mean(), df[lon_col].mean()]
    # Index explicitly instead of testing truthiness, so NumPy arrays and any
    # other (lat, lon) sequence are accepted.
    ref_lat, ref_lon = reference_point[0], reference_point[1]

    latitudes = df[lat_col].to_numpy(dtype=float)
    longitudes = df[lon_col].to_numpy(dtype=float)

    # Vectorised bearing; also works on an empty frame, where a row-wise
    # DataFrame.apply does not return a Series.
    lat1, lon1 = np.radians(latitudes), np.radians(longitudes)
    lat2, lon2 = np.radians(ref_lat), np.radians(ref_lon)
    d_lon = lon2 - lon1
    x = np.sin(d_lon) * np.cos(lat2)
    y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(d_lon)
    # arctan2 yields (-pi, pi]; shift into [0, 2*pi) before converting to degrees.
    df["bearing"] = (np.arctan2(x, y) + 2 * np.pi) % (2 * np.pi) * (180 / np.pi)

    # Calculate relative positioning (N, S, E, W)
    lat_pos = np.where(latitudes > ref_lat, "N", "S")
    lon_pos = np.where(longitudes > ref_lon, "E", "W")
    df["relative_position"] = [
        f"{ns}_{ew}" for ns, ew in zip(lat_pos, lon_pos)
    ]

    return df


# Bearing / quadrant features for train and test together.
df = calculate_geometric_features(pd.concat([train, test], axis=0), "LAT", "LON")

# Both slices are cut using the length train had before reassignment.
train, test = df.iloc[: len(train)], df.iloc[len(train) :]

### if a feature contains a list of features

In [1]:
# df["Modus_Operandi"] = df["Modus_Operandi"].fillna("")
# df["Modus_Operandi"] = df["Modus_Operandi"].apply(lambda x: x.split() if x else [])

# mlb = MultiLabelBinarizer()
# temp = mlb.fit_transform(df["Modus_Operandi"])
# temp = pd.DataFrame(temp, columns=mlb.classes_, index=df.index)
# df = pd.concat([df, temp], axis=1)
# df = df.drop(columns=["Modus_Operandi"])


# test["Modus_Operandi"] = test["Modus_Operandi"].fillna("")
# test["Modus_Operandi"] = test["Modus_Operandi"].apply(lambda x: x.split() if x else [])

# temp2 = mlb.transform(test["Modus_Operandi"])
# temp2 = pd.DataFrame(temp2, columns=mlb.classes_, index=test.index)
# test = pd.concat([test, temp2], axis=1)
# test = test.drop(columns=["Modus_Operandi"])

### More feature engineering is up to you...

### Mutual information 

In [None]:
def plot_mutual_information(df, target_series, task="classification"):
    """Print and plot the mutual information of every column of ``df`` with the target.

    Args:
        df: Feature frame; every column is scored.
        target_series: Target values passed to the scoring function.
        task: ``'classification'`` or ``'regression'``; selects the scoring
            function.

    Raises:
        ValueError: If ``task`` is neither of the two supported values.
    """
    if task not in ("classification", "regression"):
        raise ValueError(
            "Invalid task. Choose either 'classification' or 'regression'."
        )
    score_fn = (
        mutual_info_classif if task == "classification" else mutual_info_regression
    )

    scores = score_fn(df, target_series)

    # Tabulate and rank, highest score first.
    ranking = pd.DataFrame(
        {"Feature": df.columns, "Mutual Information": scores}
    ).sort_values(by="Mutual Information", ascending=False)

    print(ranking)

    # Horizontal bars; the y-axis is inverted so the top-ranked feature is
    # drawn at the top.
    plt.figure(figsize=(10, 6))
    plt.barh(ranking["Feature"], ranking["Mutual Information"], color="skyblue")
    plt.xlabel("Mutual Information")
    plt.ylabel("Feature")
    plt.title("Mutual Information between Features and Target")
    plt.gca().invert_yaxis()
    plt.show()

### Feature selection

In [1]:
# ...

# Models !

## split data

In [None]:
# X, X_val, y, y_val = train_test_split(train, y, test_size=0.2, random_state=42)

## metrics

In [None]:
# Define your metrics
# Metric registry: task -> metric name -> callable(y_true, y_pred).
metrics = {
    "classification": {
        "f1_score": lambda y_true, y_pred: f1_score(y_true, y_pred),
        "f1_macro": lambda y_true, y_pred: f1_score(y_true, y_pred, average="macro"),
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "roc_auc": roc_auc_score,  # For binary classification
        "roc_auc_ovr": lambda y_true, y_pred: roc_auc_score(
            y_true, y_pred, multi_class="ovr"
        ),  # For multi-class
        "roc_auc_ovo": lambda y_true, y_pred: roc_auc_score(
            y_true, y_pred, multi_class="ovo"
        ),  # For multi-class
        "log_loss": log_loss,
        "confusion_matrix": confusion_matrix,
        "classification_report": lambda y_true, y_pred: classification_report(
            y_true, y_pred
        ),
        "roc_curve": lambda y_true, y_pred: roc_curve(
            y_true, y_pred
        ),  # Returns FPR, TPR, thresholds
    },
    "regression": {
        "r2_score": r2_score,
        "mean_squared_error": mean_squared_error,
        "mean_absolute_error": mean_absolute_error,
        "median_absolute_error": median_absolute_error,
        "explained_variance_score": explained_variance_score,
        # RMSE as the square root of MSE. The `squared=False` keyword of
        # mean_squared_error is deprecated and removed in recent scikit-learn
        # releases, so it is not relied on here. For a single-output target
        # the result is identical.
        "root_mean_squared_error": lambda y_true, y_pred: np.sqrt(
            mean_squared_error(y_true, y_pred)
        ),
        "mean_squared_log_error": mean_squared_log_error,
        "max_error": max_error,
    },
}

## models


In [1]:
# Collected results of every training run in this notebook; later cells add to it.
models = []

### Model dicts

In [None]:
# Define regression models and parameters
# Registry of regression models: name -> {"model": estimator, "params": grid}.
regression_models = {
    "SVR": {
        "model": SVR(),
        "params": {
            "C": [0.1, 1, 10, 100],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto", 0.01, 0.1],
            "epsilon": [0.1, 0.2, 0.5],  # Epsilon for margin of tolerance
        },
    },
    "RandomForest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],  # Minimum samples at leaf node
            # "auto" is no longer accepted by recent scikit-learn; for
            # regressors it meant "all features", which is spelled 1.0.
            "max_features": [1.0, "sqrt"],  # Number of features to consider
        },
    },
    "XGBoost": {
        "model": XGBRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 6, 9, 12],
            "subsample": [0.6, 0.8, 1.0],  # Fraction of samples to use
            "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features to use
        },
    },
    "LightGBM1": {
        "model": LGBMRegressor(),
        "params": {
            "num_leaves": [31, 50, 100],
            "learning_rate": [0.01, 0.1, 0.2],
            "n_estimators": [100, 200, 300],
            "max_depth": [-1, 10, 20],  # Default is -1 (no limit)
            "min_child_samples": [20, 30],  # Minimum number of data points in a leaf
        },
    },
    "LightGBM5": {
        "model": LGBMRegressor(),
        "params": {
            "num_leaves": [31, 50, 100],
            "learning_rate": [0.01, 0.1, 0.2],
            "n_estimators": [100, 200, 300],
            "boosting_type": ["gbdt", "dart"],
            "max_depth": [-1, 10, 20],
        },
    },
    "CatBoost": {
        "model": CatBoostRegressor(silent=True),
        "params": {
            "iterations": [100, 200, 300],
            "learning_rate": [0.01, 0.1, 0.2],
            "depth": [6, 8, 10],
            "l2_leaf_reg": [3, 5, 7],  # L2 regularization coefficient
        },
    },
    # NOTE(review): tabular_learner appears to be fastai's learner factory
    # rather than an estimator object with fit/predict; confirm how this
    # entry is meant to be consumed before passing it to a grid search.
    "Fastai": {
        "model": tabular_learner,
        "params": {
            "layers": [[200, 100], [400, 200]],
            "metrics": [rmse],
            "emb_drop": [0.1, 0.2],
            "drop_mult": [0.5, 0.75],
        },
    },
}

In [None]:
# Define classification models and parameters
# Registry of classification models: name -> {"model": estimator, "params": grid}.
classification_models = {
    "SVM": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10, 100],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto", 0.001, 0.01],
            "class_weight": [None, "balanced"],
        },
    },
    "RandomForest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            # "auto" is no longer accepted by recent scikit-learn and was an
            # alias of "sqrt" for classifiers; "log2" takes its slot so the
            # grid still compares two settings.
            "max_features": ["sqrt", "log2"],
            "class_weight": [None, "balanced"],
        },
    },
    "XGBoost": {
        "model": XGBClassifier(eval_metric="mlogloss"),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 6, 9, 12],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
            "gamma": [0, 0.1, 0.2],
            "scale_pos_weight": [1, 10],
        },
    },
    "LightGBM1": {
        "model": LGBMClassifier(),
        "params": {
            "num_leaves": [31, 50, 70],
            "learning_rate": [0.01, 0.05, 0.1],
            "n_estimators": [100, 200, 300],
            "max_depth": [-1, 10, 20],
            "min_child_samples": [10, 20],
            "class_weight": [None, "balanced"],
        },
    },
    "LightGBM5": {
        "model": LGBMClassifier(),
        "params": {
            "num_leaves": [31, 50, 70],
            "learning_rate": [0.01, 0.05, 0.1],
            "n_estimators": [100, 200, 300],
            "boosting_type": ["gbdt", "dart"],
            "max_depth": [-1, 10, 20],
            "min_child_samples": [10, 20],
            "class_weight": [None, "balanced"],
        },
    },
    "CatBoost": {
        "model": CatBoostClassifier(silent=True),
        "params": {
            "iterations": [100, 200, 300],
            "learning_rate": [0.01, 0.05, 0.1],
            "depth": [6, 8, 10],
            "l2_leaf_reg": [3, 5, 10],
            "bagging_temperature": [0, 0.5, 1],
            # CatBoost's `class_weights` takes explicit per-class weights;
            # automatic balancing is requested via `auto_class_weights`.
            "auto_class_weights": [None, "Balanced"],
        },
    },
    # NOTE(review): tabular_learner appears to be fastai's learner factory
    # rather than an estimator object with fit/predict; confirm how this
    # entry is meant to be consumed before passing it to a grid search.
    "Fastai": {
        "model": tabular_learner,
        "params": {"layers": [[200, 100], [300, 150]], "metrics": [accuracy]},
    },
}

### SearchCv 

In [None]:
def fit_and_train_models(
    X_train,
    y_train,
    X_val,
    y_val,
    models_dict,
    metrics,
    metric_name,
    early_stopping_rounds=None,
):
    """Grid-search every model in ``models_dict`` and score it on a validation set.

    Args:
        X_train, y_train: Training data used by the 10-fold grid search.
        X_val, y_val: Hold-out data used for the reported validation score
            (and as ``eval_set`` when early stopping is requested).
        models_dict: ``{name: {"model": estimator, "params": param_grid}}``.
        metrics: ``{"classification": {...}, "regression": {...}}`` mapping
            metric names to ``callable(y_true, y_pred)``.
        metric_name: Key looked up first under ``"classification"``, then
            under ``"regression"``.
        early_stopping_rounds: When set, forwarded to ``fit`` for XGBoost /
            LightGBM / CatBoost regressors whose grid tunes ``n_estimators``.

    Returns:
        A list with one dict per trained model: ``name``, ``model``,
        ``best_params``, ``best_score`` (cross-validated, in the metric's own
        units) and ``val_score``.

    Raises:
        ValueError: If ``metric_name`` is in neither metric group.
    """
    # Metrics where a smaller value means a better model. They must be handed
    # to make_scorer with greater_is_better=False, otherwise the grid search
    # maximises the error and selects the worst candidate.
    lower_is_better_metrics = {
        "log_loss",
        "mean_squared_error",
        "mean_absolute_error",
        "median_absolute_error",
        "root_mean_squared_error",
        "mean_squared_log_error",
        "max_error",
    }

    # Resolve the metric once; classification takes precedence.
    if metric_name in metrics["classification"]:
        metric_fn = metrics["classification"][metric_name]
    elif metric_name in metrics["regression"]:
        metric_fn = metrics["regression"][metric_name]
    else:
        raise ValueError("Unsupported metric")

    lower_is_better = metric_name in lower_is_better_metrics
    scorer = make_scorer(metric_fn, greater_is_better=not lower_is_better)

    models = []

    for name, model_info in models_dict.items():
        model = model_info["model"]
        params = model_info["params"]

        # Entries that are not estimators cannot be grid-searched; skip them
        # instead of failing later on a missing best_estimator_.
        if not hasattr(model, "fit"):
            print(f"Skipping {name}: model has no fit() method.\n")
            continue

        # Use GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(model, params, scoring=scorer, cv=10, n_jobs=-1)

        use_early_stopping = (
            "n_estimators" in params
            and early_stopping_rounds is not None
            and isinstance(model, (XGBRegressor, LGBMRegressor, CatBoostRegressor))
        )
        if use_early_stopping:
            # Extra keyword arguments are forwarded to the estimator's fit.
            grid_search.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=early_stopping_rounds,
                verbose=False,
            )
        else:
            grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        # The scorer negates lower-is-better metrics; flip the sign back so
        # best_score is comparable with val_score.
        best_score = grid_search.best_score_
        if lower_is_better:
            best_score = -best_score

        # Evaluate on the validation set
        y_pred = best_model.predict(X_val)
        val_score = metric_fn(y_val, y_pred)

        models.append(
            {
                "name": name,
                "model": best_model,
                "best_params": best_params,
                "best_score": best_score,
                "val_score": val_score,
            }
        )

        # Print the results for the current model
        print(f"Model: {name}")
        print(f"Best Parameters: {best_params}")
        print(f"Best Score: {best_score:.4f}")
        print(f"Validation Score: {val_score:.4f}\n")

    return models


# fit_and_train_models returns a list of result dicts; extend keeps `models`
# a flat list of dicts instead of nesting that list as a single element.
models.extend(
    fit_and_train_models(
        X_train,
        y_train,
        X_val,
        y_val,
        models_dict,
        metrics,
        metric_name,
        early_stopping_rounds=20,
    )
)

### TabularPredictor

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import seaborn as sns


def train_and_evaluate_model(
    X_train,
    y_train,
    X_test,
    y_test,
    is_classification=True,
    eval_metric="f1_macro",
    time_limit=3600,
    included_model_types=None,
):
    """Fit a ``TabularPredictor`` and score its predictions on ``X_test``.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Evaluation features and target.
        is_classification: Selects binary/multiclass vs. regression and the
            metric group used for the final score.
        eval_metric: Passed to the predictor and also used as the key into
            the module-level ``metrics`` dict.
        time_limit: Fit time limit handed to the predictor.
        included_model_types: Model type keys to train; defaults to the list
            below.

    Returns:
        A dict with ``name`` (best model), ``model`` (the predictor),
        ``best_params``, ``best_score`` (validation score of the best model
        from the leaderboard, or None) and ``val_score`` (score on the test
        data).
    """
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Define all possible model types
    # NOTE(review): not every key here is necessarily recognised by the
    # installed predictor version - confirm against its documentation.
    if included_model_types is None:
        included_model_types = [
            "GBM",
            "CAT",
            "XGB",
            "RF",
            "KNN",
            "NN",
            "LR",
            "XT",
            "FASTAI",
            "NN_TORCH",
            "ET",
            "DTR",
            "AB",
            "GBR",
            "H2O",
            "LightGBM",
            "CatBoost",
        ]

    # Determine problem type for classification
    if is_classification:
        num_classes = len(set(y_train))
        problem_type = "binary" if num_classes == 2 else "multiclass"
    else:
        problem_type = "regression"

    # Set target column name
    y_train.name = "target_column_name"

    # Concatenate X and y for training data
    train_data = X_train.copy()
    train_data["target_column_name"] = y_train

    # Create a TabularPredictor with best parameters
    predictor = TabularPredictor(
        log_file_path="logs.txt",
        log_to_file=True,
        label="target_column_name",
        eval_metric=eval_metric,
        problem_type=problem_type,
    )

    # Fit the model with a time limit
    predictor.fit(
        train_data,
        time_limit=time_limit,
        presets="best_quality",
        num_bag_folds=10,
        num_bag_sets=2,
        # tuning_data=None,        # Disable internal splitting for validation
        # holdout_frac=0,
        num_stack_levels=2,
        keep_only_best=True,
        verbosity=2,
        # num_gpus=1,
        num_cpus=1,
        # excluded_model_types=["KNN", "NN", "XT", "FASTAI", "NN_TORCH"],
        included_model_types=included_model_types,
    )

    # Get the leaderboard
    leaderboard = predictor.leaderboard(silent=True)
    best_model = predictor.get_model_best()
    model_dict = predictor.get_model_full_dict()

    # Both results are initialised up front so the return statement can never
    # hit an unbound name, whichever branch runs.
    best_params = None
    best_score = None
    if best_model in model_dict:
        best_params = model_dict[best_model]
    else:
        print(f"Model '{best_model}' not found in model_dict.")

    # Validation score of the best model, read from the same leaderboard
    # columns ("model", "score_val") that are displayed below.
    best_rows = leaderboard.loc[leaderboard["model"] == best_model, "score_val"]
    if not best_rows.empty:
        best_score = best_rows.iloc[0]

    # For Jupyter notebooks, display the styled leaderboard
    try:
        display(
            leaderboard.style.background_gradient(subset=["score_val"], cmap="RdYlGn")
        )
    except ImportError:
        # Convert leaderboard DataFrame to image using seaborn and matplotlib
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            leaderboard[["model", "score_val"]].set_index("model").T,
            annot=True,
            cmap="RdYlGn",
            cbar=False,
            fmt=".3f",
        )
        plt.title("Leaderboard")
        plt.savefig("leaderboard.png")
        print(
            "Leaderboard saved as 'leaderboard.png'. Open the file to view the table."
        )

    # Make predictions
    y_pred = predictor.predict(X_test)

    # Evaluate the model with the module-level metrics registry
    if is_classification:
        score = metrics["classification"][eval_metric](y_test, y_pred)
    else:
        score = metrics["regression"][eval_metric](y_test, y_pred)

    print(f"Model score: {score}")

    return {
        "name": best_model,
        "model": predictor,
        "best_params": best_params,
        "best_score": best_score,
        "val_score": score,
    }


# Train the predictor on (X, y), score it on (X_val, y_val) with ROC AUC,
# and append the resulting summary dict to the models list.
model_result = train_and_evaluate_model(
    X, y, X_val, y_val, is_classification=True, eval_metric="roc_auc"
)
models.append(model_result)

### NN

In [None]:
# Custom R2 Score metric
class R2Score(tf.keras.metrics.Metric):
    """Streaming coefficient of determination (R^2) for Keras models.

    The metric accumulates, across every batch seen since the last reset:

    * ``sum_squared_errors`` -- the residual sum of squares, and
    * ``total_variance`` -- the total sum of squares of ``y_true`` around its
      *running global* mean.

    The total sum of squares is merged batch by batch with the parallel
    variance update (Chan et al.), so the reported value matches the R^2 of
    the whole epoch. Centering each batch on its own mean would drop the
    between-batch variance and bias the score downwards.

    Args:
        name: Name under which Keras logs the metric.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name="r2_score", **kwargs):
        super().__init__(name=name, **kwargs)
        self.sum_squared_errors = self.add_weight(name="sse", initializer="zeros")
        self.total_variance = self.add_weight(name="tv", initializer="zeros")
        # Running mean and element count of y_true, needed to merge batches.
        self.mean_true = self.add_weight(name="mean_true", initializer="zeros")
        self.sample_count = self.add_weight(name="sample_count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Fold one batch into the running statistics.

        Args:
            y_true: Ground-truth targets.
            y_pred: Model predictions with the same number of elements.
            sample_weight: Accepted for API compatibility; it is ignored.
        """
        # Cast and flatten so integer targets or a (batch,) vs (batch, 1)
        # shape difference cannot trigger dtype errors or broadcasting.
        y_true = tf.reshape(tf.cast(y_true, self.dtype), [-1])
        y_pred = tf.reshape(tf.cast(y_pred, self.dtype), [-1])

        self.sum_squared_errors.assign_add(tf.reduce_sum(tf.square(y_true - y_pred)))

        batch_count = tf.cast(tf.size(y_true), self.dtype)
        batch_mean = tf.math.divide_no_nan(tf.reduce_sum(y_true), batch_count)
        batch_m2 = tf.reduce_sum(tf.square(y_true - batch_mean))

        # Parallel merge of (count, mean, M2) for the old state and the batch.
        delta = batch_mean - self.mean_true
        new_count = self.sample_count + batch_count
        self.total_variance.assign_add(
            batch_m2
            + tf.square(delta)
            * tf.math.divide_no_nan(self.sample_count * batch_count, new_count)
        )
        self.mean_true.assign_add(
            delta * tf.math.divide_no_nan(batch_count, new_count)
        )
        self.sample_count.assign(new_count)

    def result(self):
        """Return ``1 - SSE / SST``; epsilon guards against a zero SST."""
        return 1 - (
            self.sum_squared_errors / (self.total_variance + tf.keras.backend.epsilon())
        )

    def reset_state(self):
        """Zero every accumulator (called by Keras between epochs)."""
        self.sum_squared_errors.assign(0.0)
        self.total_variance.assign(0.0)
        self.mean_true.assign(0.0)
        self.sample_count.assign(0.0)

    def reset_states(self):
        """Deprecated spelling of ``reset_state`` kept for older Keras versions."""
        self.reset_state()


def custom_f1(y_true, y_pred):
    """Batch-wise F1 score usable as a Keras metric function.

    True positives, actual positives and predicted positives are pooled over
    every element of the batch (all classes together), so this is a single
    pooled F1 value, not a per-class macro average.

    Args:
        y_true: Ground-truth labels. Either integer class indices (any shape
            that flattens to ``(batch,)``) or, for multi-class outputs, an
            already one-hot tensor shaped like ``y_pred``.
        y_pred: Model outputs. A single-unit (sigmoid) output is treated as
            binary; a wider output is treated as per-class scores.

    Returns:
        Scalar tensor ``2 * P * R / (P + R + epsilon)``. Predictions are
        thresholded by rounding, i.e. a score counts as positive above 0.5.
    """
    y_pred = K.cast(y_pred, "float32")
    n_outputs = K.int_shape(y_pred)[-1] if K.ndim(y_pred) > 1 else 1

    if n_outputs == 1:
        # Binary case: compare flat label and probability vectors directly.
        # One-hot encoding with depth 1 would map label 0 -> 1 and 1 -> 0.
        y_true = K.reshape(K.cast(y_true, "float32"), (-1,))
        y_pred = K.reshape(y_pred, (-1,))
    else:
        depth = n_outputs if n_outputs is not None else K.shape(y_pred)[-1]
        already_one_hot = (
            n_outputs is not None
            and K.ndim(y_true) == K.ndim(y_pred)
            and K.int_shape(y_true)[-1] == n_outputs
        )
        if already_one_hot:
            y_true = K.cast(y_true, "float32")
        else:
            # Flatten first: Keras may hand over labels as (batch, 1), and
            # one-hot of that would be (batch, 1, C) and broadcast wrongly.
            labels = K.reshape(K.cast(y_true, "int32"), (-1,))
            y_true = K.one_hot(labels, num_classes=depth)

    # Round predictions to the nearest integer (0 or 1).
    y_pred = K.round(y_pred)

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (actual_positives + K.epsilon())

    # Harmonic mean; epsilon keeps the result finite when both terms are 0.
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


# Metrics dictionary
# Maps task type ("classification" / "regression") and a metric key to the
# object handed to `model.compile(metrics=[...])` by NeuralNetworkModel.
# Values are either Keras metric-name strings, a metric function, or a metric
# instance.
# "roc_auc" and "r2_score" are single instances created once here, so every
# model compiled with them receives the same metric object.
# "f1_macro" maps to custom_f1, which computes one pooled precision/recall over
# the whole batch rather than a per-class macro average.
# NOTE(review): this rebinds the global name `metrics`. Code that calls
# metrics[...][name](y_true, y_pred) directly will fail for the string entries
# below -- confirm no such caller runs after this cell.
# NOTE(review): the lowercase strings "precision" and "recall" may not be
# accepted as metric identifiers by every Keras version -- TODO confirm.
metrics = {
    "classification": {
        # "f1_macro": tfa.metrics.F1Score(average='macro', num_classes=2),  # Updated instantiation
        "f1_macro": custom_f1,
        # "f1_macro": F1Macro(),
        "accuracy": "accuracy",
        "precision": "precision",
        "recall": "recall",
        "roc_auc": tf.keras.metrics.AUC(name="roc_auc"),  # Updated AUC instantiation
    },
    "regression": {
        "mean_squared_error": "mean_squared_error",
        "mean_absolute_error": "mean_absolute_error",
        "r2_score": R2Score(),
    },
}


class NeuralNetworkModel:
    """Train one or more dense Keras networks and predict with all of them.

    Args:
        is_classification: True for classification (labels are ordinal-encoded
            and a sigmoid/softmax head is used), False for regression.
        metric_name: Key into the module-level ``metrics`` dictionary for the
            selected task type; the metric is compiled into the model and
            monitored by the callbacks.
        n_runs: Number of independently initialised networks to train.

    Attributes:
        models: One result dict per completed run with the keys ``name``,
            ``model``, ``best_params``, ``best_score`` and ``val_score``.
        ordinal_encoder: Encoder fitted on the training labels
            (classification only), used to map predictions back to labels.
    """

    def __init__(self, is_classification=True, metric_name="roc_auc", n_runs=1):
        self.is_classification = is_classification
        self.metric_name = metric_name
        self.n_runs = n_runs
        self.models = []
        self.ordinal_encoder = None

    def _resolve_metric(self):
        """Return ``(metric, log_name)`` for the configured metric key.

        ``log_name`` is the name Keras uses in its training logs: the string
        itself, a metric instance's ``name``, or a function's ``__name__``.
        It can differ from the dictionary key (e.g. a function registered
        under another key). Raises KeyError when the key is unknown.
        """
        group = "classification" if self.is_classification else "regression"
        metric = metrics[group][self.metric_name]
        if isinstance(metric, str):
            return metric, metric
        log_name = (
            getattr(metric, "name", None)
            or getattr(metric, "__name__", None)
            or self.metric_name
        )
        return metric, log_name

    def _encode_targets(self, y_train, y_val):
        """Return ``(y_train, y_val, n_outputs)`` ready for ``model.fit``.

        Classification labels are ordinal-encoded to int32; binary problems
        use a single output unit. Regression targets are passed through.
        """
        if not self.is_classification:
            return y_train, y_val, 1

        self.ordinal_encoder = OrdinalEncoder()
        y_train_encoded = (
            self.ordinal_encoder.fit_transform(y_train.values.reshape(-1, 1))
            .ravel()
            .astype(np.int32)
        )
        y_val_encoded = (
            self.ordinal_encoder.transform(y_val.values.reshape(-1, 1))
            .ravel()
            .astype(np.int32)
        )
        num_classes = len(self.ordinal_encoder.categories_[0])
        return y_train_encoded, y_val_encoded, 1 if num_classes == 2 else num_classes

    def _build_model(self, input_shape, n_outputs, metric):
        """Build and compile the network for the given output width."""
        if not self.is_classification:
            activation, loss_function = "linear", "mean_squared_error"
        elif n_outputs == 1:
            activation, loss_function = "sigmoid", "binary_crossentropy"
        else:
            activation, loss_function = "softmax", "sparse_categorical_crossentropy"

        model = keras.Sequential(
            [
                layers.Input(shape=input_shape),
                layers.BatchNormalization(),
                layers.Dense(128, activation="relu"),
                layers.Dropout(0.3),
                layers.BatchNormalization(),
                layers.Dense(64, activation="relu"),
                layers.Dropout(0.3),
                layers.BatchNormalization(),
                layers.Dense(32, activation="relu"),
                layers.Dense(n_outputs, activation=activation),
            ]
        )
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001),
            loss=loss_function,
            metrics=[metric],
        )
        return model

    def create_and_train(self, X_train, y_train, X_val, y_val):
        """Train ``n_runs`` networks and append one result dict per run.

        Args:
            X_train: Training features; ``X_train.shape[1]`` sets the input width.
            y_train: Training targets (must expose ``.values`` for classification).
            X_val: Validation features.
            y_val: Validation targets.

        Returns:
            None. Results are stored in ``self.models``. If the metric key is
            unknown an error is printed and nothing is trained.
        """
        try:
            metric, log_name = self._resolve_metric()
        except KeyError:
            print(f"Error: Metric '{self.metric_name}' not found.")
            return None

        monitor = f"val_{log_name}"
        # Error/loss metrics improve downwards, every other metric upwards.
        # The mode is set explicitly because the callbacks' "auto" mode only
        # recognises a few metric names and otherwise minimises.
        mode = "min" if ("error" in log_name or "loss" in log_name) else "max"

        y_train_encoded, y_val_encoded, n_outputs = self._encode_targets(
            y_train, y_val
        )
        input_shape = (X_train.shape[1],)

        for i in range(self.n_runs):
            print(f"Training run {i + 1}/{self.n_runs}")
            model = self._build_model(input_shape, n_outputs, metric)

            early_stopping = callbacks.EarlyStopping(
                monitor=monitor,
                mode=mode,
                patience=7,
                restore_best_weights=True,
            )
            model_checkpoint = callbacks.ModelCheckpoint(
                "best_model.keras",
                monitor=monitor,
                mode=mode,
                save_best_only=True,
            )

            history = model.fit(
                X_train,
                y_train_encoded,
                validation_data=(X_val, y_val_encoded),
                epochs=500,
                batch_size=64,
                callbacks=[early_stopping, model_checkpoint],
                verbose=1,
            )

            model.load_weights("best_model.keras")

            # Score the restored weights with the compiled metric. This works
            # for string, function and instance metrics alike and uses the
            # encoded labels the network was trained on.
            results = model.evaluate(
                X_val, y_val_encoded, verbose=0, return_dict=True
            )
            score = next(
                (value for key, value in results.items() if key != "loss"), None
            )

            print("Score:", score)
            best_score = min(history.history["val_loss"])
            best_params = model.get_config()

            self.models.append(
                {
                    "name": f"NN_Run_{i + 1}",
                    "model": model,
                    "best_params": best_params,
                    "best_score": best_score,
                    "val_score": score,
                }
            )

        return None

    def predict(self, X):
        """Return a list with one prediction array per trained model.

        Classification outputs are thresholded at 0.5 (single-unit head) or
        arg-maxed (multi-unit head) and mapped back to the original labels;
        regression outputs are returned as produced by the network.
        """
        predictions = []
        for model_info in self.models:
            y_pred = model_info["model"].predict(X)

            if not self.is_classification:
                predictions.append(y_pred)
                continue

            if y_pred.shape[-1] == 1:
                y_pred_classes = (y_pred > 0.5).astype(int)
            else:
                y_pred_classes = y_pred.argmax(axis=1)
            y_pred_labels = self.ordinal_encoder.inverse_transform(
                y_pred_classes.reshape(-1, 1)
            ).ravel()
            predictions.append(y_pred_labels)

        return predictions


# Example usage
# nn_model = NeuralNetworkModel(is_classification=True, metric_name="f1_macro", n_runs=3)
# nn_model.create_and_train(X, y, X_val, y_val)
# predictions = nn_model.predict(X_test)

### Stacking & More

#### Stack with ready models

In [None]:
class StackedModel:
    """Stack already-trained base models under a meta-model.

    Args:
        trained_models: List of dicts whose ``"model"`` entry is a fitted
            estimator exposing ``predict`` (and optionally ``predict_proba``).
        meta_model: Estimator with ``fit(X, y)`` and ``predict(X)`` trained on
            the base models' predictions.
        is_classification: When True, ``predict_proba`` is preferred over
            ``predict`` for building the meta-features.
    """

    def __init__(self, trained_models, meta_model, is_classification=True):
        self.trained_models = trained_models  # List of trained models
        self.meta_model = meta_model
        self.is_classification = is_classification

    def _stack_features(self, X):
        """Return the base models' predictions on ``X`` stacked column-wise."""
        if not self.trained_models:
            raise ValueError("StackedModel needs at least one trained base model.")

        columns = []
        for model_info in self.trained_models:
            model = model_info["model"]  # Use already trained models
            # Predict probabilities or fall back to the plain prediction.
            if self.is_classification and hasattr(model, "predict_proba"):
                y_pred = model.predict_proba(X)
            else:
                y_pred = model.predict(X)

            # Base models may return lists or pandas objects; normalise them
            # so that .ndim / .reshape are always available.
            y_pred = np.asarray(y_pred)
            if not self.is_classification or y_pred.ndim == 1:
                y_pred = y_pred.reshape(-1, 1)
            columns.append(y_pred)

        return np.hstack(columns)

    def fit(self, X_val, y_val):
        """Fit the meta-model on base-model predictions for ``X_val``."""
        self.meta_model.fit(self._stack_features(X_val), y_val)

    def predict(self, X):
        """Return the meta-model's prediction for ``X``."""
        return self.meta_model.predict(self._stack_features(X))


# Example usage:

# Assuming you already have your trained models in a list as described
# trained_models = [
#     {
#         "name": best_model,
#         "model": predictor,
#         "best_params": best_params,
#         "best_score": best_score,
#         "val_score": score,
#     },
#     ...
# ]

# Define your meta-model (for example, LogisticRegression for classification)
# meta_model = LogisticRegression(
#     C=1.0,
#     penalty="l2",
#     solver="lbfgs",
#     max_iter=100,
#     class_weight="balanced",
#     random_state=42,
# )

# # Create an instance of StackedModel for classification
# stacked_model = StackedModel(
#     trained_models=trained_models, meta_model=meta_model, is_classification=True
# )

# # Fit the stacked model with validation data
# stacked_model.fit(X_val, y_val)

# # Predict using the stacked model
# y_pred = stacked_model.predict(X_test)

#### Stack without ready models

In [None]:
class StackedModel:
    """Train base models, then stack their predictions under a meta-model.

    Args:
        models: List of dicts whose ``"model"`` entry is an unfitted estimator
            exposing ``fit`` and ``predict`` (and optionally ``predict_proba``).
        meta_model: Estimator with ``fit(X, y)`` and ``predict(X)`` trained on
            the base models' validation predictions.
        is_classification: When True, ``predict_proba`` is preferred over
            ``predict`` for building the meta-features.

    Attributes:
        fitted_models: Base estimators fitted by the most recent ``fit`` call.
    """

    def __init__(self, models, meta_model, is_classification=True):
        self.models = models
        self.meta_model = meta_model
        self.is_classification = is_classification
        self.fitted_models = []

    def _base_prediction(self, model, X):
        """Return one base model's prediction on ``X`` as a 2-D array."""
        # Predict probabilities or fall back to the plain prediction.
        if self.is_classification and hasattr(model, "predict_proba"):
            y_pred = model.predict_proba(X)
        else:
            y_pred = model.predict(X)

        # Base models may return lists or pandas objects; normalise them so
        # that .ndim / .reshape are always available.
        y_pred = np.asarray(y_pred)
        if not self.is_classification or y_pred.ndim == 1:
            y_pred = y_pred.reshape(-1, 1)
        return y_pred

    def fit(self, X_train, y_train, X_val, y_val):
        """Fit every base model on the training split and the meta-model on
        the base models' predictions for the validation split."""
        # Start from a clean slate so that calling fit() again does not leave
        # stale estimators behind and widen the meta-features in predict().
        self.fitted_models = []
        base_predictions = []

        for model_info in self.models:
            model = model_info["model"]
            model.fit(X_train, y_train)
            base_predictions.append(self._base_prediction(model, X_val))
            self.fitted_models.append(model)

        X_stack = np.hstack(base_predictions)
        self.meta_model.fit(X_stack, y_val)

    def predict(self, X):
        """Return the meta-model's prediction for ``X``."""
        base_predictions = [
            self._base_prediction(model, X) for model in self.fitted_models
        ]
        X_stack = np.hstack(base_predictions)
        return self.meta_model.predict(X_stack)


# Example usage:

# Define base models with expanded parameters including linear, logistic, and non-linear models
# Base estimators for StackedModel: each entry pairs a display name with an
# unfitted estimator.
# NOTE(review): LinearRegression and SVR are regressors; with
# is_classification=True they contribute raw `predict` output (they have no
# `predict_proba`) -- confirm that mixing them with classifiers is intended.
# NOTE(review): this rebinds the global `models` list -- confirm that no
# earlier results stored under the same name are still needed.
models = [
    {
        "name": "RandomForest",
        "model": RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            bootstrap=True,
            random_state=42,
            # "auto" meant sqrt(n_features) for classifiers and is rejected by
            # scikit-learn >= 1.3; "sqrt" is the equivalent supported value.
            max_features="sqrt",
            oob_score=True,
            n_jobs=-1,
            class_weight="balanced",
        ),
    },
    {
        "name": "XGBoost",
        "model": XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.01,
            reg_lambda=0.01,
            gamma=0.2,
            scale_pos_weight=1,
            random_state=42,
            use_label_encoder=False,
            eval_metric="logloss",
        ),
    },
    {
        "name": "LightGBM",
        "model": LGBMClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=8,
            num_leaves=31,
            min_child_samples=20,
            min_child_weight=0.001,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.01,
            reg_lambda=0.01,
            random_state=42,
        ),
    },
    {
        "name": "CatBoost",
        "model": CatBoostClassifier(
            iterations=200,
            learning_rate=0.1,
            depth=6,
            l2_leaf_reg=3,
            # `subsample` is not supported by the Bayesian bootstrap; Bernoulli
            # keeps the intended 80% row sampling.
            bootstrap_type="Bernoulli",
            subsample=0.8,
            random_state=42,
            verbose=0,
        ),
    },
    {
        "name": "LinearRegression",
        "model": LinearRegression(),
    },
    {
        "name": "LogisticRegression",
        "model": LogisticRegression(
            C=1.0,
            penalty="l2",
            solver="lbfgs",
            max_iter=100,
            class_weight="balanced",
            random_state=42,
        ),
    },
    {
        "name": "SVR",
        "model": SVR(
            kernel="rbf", C=1.0, epsilon=0.1
        ),  # Example of a non-linear regression model
    },
]

# Define your meta-model (for example, LogisticRegression for classification)
# meta_model = LogisticRegression(
#     C=1.0,
#     penalty="l2",
#     solver="lbfgs",
#     max_iter=100,
#     class_weight="balanced",
#     random_state=42,
# )

# # Create an instance of StackedModel for classification
# stacked_model = StackedModel(
#     models=models, meta_model=meta_model, is_classification=True
# )

# # Fit the stacked model (example with training and validation data)
# stacked_model.fit(X_train, y_train, X_val, y_val)

# # Predict using the stacked model
# y_pred = stacked_model.predict(X_test)

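# # Plot meta-model weights
# # NOTE: StackedModel does not define plot_model_weights(); implement it
# # before enabling the call below.
# stacked_model.plot_model_weights()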

#### Optuna!

In [None]:
# Configure the root logger once for this cell: INFO level, with each record
# formatted as "<timestamp> - <message>". ModelOptimizer logs through it.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")


class ModelOptimizer:
    """Tune gradient-boosting and random-forest classifiers with Optuna.

    Each supported model has an ``objective_*`` method that samples a set of
    hyper-parameters from an Optuna trial and returns the mean cross-validated
    score, which Optuna maximises.

    Args:
        train: Feature matrix used for cross-validation.
        y: Target vector aligned with ``train``.
        models: Names of the models to tune; a falsy value selects all
            supported models.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        n_trials: Number of Optuna trials per model.
        scoring: Scorer name forwarded to ``cross_val_score``.

    Attributes:
        best_params: ``{model name: best hyper-parameters}`` for tuned models.
        studies: ``{model name: optuna study}`` for tuned models.
    """

    # Supported model name -> name of the objective method that tunes it.
    # The order here is the order in which run_optimization tunes the models.
    _OBJECTIVES = {
        "LGBMClassifier": "objective_lgbm",
        "XGBClassifier": "objective_xgb",
        "RandomForestClassifier": "objective_rf",
        "CatBoostClassifier": "objective_catboost",
    }

    def __init__(self, train, y, models=None, cv=5, n_trials=100, scoring="roc_auc"):
        logging.info(
            "Initializing ModelOptimizer with models=%s, cv=%d, n_trials=%d, and scoring=%s",
            models, cv, n_trials, scoring
        )
        self.train = train
        self.y = y
        self.cv = cv
        self.n_trials = n_trials
        self.scoring = scoring
        self.best_params = {}
        self.studies = {}

        # Allow a list of models to be passed
        self.models = models if models else list(self._OBJECTIVES)

    def _cv_score(self, step_name, estimator):
        """Return the mean cross-validated score of ``estimator``."""
        pipeline = Pipeline(steps=[(step_name, estimator)])
        return cross_val_score(
            pipeline, self.train, self.y, cv=self.cv, scoring=self.scoring
        ).mean()

    def optimize_model(self, name, objective):
        """Run an Optuna study for ``objective`` and record its best parameters.

        Args:
            name: Key under which the results are stored.
            objective: Callable taking an Optuna trial and returning a score
                to maximise.

        Returns:
            The best hyper-parameters found by the study.
        """
        logging.info("Optimizing model: %s", name)
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=self.n_trials)
        self.best_params[name] = study.best_params
        self.studies[name] = study
        logging.info(
            "Optimization for %s completed. Best params: %s",
            name,
            self.best_params[name],
        )
        return study.best_params

    def objective_lgbm(self, trial):
        """Optuna objective for ``LGBMClassifier``."""
        lgbm_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "num_leaves": trial.suggest_int("num_leaves", 31, 70),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1.0),
            "subsample": trial.suggest_float("subsample", 0.6, 0.9),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
            "is_unbalance": trial.suggest_categorical("is_unbalance", [True, False]),
        }
        return self._cv_score(
            "lgbm", LGBMClassifier(**lgbm_params, random_state=42, verbose=-1)
        )

    def objective_xgb(self, trial):
        """Optuna objective for ``XGBClassifier``."""
        xgb_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "gamma": trial.suggest_float("gamma", 0.1, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1.0),
            "subsample": trial.suggest_float("subsample", 0.6, 0.9),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
        }
        return self._cv_score(
            "xgb", XGBClassifier(**xgb_params, random_state=42, verbosity=0)
        )

    def objective_rf(self, trial):
        """Optuna objective for ``RandomForestClassifier``."""
        rf_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_categorical("max_depth", [10, 20, 30]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4),
            # "auto" is not offered: for classifiers it was an alias of "sqrt"
            # and scikit-learn >= 1.3 rejects it, which would fail the trial.
            "max_features": trial.suggest_categorical(
                "max_features", ["sqrt", "log2"]
            ),
            "class_weight": trial.suggest_categorical(
                "class_weight", ["balanced", "balanced_subsample", None]
            ),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        }
        return self._cv_score(
            "rf", RandomForestClassifier(**rf_params, random_state=42)
        )

    def objective_catboost(self, trial):
        """Optuna objective for ``CatBoostClassifier``."""
        catboost_params = {
            "iterations": trial.suggest_int("iterations", 100, 500),
            "depth": trial.suggest_int("depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
            "random_strength": trial.suggest_float("random_strength", 1, 10),
            "border_count": trial.suggest_int("border_count", 32, 254),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
        }
        return self._cv_score(
            "catboost",
            CatBoostClassifier(**catboost_params, silent=True, random_state=42),
        )

    def run_optimization(self):
        """Tune every selected model that has a registered objective.

        Names in ``self.models`` without a registered objective are skipped
        with a warning instead of being ignored silently.
        """
        logging.info("Starting optimization for selected models: %s", self.models)

        if not isinstance(self.models, str):
            unknown = [name for name in self.models if name not in self._OBJECTIVES]
            if unknown:
                logging.warning("Skipping unsupported models: %s", unknown)

        # Optimize only selected models
        for name, method_name in self._OBJECTIVES.items():
            if name in self.models:
                self.optimize_model(name, getattr(self, method_name))

        logging.info("Model optimizations completed for: %s", self.models)

    def save_best_params(self, filename="best_model_params.json"):
        """Write ``best_params`` to ``filename`` as indented JSON."""
        logging.info("Saving best parameters to %s", filename)
        with open(filename, "w") as file:
            json.dump(self.best_params, file, indent=4)
        logging.info("Best parameters saved")

    def get_voting_classifier(self):
        """Return an unfitted soft-voting ensemble of all four model types.

        Models that were tuned use their best parameters; models that were
        not tuned are included with their library defaults.
        """
        logging.info("Creating VotingClassifier with best models")
        best_lgbm = LGBMClassifier(
            **self.best_params.get("LGBMClassifier", {}), random_state=42, verbose=-1
        )
        best_xgb = XGBClassifier(
            **self.best_params.get("XGBClassifier", {}), random_state=42
        )
        best_rf = RandomForestClassifier(
            **self.best_params.get("RandomForestClassifier", {}), random_state=42
        )
        best_catboost = CatBoostClassifier(
            **self.best_params.get("CatBoostClassifier", {}), random_state=42, silent=True
        )

        voting_clf = VotingClassifier(
            estimators=[
                ("xgb", best_xgb),
                ("lgbm", best_lgbm),
                ("rf", best_rf),
                ("catboost", best_catboost),
            ],
            voting="soft",
        )

        return voting_clf


# # Example usage:
# train, y = ...  # Your train data and target variable
# selected_models = ["LGBMClassifier", "XGBClassifier"]  # List of models to optimize
# optimizer = ModelOptimizer(train, y, models=selected_models, cv=5, n_trials=100, scoring="roc_auc")
# optimizer.run_optimization()
# optimizer.save_best_params()

# # Voting Classifier
# voting_clf = optimizer.get_voting_classifier()

# # Cross-validation
# pipeline = Pipeline(steps=[("classifier", voting_clf)])
# scores = cross_val_score(pipeline, train, y, cv=5, scoring="roc_auc")
# print(f"Ensemble Model AUC: {scores.mean()} ± {scores.std()}")


In [None]:
# import catboost
# from catboost import CatBoostClassifier
# from sklearn.model_selection import KFold
# import numpy as np

# # Assuming you have your features X and target y
# n_splits = 5
# kf = KFold(n_splits=n_splits)
# oof_predictions = np.zeros(X.shape[0])

# for train_index, valid_index in kf.split(X):
#     X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#     y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

#     model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6)
#     model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=0)

#     # Store the out-of-fold predictions
#     oof_predictions[valid_index] = model.predict(X_valid)

# # Now oof_predictions can be used as a feature in another model

In [None]:
# import optuna
# import lightgbm as lgb


# def objective(trial):
#     param = {
#         "objective": "regression",
#         "metric": "rmse",
#         "verbosity": -1,
#         "boosting_type": "gbdt",
#         "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
#         "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
#         "num_leaves": trial.suggest_int("num_leaves", 2, 256),
#         "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.1),
#     }

#     dtrain = lgb.Dataset(X_train, label=y_train)
#     gbm = lgb.train(param, dtrain)
#     preds = gbm.predict(X_test)
#     rmse = mean_squared_error(y_test, preds, squared=False)
#     return rmse


# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=100)
# print(study.best_trial)

In [None]:
# TODO: add automated ("auto gen") hyperparameter optimization, test all of the code above, and define an approach for out-of-fold (OOF) predictions.

# Submission


In [None]:
# submission = pd.DataFrame({'crm cd': pred})
# submission.to_csv('submission.csv', index=False)
# submission.head()