# _Kaggle Competition Classification

## Importing Basic Libraries

In [None]:
from sklearn.metrics import (confusion_matrix, accuracy_score,
                             precision_score, f1_score, recall_score,
                             roc_auc_score, roc_curve, auc
                             )
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import VALID_METRICS

from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              BaggingClassifier, RandomForestClassifier,
                              GradientBoostingClassifier,
                              HistGradientBoostingClassifier,
                              StackingClassifier
                              )
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier

## Testing algorithms

In [None]:
def select_model(dataframe, test_size=.3, target_feature=target_feature, model = None):
    """
    Fits a fixed panel of classifiers on one random train/test split and tabulates their scores.

    Parameters:
    - dataframe (pd.DataFrame): Feature columns plus the target column.
    - test_size (float): Fraction of rows held out by `train_test_split`.
    - target_feature (str): Name of the target column. The default is whatever the
      global `target_feature` held when this function was defined.
    - model: Not used by this function; kept so existing calls that pass it still work.

    Returns:
    - pd.DataFrame: One row per algorithm with accuracy (`train_score`, `test_score`)
      and ROC AUC (`train_metrics`, `test_metrics`). Both ROC AUC cells are NaN when
      either of them cannot be computed.
    """
    import os

    # Portable equivalent of the `!mkdir -p output/data` shell escape.
    os.makedirs('output/data', exist_ok=True)
    algos = [
            DecisionTreeClassifier(), ExtraTreeClassifier(),
            AdaBoostClassifier(), BaggingClassifier(),
            ExtraTreesClassifier(), RandomForestClassifier(),
            GradientBoostingClassifier(),
            HistGradientBoostingClassifier(),
            LGBMClassifier(), XGBClassifier(), XGBRFClassifier(),
            CatBoostClassifier()
            ]
    names = [algo.__class__.__name__ for algo in algos]

    history = { 'algo_name': names,
                'train_score' : [],
                'test_score' : [],
                'train_metrics': [],
                'test_metrics': [],
                }
    X_train, X_test, y_train, y_test = train_test_split(dataframe.drop(target_feature, axis=1), dataframe[target_feature], test_size=test_size)
    # The loop variable is deliberately not called `model`, so the parameter is not shadowed.
    for algo in algos:
        print(f"{algo.__class__.__name__} has started!")
        algo.fit(X_train, y_train)
        pred_train = algo.predict(X_train)
        pred_test = algo.predict(X_test)
        history['train_score'].append(accuracy_score(y_train, pred_train))
        history['test_score'].append(accuracy_score(y_test, pred_test))
        # Compute both AUC values before appending anything: appending the train value
        # first and then failing on the test value would leave the lists with different
        # lengths and make the final DataFrame construction fail.
        try:
            train_auc = roc_auc_score(y_train, algo.predict_proba(X_train)[:, 1])
            test_auc = roc_auc_score(y_test, algo.predict_proba(X_test)[:, 1])
        except Exception as e:
            # Best effort: a model without usable probabilities still gets a row.
            print(f"ROC AUC unavailable for {algo.__class__.__name__}: {e}")
            train_auc = test_auc = np.nan
        history['train_metrics'].append(train_auc)
        history['test_metrics'].append(test_auc)
    return pd.DataFrame(history)

In [None]:
def get_score(model):
    """Return `(train_score, test_score)` from `model.score` on the global train/test split."""
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return train_score, test_score

In [None]:
# Optuna objective function for hyperparameter tuning of LGBM
def objective(trial):
    """
    Samples LightGBM hyperparameters, fits a classifier on the global training split
    and scores it with `get_score`.

    Parameters:
    - trial (optuna.Trial): Trial used to sample the hyperparameters.

    Returns:
    - tuple: `(test_score, test_score - train_score)`, i.e. the held-out score and the
      train/test gap, matching the two `maximize` directions of the study.
    """
    fraction_choices = [.4, .5, .67, .75, .8, .9, .95, 1.0]
    params = {
        'boosting_type'    : trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves'        : trial.suggest_int('num_leaves', 10, 100),
        'max_depth'         : trial.suggest_categorical('max_depth', [-1,3, 4, 5]),
        'learning_rate'     : trial.suggest_float('learning_rate', .001, 2, log=True),

        'min_split_gain'    : trial.suggest_float('min_split_gain', 0, 1),
        'min_child_weight'  : trial.suggest_float('min_child_weight', 0, 1),
        'min_child_samples' : trial.suggest_int('min_child_samples', 20, 100),

        'subsample'         : trial.suggest_categorical('subsample', fraction_choices),
        'colsample_bytree'  : trial.suggest_categorical('colsample_bytree', fraction_choices),

        'reg_alpha'         : trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda'        : trial.suggest_float('reg_lambda', 0, 1),
        }

    # LightGBM only performs row subsampling when `subsample_freq` is non-zero; with the
    # default of 0 the sampled `subsample` value would have no effect on the model.
    model = LGBMClassifier(**params, subsample_freq=1, verbose=-1,random_state=6547)
    model.fit(X_train, y_train)
    train_score, test_score = get_score(model)
    return test_score, test_score-train_score


# Store the study in a SQLite file so that re-running the cell resumes it
# (`load_if_exists=True`) instead of starting over.
# NOTE(review): presumably `output/logs` must already exist (it is created by the
# `mkdir -p` setup cell) — confirm before running this on a fresh runtime.
study_name = 'LGBM_study'
storage = optuna.storages.RDBStorage(f"sqlite:///output/logs/{study_name}.db")
# Two objectives, both maximised: they correspond to the two values returned by `objective`.
study = optuna.create_study(directions= ['maximize', 'maximize' ],storage = storage, study_name = study_name, load_if_exists=True)
study.optimize(objective, n_trials = 200)

In [None]:
def objective(trial):
    """
    Optuna objective for `XGBClassifier`.

    Samples a hyperparameter set, fits on the global training split and scores the
    model with `get_score`.

    Parameters:
    - trial (optuna.Trial): Trial used to sample the hyperparameters.

    Returns:
    - tuple: `(test_score, test_score - train_score)`, or `(-inf, -inf)` when fitting
      or scoring raises.
    """
    pick = trial.suggest_categorical
    fraction_choices = [ .4, .5, .67, .75, .8, .9, 1.0]

    # Keyword arguments are evaluated top to bottom, so the sampling order is fixed.
    params = dict(
        max_depth=pick('max_depth', [3, 4, 5, 6, None]),
        max_leaves=trial.suggest_int('max_leaves', 0, 100),
        grow_policy=pick('grow_policy', ['depthwise', 'lossguide', None]),
        learning_rate=trial.suggest_float('learning_rate', .001, 2, log=True),
        tree_method=pick('tree_method', ['exact', 'approx', 'hist', None]),
        gamma=pick('gamma', [0, 1e-4, 1e-3, 1e-1, None]),
        subsample=pick('subsample', fraction_choices),
        colsample_bytree=pick('colsample_bytree', fraction_choices),
        colsample_bylevel=pick('colsample_bylevel', fraction_choices),
        colsample_bynode=pick('colsample_bynode', fraction_choices),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 1),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 1),
        importance_type=pick('importance_type', ['gain', 'weight', 'cover', 'total_gain', 'total_cover', None]),
    )

    classifier = XGBClassifier(**params)

    try:
        classifier.fit(X_train, y_train)
        train_score, test_score = get_score(classifier)
        gap = test_score - train_score
    except Exception:
        # A failed fit is reported as the worst possible value for both objectives.
        return -np.inf, -np.inf
    return test_score, gap


In [None]:
def objective(trial):
    """
    Optuna objective for `CatBoostClassifier` (100 iterations, fixed seed).

    Samples a hyperparameter set, fits on the global training split and scores the
    model with `get_score`.

    Parameters:
    - trial (optuna.Trial): Trial used to sample the hyperparameters.

    Returns:
    - tuple: `(test_score, test_score - train_score)`, or `(-inf, -inf)` when fitting
      or scoring raises.
    """
    sample_float = trial.suggest_float
    sample_int = trial.suggest_int

    # Keyword arguments are evaluated top to bottom, so the sampling order is fixed.
    params = dict(
        learning_rate=sample_float('learning_rate', .001, 2, log=True),
        objective=trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        depth=sample_int('depth', 3, 10),
        reg_lambda=sample_float('reg_lambda', 0, 10),
        subsample=sample_float('subsample', 0, 1),
        colsample_bylevel=sample_float('colsample_bylevel', 0, 1),
        min_child_samples=sample_int('min_child_samples', 10, 100),
        leaf_estimation_iterations=sample_int('leaf_estimation_iterations', 5, 20),
    )

    classifier = CatBoostClassifier(**params,iterations=100,verbose=0,random_state = 84987)

    try:
        classifier.fit(X_train, y_train)
        train_score, test_score = get_score(classifier)
        gap = test_score - train_score
    except Exception:
        # A failed fit is reported as the worst possible value for both objectives.
        return -np.inf, -np.inf
    return test_score, gap

In [None]:
params = [
    {'learning_rate': 0.19068478145864712, 'objective': 'Logloss', 'depth': 9, 'reg_lambda': 1.3927531091623635, 'subsample': 0.6651469314640843, 'colsample_bylevel': 0.9706601048429702, 'min_child_samples': 50, 'leaf_estimation_iterations': 12},
    {'learning_rate': 0.174982304220691, 'objective': 'Logloss', 'depth': 8, 'reg_lambda': 7.512299289102047, 'subsample': 0.4877580683039361, 'colsample_bylevel': 0.6151802353705157, 'min_child_samples': 70, 'leaf_estimation_iterations': 12},
    {'learning_rate': 0.19068478145864712, 'objective': 'Logloss', 'depth': 7, 'reg_lambda': 8.947085765030208, 'subsample': 0.3934334059883876, 'colsample_bylevel': 0.8078464837389985, 'min_child_samples': 94, 'leaf_estimation_iterations': 8},
    {'learning_rate': 0.05895050294121335, 'objective': 'Logloss', 'depth': 10, 'reg_lambda': 9.058283844063986, 'subsample': 0.08931663638769871, 'colsample_bylevel': 0.9694763077811444, 'min_child_samples': 26, 'leaf_estimation_iterations': 12},
    {'learning_rate': 0.06614189893121368, 'objective': 'Logloss', 'depth': 6, 'reg_lambda': 9.042807154267283, 'subsample': 0.5314518336396845, 'colsample_bylevel': 0.6119706476986938, 'min_child_samples': 22, 'leaf_estimation_iterations': 6}
]

# Refit one CatBoost model per stored parameter set and record its scores.
# NOTE(review): `matthews_corrcoef` and `experimental_study` are not defined or imported
# in the visible part of this notebook — confirm they exist before this cell runs.
for i, p in enumerate(params):
    name = f'CatBoost_{i}'
    model = CatBoostClassifier(**p,iterations=100,verbose=0,random_state = 84987)
    model.fit(X_train,y_train)
    # a = score on the train split, b = score on the test split (see `get_score`).
    a, b = get_score(model)
    # Matthews correlation coefficient over the whole `train` frame.
    # NOTE(review): presumably `train` contains the rows the model was fitted on — confirm.
    c = matthews_corrcoef(train['class'], model.predict(train.drop('class',axis=1)))
    experimental_study['name'].append(name)
    experimental_study['train_score'].append(a)
    experimental_study['test_score'].append(b)
    experimental_study['score'].append(c)
    experimental_study['params'].append(p)

# _Kaggle Competition Regression

In [None]:
from sklearn.metrics import (root_mean_squared_log_error as RMLE,
                             r2_score)
# 'neg_root_mean_squared_log_error'
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import VALID_METRICS

from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              BaggingRegressor, RandomForestRegressor,
                              GradientBoostingRegressor,
                              HistGradientBoostingRegressor,
                              StackingRegressor
                              )
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor

In [None]:
def select_model(dataframe, test_size=.3, target_feature=target_feature, model = None):
    """
    Fits a fixed panel of regressors on one random train/test split and tabulates their scores.

    Parameters:
    - dataframe (pd.DataFrame): Feature columns plus the target column.
    - test_size (float): Fraction of rows held out by `train_test_split`.
    - target_feature (str): Name of the target column. The default is whatever the
      global `target_feature` held when this function was defined.
    - model: Not used by this function; kept so existing calls that pass it still work.

    Returns:
    - pd.DataFrame: One row per algorithm with R^2 (`train_score`, `test_score`) and
      root mean squared log error (`train_metrics`, `test_metrics`). Both RMSLE cells
      are NaN when either of them cannot be computed.
    """
    import os

    # Portable equivalent of the `!mkdir -p output/data` shell escape.
    os.makedirs('output/data', exist_ok=True)
    algos = [
            DecisionTreeRegressor(), ExtraTreeRegressor(),
            AdaBoostRegressor(), BaggingRegressor(),
            ExtraTreesRegressor(), RandomForestRegressor(),
            GradientBoostingRegressor(),
            HistGradientBoostingRegressor(),
            LGBMRegressor(verbose=-1), XGBRegressor(), XGBRFRegressor(),
            CatBoostRegressor(verbose=0)
            ]
    names = [algo.__class__.__name__ for algo in algos]

    history = { 'algo_name': names,
                'train_score' : [],
                'test_score' : [],
                'train_metrics': [],
                'test_metrics': [],
                }
    X_train, X_test, y_train, y_test = train_test_split(dataframe.drop(target_feature, axis=1), dataframe[target_feature], test_size=test_size)
    # The loop variable is deliberately not called `model`, so the parameter is not shadowed.
    for algo in algos:
        print(f"{algo.__class__.__name__} has started!")
        algo.fit(X_train, y_train)
        pred_train = algo.predict(X_train)
        pred_test = algo.predict(X_test)
        history['train_score'].append(r2_score(y_train, pred_train))
        history['test_score'].append(r2_score(y_test, pred_test))
        # Compute both errors before appending anything: appending the train value first
        # and then failing on the test value would leave the lists with different lengths
        # and make the final DataFrame construction fail.
        try:
            train_error = RMLE(y_train, pred_train)
            test_error = RMLE(y_test, pred_test)
        except Exception:
            # Best effort: a model whose predictions the metric rejects still gets a row.
            train_error = test_error = np.nan
        history['train_metrics'].append(train_error)
        history['test_metrics'].append(test_error)
    clear_output()
    return pd.DataFrame(history)

# _Common functions

In [None]:
# @title Kaggle Competition Data Downloading  { display-mode: "form",run :"auto" }
# @markdown Put the competition name
kaggle_username = 'Kaggle_Username'     # @param ['Kaggle_Username', 'kaggle_username', 'kaggle_2']
kaggle_token = "Kaggle"     # @param ["Kaggle", 'kaggle', 'kaggle_2_pass']

project_name = 'playground-series-s4e5'  # @param {type: "string"}


# `kaggle_username` / `kaggle_token` are the names of Colab secrets; their values are
# fetched with `userdata.get` and written to kaggle.json below.
from google.colab import userdata
from IPython import display
import os
token = {"username":userdata.get(kaggle_username),"key":userdata.get(kaggle_token)}
import os, json
# Point KAGGLE_CONFIG_DIR at the current directory, where kaggle.json is written next.
os.environ['KAGGLE_CONFIG_DIR']='.'
with open('kaggle.json', "w") as f:
    json.dump(token, f)
# Restrict the credentials file to owner read/write.
!chmod 600 ./kaggle.json
!kaggle competitions download -c $project_name
filename = project_name + ".zip"
# Extract the downloaded archive, then delete it.
!unzip $project_name && rm $filename
# Expose the MLflow credentials (also Colab secrets) through environment variables.
os.environ['MLFLOW_TRACKING_PASSWORD'] = userdata.get('MLFLOW_TRACKING_PASSWORD')
os.environ['MLFLOW_TRACKING_USERNAME'] = userdata.get('MLFLOW_TRACKING_USERNAME')
# Clear the cell output produced by the shell commands above.
display.clear_output()
print("Files have been downloaded!")

In [None]:
# Download an additional Kaggle dataset, extract it and remove the archive.
!kaggle datasets download brijlaldhankour/flood-prediction-factors && unzip *.zip && rm -d *.zip

In [None]:
# Install the third-party packages used by the notebook.
!pip install optuna-dashboard optuna
!pip install mlflow dagshub
!pip install catboost
# Create the output folders (models, logs, plots).
!mkdir -p output/models  output/logs output/plots
!pip install --upgrade gdown
# `display` here is the `IPython.display` module imported in the Kaggle download cell.
display.clear_output()
import dagshub
import mlflow
import optuna

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import anderson,kstest, shapiro
from IPython.display import Audio,display as dis
from IPython.display import clear_output

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge

In [None]:
# Classification
from sklearn.metrics import (confusion_matrix, accuracy_score,
                             precision_score, f1_score, recall_score,
                             roc_auc_score, roc_curve, auc
                             )
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import VALID_METRICS

from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              BaggingClassifier, RandomForestClassifier,
                              GradientBoostingClassifier,
                              HistGradientBoostingClassifier,
                              StackingClassifier
                              )
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier

# Regression Models
from sklearn.metrics import (root_mean_squared_log_error as RMLE,
                             r2_score)
# 'neg_root_mean_squared_log_error'
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import VALID_METRICS

from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              BaggingRegressor, RandomForestRegressor,
                              GradientBoostingRegressor,
                              HistGradientBoostingRegressor,
                              StackingRegressor
                              )
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor

In [None]:
# plotly libraries setup
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.io as pio
# Use the Colab renderer for plotly figures.
pio.renderers.default = "colab"
# Show every column and the full cell contents when displaying DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# import plotly.offline as pyo
# pyo.init_notebook_mode()

In [None]:
dagshub.init("Kaggle-Competitions-Lab", "SantanuK", mlflow=True)
# Fill in the experiment name before running this cell.
experiment_name = ''
# `mlflow.set_experiment` activates the named experiment and creates it first when it
# does not exist yet, so no separate create call (or blanket `except` hiding real
# errors such as bad credentials) is needed.
mlflow.set_experiment(experiment_name)

In [None]:
# Load the competition files; the `id` column is dropped from train and test.
df_train = pd.read_csv('train.csv').drop('id',axis=1)
df_test = pd.read_csv('test.csv').drop('id',axis=1)
# Extra data set, loaded as-is (no column is dropped here).
df_extra = pd.read_csv('original.csv')

In [None]:
# Print a size/quality summary of the train and test frames.
# How the figures are computed:
#   - "null samples" is the total number of null cells (`isna().sum().sum()`); the
#     percentage divides that cell count by the number of rows.
#   - "unique samples" is the sum of per-column distinct value counts (`nunique().sum()`).
print(f"""The number of features: {df_test.shape[1]}
-----------------
Training DataSet
-----------------
The number of samples:              {df_train.shape[0]}
The number of duplicated samples:   {df_train.duplicated().sum()}
The number of null samples:         {df_train.isna().sum().sum()} ({round(df_train.isna().sum().sum()/df_train.shape[0]*100,2)}%)
The number of unique samples:       {df_train.nunique().sum()}

-----------------
Testing DataSet
-----------------
The number of samples:              {df_test.shape[0]}
The number of duplicated samples:   {df_test.duplicated().sum()}
The number of null samples:         {df_test.isna().sum().sum()}({round(df_test.isna().sum().sum()/df_test.shape[0]*100,2)}%)
The number of unique samples:       {df_test.nunique().sum()}

""")

In [None]:
# Column groups used by the plotting and statistics cells; fill these in per data set.
num_features =[]
cat_features =[]
# Name of the target column. `select_model` captures this value as a default argument
# at definition time, so it must be set before that cell is run.
target_feature = 'FloodProbability'

In [None]:
# Default column set for `visualization`: every column of the test frame.
features = df_test.columns


def visualization(dataframe, features=features):
    """
    Builds a per-column descriptive summary including IQR-based outlier counts.

    Parameters:
    - dataframe (pd.DataFrame): Frame to summarise.
    - features (iterable): Column names to include. The default is the value the
      global `features` held when this function was defined.

    Returns:
    - pd.DataFrame: One row per column with count, distinct count, max, min, mean, std,
      the 5/25/50/75/95th percentiles, the IQR, the outlier fences
      (Q1 - 1.5*IQR and Q3 + 1.5*IQR) and the number and percentage of rows outside them.
    """
    summary_columns = [
        'Columns', 'Count', 'Unique count', 'Max', 'Min', 'Mean', 'Std',
        '5%', '25%', '50%', '75%', '95%',
        'IQR', 'Low Bound', 'High Bound',
        '< Low Bound', '> High Bound', 'Outliers', '% Outliers',
    ]
    total_rows = len(dataframe)
    rows = []
    for col in features:
        series = dataframe[col]
        # nanpercentile ignores nulls, in line with Count/Mean/Std above which also skip them.
        p5, q1, median, q3, p95 = np.nanpercentile(series, [5, 25, 50, 75, 95])
        iqr = q3 - q1
        # Fences are anchored on the quartiles (not the median), the same rule used by
        # `calculate_outliers_percentage`.
        low_bound = q1 - 1.5 * iqr
        high_bound = q3 + 1.5 * iqr
        below = int((series < low_bound).sum())
        above = int((series > high_bound).sum())
        rows.append({
            'Columns': col,
            'Count': series.notnull().sum(),
            'Unique count': series.nunique(),
            'Max': series.max(),
            'Min': series.min(),
            'Mean': series.mean(),
            'Std': series.std(),
            '5%': p5,
            '25%': q1,
            '50%': median,
            '75%': q3,
            '95%': p95,
            'IQR': iqr,
            'Low Bound': low_bound,
            'High Bound': high_bound,
            '< Low Bound': below,
            '> High Bound': above,
            'Outliers': below + above,
            '% Outliers': (below + above) / total_rows * 100.0,
        })

    return pd.DataFrame(rows, columns=summary_columns)

def missing_values_table(dataframe):
    """
    Summarises null share and distinct values for every column.

    Parameters:
    - dataframe (pd.DataFrame): Frame to inspect.

    Returns:
    - pd.DataFrame: Columns `feature`, `percentage-of-null`, `unique` (distinct non-null
      count) and `unique-vals` (comma-joined, string-sorted distinct values), ordered by
      descending null percentage.
    """
    null_share = dataframe.isna().sum() / len(dataframe) * 100
    table = (
        null_share.rename_axis('feature')
        .reset_index(name='percentage-of-null')
        .sort_values(by='percentage-of-null', ascending=False)
        .reset_index(drop=True)
    )

    distinct_counts = []
    distinct_listings = []
    for name in table['feature']:
        column = dataframe[name]
        distinct_counts.append(column.nunique())
        distinct_listings.append(",".join(sorted(str(value) for value in column.unique())))

    table['unique'] = distinct_counts
    table['unique-vals'] = distinct_listings
    return table

# Data Visualization and Analysis
# Numerical and Categorical visualization
def visualize_categorical_distributions(df):
    """
    Visualizes the distribution of categorical features in the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the categorical features to visualize.

    Returns:
    - None: Displays the plots. When the frame has no object-typed columns a short
      message is printed and nothing is drawn.
    """
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    if categorical_columns.empty:
        # A zero-row subplot grid cannot be created, so stop early.
        print("No categorical columns to plot.")
        return

    # Set up the figure for multiple subplots
    num_cols = 3  # Number of columns for the subplot grid
    num_rows = (len(categorical_columns) + num_cols - 1) // num_cols  # Ceiling division

    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5), squeeze=False)
    fig.suptitle('Distribution of Categorical Features', fontsize=16)

    # Flatten axes array for easy iteration
    axes = axes.flatten()

    # One bar plot per categorical column
    for ax, col in zip(axes, categorical_columns):
        counts = df[col].value_counts()  # computed once, used for both axes
        sns.barplot(
            x=counts.index,
            y=counts.values,
            ax=ax,
            palette="viridis"
        )
        ax.set_title(f'Distribution of {col}', fontsize=14)
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.tick_params(axis='x')

    # Remove the grid cells that received no plot
    for unused_ax in axes[len(categorical_columns):]:
        fig.delaxes(unused_ax)

    # Leave room for the main title
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

def visualize_numerical_distributions(df, exclude_columns='id'):
    """
    Visualizes the distribution of numerical features in the DataFrame using histograms and KDE plots,
    excluding specified columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the numerical features to visualize.
    - exclude_columns (str | list | None): Column name, or list of column names, to
      exclude from visualization. `None` excludes nothing.

    Returns:
    - None: Displays the plots. When no numerical column remains a short message is
      printed and nothing is drawn.
    """
    # Normalise exclude_columns to a list
    if exclude_columns is None:
        exclude_columns = []
    elif isinstance(exclude_columns, str):
        # `col not in "id"` would be a substring test (dropping columns named "i" or
        # "d" as well), so wrap a single name in a list.
        exclude_columns = [exclude_columns]

    # Identify numerical columns and exclude specified ones
    numerical_columns = [
        col for col in df.select_dtypes(include=['number']).columns
        if col not in exclude_columns
    ]
    if not numerical_columns:
        # A zero-row subplot grid cannot be created, so stop early.
        print("No numerical columns to plot.")
        return

    # Set up the figure for multiple subplots
    num_cols = 3  # Number of columns for the subplot grid
    num_rows = (len(numerical_columns) + num_cols - 1) // num_cols  # Ceiling division

    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 5 * num_rows), squeeze=False)
    fig.suptitle('Distribution of Numerical Features', fontsize=16)

    # Flatten axes array for easy iteration
    axes = axes.flatten()

    # One histogram with KDE per numerical column
    for ax, col in zip(axes, numerical_columns):
        sns.histplot(df[col], kde=True, ax=ax, color="skyblue", element="step", stat="density")
        ax.set_title(f'Distribution of {col}', fontsize=14)
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel('Density', fontsize=12)

    # Remove the grid cells that received no plot
    for unused_ax in axes[len(numerical_columns):]:
        fig.delaxes(unused_ax)

    # Leave room for the main title
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


In [None]:
# Optional
def preprocess_categorical_data(df, df_t):
    """
    Cleans categorical columns and imputes missing values in both frames, using
    statistics taken from the training frame only.

    Categorical (object) values that are not a single alphabetical character are turned
    into NaN; categorical gaps are then filled with the training mode and numeric gaps
    with the training median. The columns processed are those of `df_t`. Both frames are
    modified in place and also returned.

    Parameters:
    - df (pd.DataFrame): The training DataFrame.
    - df_t (pd.DataFrame): The test DataFrame.

    Returns:
    - tuple: `(df, df_t)`, the processed training and test DataFrames.
    """
    def replace_non_alphabetical(value):
        """Keeps single alphabetical characters; everything else becomes NaN."""
        if isinstance(value, str) and (len(value) == 1 and value.isalpha()):
            return value
        return np.nan

    # Identify categorical and numeric columns
    categorical_columns = df_t.select_dtypes(include=['object']).columns
    numeric_columns = df_t.select_dtypes(include=[np.number]).columns

    for col in categorical_columns:
        # Replace non-alphabetical values with NaN
        df[col] = df[col].apply(replace_non_alphabetical)
        df_t[col] = df_t[col].apply(replace_non_alphabetical)

        # Impute with the training mode. The test column is always filled, because the
        # cleaning step can introduce NaN there even when the training column has none.
        modes = df[col].mode()
        mode_value = modes.iloc[0] if not modes.empty else 'Unknown'
        # Assign the result back: `fillna(inplace=True)` on a column selection may act
        # on a temporary copy and leave the frame unchanged.
        df[col] = df[col].fillna(mode_value)
        df_t[col] = df_t[col].fillna(mode_value)

    # Impute missing values in numeric columns using the training median
    for col in numeric_columns:
        median_value = df[col].median()
        df[col] = df[col].fillna(median_value)
        df_t[col] = df_t[col].fillna(median_value)

    return df, df_t


# Apply the functions to train and test data
# `df_train` and `df_test` are passed directly, so `train` / `test` refer to the frames
# returned by `preprocess_categorical_data`.
print("Imputing noise with mode for categoricals and median for numericals...")
train, test = preprocess_categorical_data(df_train, df_test)


# Print summary of missing values after processing
print("\nMissing values in train_data after processing: ", train.isnull().sum().sum())
print("\nMissing values in test_data after processing: ", test.isnull().sum().sum())


In [None]:
def categorize_low_frequency_values(df_train, df_test, threshold=100):
    """
    Folds rare categories into the most frequent one.

    For every object-typed column of the training frame, each value seen fewer than
    `threshold` times in training is replaced by that column's training mode, in both
    the training and the test frame.

    Parameters:
    - df_train (pd.DataFrame): The training DataFrame.
    - df_test (pd.DataFrame): The test DataFrame.
    - threshold (int): Training frequency below which a category is replaced.

    Returns:
    - df_train (pd.DataFrame): Modified training DataFrame.
    - df_test (pd.DataFrame): Modified test DataFrame.
    - mapping_dict (dict): Per column, a `{replaced value: mode}` dictionary; columns
      without rare values are not listed.
    """
    mapping_dict = {}

    for col in df_train.select_dtypes(include=['object']).columns:
        frequencies = df_train[col].value_counts()
        rare_values = frequencies[frequencies < threshold].index

        # The mode is looked up for every column, before checking whether it is needed.
        most_common = df_train[col].mode()[0]

        if len(rare_values) == 0:
            continue

        # Record which values were folded into the mode
        mapping_dict[col] = dict.fromkeys(rare_values, most_common)

        # Apply the same replacement to training and test data
        df_train[col] = df_train[col].replace(rare_values, most_common)
        df_test[col] = df_test[col].replace(rare_values, most_common)

    return df_train, df_test, mapping_dict

# Fold rare categories (default threshold of 100 occurrences) in both frames and keep
# the per-column replacement mappings.
train, test, mappings = categorize_low_frequency_values(train, test)

In [None]:
from scipy.stats import boxcox
def handle_skewness(df, df_t, threshold=1.0):
    """
    Applies a Box-Cox transformation to the numerical columns whose skewness in the
    training frame exceeds a threshold.

    The Box-Cox lambda is fitted on the training column and reused for the test column.
    Columns that exist only in the training frame (for example the target) are
    transformed there and skipped for the test frame. Both frames are modified in place.

    Parameters:
    - df (pd.DataFrame): The training DataFrame.
    - df_t (pd.DataFrame): The test DataFrame.
    - threshold (float): Skewness threshold to decide which columns to transform.

    Returns:
    - tuple: `(df, df_t)`, the transformed training and test DataFrames.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns

    for col in numeric_cols:
        if not df[col].skew() > threshold:
            continue

        # Box-Cox needs strictly positive input; the +1 shift makes zeros valid
        # (values of -1 or below are still rejected when the lambda is fitted).
        transformed, fitted_lambda = boxcox((df[col] + 1).to_numpy())
        df[col] = transformed

        # Only transform the test column when it exists there.
        if col in df_t.columns:
            df_t[col] = boxcox((df_t[col] + 1).to_numpy(), lmbda=fitted_lambda)

    return df, df_t

# Example usage:
# `train` / `test` are the frames produced by the previous preprocessing cells;
# `handle_skewness` is called with its default skewness threshold of 1.0.
train, test = handle_skewness(train, test)


In [None]:
def visualize_box_plots(df):
    """
    Visualizes the distribution of numerical features in the DataFrame using box plots to identify outliers.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the numerical features to visualize.

    Returns:
    - None: Displays the box plots. When the frame has no numerical columns a short
      message is printed and nothing is drawn.
    """
    # Identify numerical columns
    numerical_columns = df.select_dtypes(include=['number']).columns
    if numerical_columns.empty:
        # A zero-row subplot grid cannot be created, so stop early.
        print("No numerical columns to plot.")
        return

    # Set up the figure for multiple subplots
    num_cols = 3  # Number of columns for the subplot grid
    num_rows = (len(numerical_columns) + num_cols - 1) // num_cols  # Ceiling division

    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 5 * num_rows), squeeze=False)
    fig.suptitle('Box Plot of Numerical Features', fontsize=16)

    # Flatten axes array for easy iteration
    axes = axes.flatten()

    # One box plot per numerical column
    for ax, col in zip(axes, numerical_columns):
        sns.boxplot(x=df[col], ax=ax, color="skyblue")
        ax.set_title(f'Box Plot of {col}', fontsize=14)
        ax.set_xlabel(col, fontsize=12)

    # Remove the grid cells that received no plot
    for unused_ax in axes[len(numerical_columns):]:
        fig.delaxes(unused_ax)

    # Leave room for the main title
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Draw box plots for the numerical columns of the processed training frame.
visualize_box_plots(train)

In [None]:
def calculate_outliers_percentage(df):
    """
    Reports, for each numerical column, the share of rows lying outside the IQR fences
    (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    Parameters:
    - df (pd.DataFrame): DataFrame to analyze.

    Returns:
    - None: Prints one percentage line per numerical column.
    """
    total_rows = len(df)

    # First pass: count the flagged rows for every column.
    flagged = {}
    for name in df.select_dtypes(include=['number']).columns:
        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        too_low = values < first_quartile - 1.5 * spread
        too_high = values > third_quartile + 1.5 * spread
        flagged[name] = int((too_low | too_high).sum())

    # Second pass: print only after every column has been processed.
    for name, count in flagged.items():
        share = (count / total_rows) * 100
        print(f"Percentage of outliers in {name}: {share:.2f}%")

# Example usage:
# Prints the outlier percentage of every numerical column of `train`.
calculate_outliers_percentage(train)

In [None]:
def handle_outliers(df,df_t, columns=None):
    """
    Handles outliers by capping values at the IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    The fences are computed on the training frame and applied to both frames, which are
    modified in place. Columns missing from the test frame (for example the target) are
    capped in the training frame only.

    Parameters:
    - df (pd.DataFrame): The training DataFrame.
    - df_t (pd.DataFrame): The test DataFrame.
    - columns (iterable | None): Columns to cap. `None` keeps the original fixed list
      `['cap-diameter', 'stem-height', 'stem-width']`.

    Returns:
    - tuple: `(df, df_t)`, the training and test DataFrames with outliers capped.
    """
    if columns is None:
        columns = ['cap-diameter', 'stem-height', 'stem-width']

    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Capping
        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
        if column in df_t.columns:
            df_t[column] = df_t[column].clip(lower=lower_bound, upper=upper_bound)

    return df, df_t

# Apply to both training and test datasets
# The capping bounds are computed from `train` and reused for `test`.
train, test = handle_outliers(train, test)

In [None]:
# Write the processed frames to CSV files in the working directory, without the index.
train.to_csv('train_processed.csv', index=False)
test.to_csv('test_processed.csv', index=False)

In [None]:
# Log the two processed CSV files as artifacts of a new MLflow run.
with mlflow.start_run() as run:
    # Keep the artifact location and run id of this run in notebook-level variables.
    artifact_uri, run_id = run.info.artifact_uri, run.info.run_id
    mlflow.log_artifact("train_processed.csv")
    mlflow.log_artifact("test_processed.csv")

# _Plotting and Analysis (Common)

In [None]:
# One box plot per numerical feature, laid out two per row.
# max(1, ...) keeps the grid valid when `num_features` is still empty.
num_rows = max(1, (len(num_features)+1)//2)
fig = make_subplots(rows=num_rows, cols=2)
# Iterate over the features themselves: looping over `range(num_rows)` would plot only
# the first half of them.
for i, feature in enumerate(num_features):
    fig.add_trace(go.Box(y=df_train[feature], name=feature),row=i//2+1,col=i%2+1)

fig.update_layout(title_text='Box plot of Numerical columns',height=800, width=1100)
fig.show()

In [None]:
# Plot and save one correlation heatmap per data set (train, extra, test).
for i, df in enumerate([df_train, df_extra, df_test]):
    # If target_feature is categorical, you will need to change the following code
    try:
        corr = df[num_features+[target_feature,]].corr()
    except KeyError as e:
        # Fall back to the features alone when the column selection above fails
        # (for example when the frame has no target column).
        corr = df[num_features].corr()
    # Generate a mask for the upper triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    # (not passed to the heatmap below, which uses the named 'coolwarm' colormap)
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})

    plt.title('Correlation Matrix', fontsize=20)
    # Make sure the target folder exists before saving the figure.
    !mkdir -p output/images
    plt.savefig(f"output/images/Correlation matrix_{i+1}.png")
    plt.show()

In [None]:
### $\chi^2$ test for categorical features vs target feature
# $H_0$ : The categorical feature and the target feature are distributed independently.

# Run the test on the training and extra data combined.
df = pd.concat([df_train, df_extra]).reset_index(drop=True)

from scipy.stats import chi2_contingency
# One result row per categorical feature.
temp = pd.DataFrame([], columns = ['Feature', 'Test Statistic', 'P Value', 'Rejected'])
# Significance level: H_0 is marked as rejected when the p-value is below it.
threshold = 0.05
for i, col in enumerate(cat_features):
    # Contingency table of the feature against the target.
    res = chi2_contingency(pd.crosstab(df[col], df[target_feature]))
    temp.loc[i] = [col, round(res.statistic,3), round(res.pvalue,3), res.pvalue<threshold]
temp

In [None]:
"""
**ANOVA test**

$H_0$ : The population mean of all of the groups are equal.

**The Kruskal-Wallis H-test**

$H_0$ : The population median of all of the groups are equal.
"""
from scipy.stats import f_oneway, kruskal
temp = pd.DataFrame()
temp['Feature'] = num_features
# One group of numerical-feature rows per distinct target value.
groups = [
    df[df[target_feature] == value][num_features]
    for value in df[target_feature].unique()
]
# Unpack every group: indexing groups[0]..groups[6] by hand only works when the target
# has exactly seven distinct values.
F, p = f_oneway(*groups)
temp['Anova_stats'] = np.round(F,3)
temp['Anova_pvalue'] = np.round(p,3)
F, p = kruskal(*groups)
temp['Krushkal_stats'] = np.round(F,3)
temp['Krushkal_pvalue'] = np.round(p,3)
temp

# _Optimisation

In [None]:
# Collect the Pareto-optimal trials of the current study into a table.
res_ = {
    'trial_id'  : [],
    'accuracy'  : [],
    'margin'    : [],
    'params'    : [],
}
for trial in study.best_trials:
    res_['trial_id'].append(trial.number)
    res_['params'].append(trial.params)
    res_['accuracy'].append(trial.values[0])
    res_['margin'].append(trial.values[1])

result_df = pd.DataFrame(res_)
# The objectives in this notebook return (test_score, test_score - train_score), so
# `accuracy` already is the test score and the train score is accuracy - margin.
# (accuracy + margin would be 2*test - train, which is neither score.)
result_df['test_acc'] = result_df['accuracy']
result_df['train_acc'] = result_df['accuracy'] - result_df['margin']
select_cols = ['trial_id', 'accuracy', 'test_acc', 'train_acc', 'margin', 'params']
result_df = result_df[select_cols]

# Three alternative orderings; in a notebook cell only the last expression is displayed.
result_df.sort_values(['accuracy','margin'], ascending=[False, False]).reset_index(drop=True)
result_df.sort_values(['test_acc','margin'], ascending=[False, False]).reset_index(drop=True)
result_df.sort_values(['margin', 'test_acc',], ascending=[False, False]).reset_index(drop=True)

# _Saving your data

In [None]:
# Run description shown in MLflow; fill it in before logging.
description = " "
# Log the local `output` path as an artifact of a new run.
with mlflow.start_run(description= description):
    mlflow.log_artifact('output')