***IMPORTS***

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def show(*args):
    """Render ``args`` with IPython's ``display`` if IPython can be imported.

    When importing ``IPython.display`` raises ``ImportError`` the arguments are
    written with the built-in ``print`` instead.

    Args:
        *args: Objects to show; forwarded unchanged to ``display`` or ``print``.
    """
    try:
        # Imported lazily so the module still works where IPython is absent.
        from IPython.display import display as render

        render(*args)
    except ImportError:
        print(*args)

***LOADING DATA***

In [None]:
# Read the competition CSVs from the local ./data folder:
# `bank_data` comes from train.csv and `bank_query` from test.csv.
try:
    bank_data = pd.read_csv("./data/train.csv")
    bank_query = pd.read_csv("./data/test.csv")
except FileNotFoundError as err:
    # Raise the specific exception type (still caught by `except Exception`)
    # and chain the original error so the traceback names the missing file.
    raise FileNotFoundError(
        "Kaggle data files not found. Please download the data from https://www.kaggle.com/competitions/playground-series-s4e1/data and place them in the data folder."
    ) from err

***EDA***

In [None]:
# --- DESCRIPTIONS ---
show(bank_data.head())
show(bank_data.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in show() would additionally display a stray "None".
bank_data.info()

show(bank_data.dtypes)

# --- MISSING VALUES ---
show(bank_data.isnull().sum())

# --- PER-COLUMN DUPLICATES ---
# For every column, count the entries that repeat a value seen earlier in that
# same column (the first occurrence is not counted).
for col in bank_data.columns:
    print(f"{col}: {bank_data[col].duplicated().sum()}")


In [None]:
# --- HISTOGRAMS ---
# One histogram per column of bank_data, arranged on a grid that is
# n_hist_columns plots wide.
n_columns = len(bank_data.columns)
n_hist_columns = 3
n_rows = -(-n_columns // n_hist_columns)  # Ceiling division to calculate number of rows

# squeeze=False keeps `axes` a 2-D array for every grid shape, so the
# [row][column] indexing below needs no special case for a single row/column.
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_hist_columns,
    figsize=(15, 5 * n_rows),
    squeeze=False,
)

for i, col in enumerate(bank_data.columns):
    ax = axes[i // n_hist_columns][i % n_hist_columns]
    sns.histplot(bank_data[col], ax=ax)
    ax.set_title(col)

# The grid may hold more cells than there are columns; hide the leftovers
# instead of rendering empty axes.
for unused_ax in axes.ravel()[n_columns:]:
    unused_ax.set_visible(False)

# Keep titles and tick labels of neighbouring plots from overlapping.
fig.tight_layout()


In [None]:
# --- CORRELATION MATRIX ---
# Annotated heatmap of Spearman correlations, computed on bank_data without
# the "Surname" column after passing it through pd.get_dummies.
plt.figure(figsize=(15, 15))
encoded_features = pd.get_dummies(bank_data.drop(columns=["Surname"]))
spearman_corr = encoded_features.corr(method="spearman")
# Colour scale fixed to [-1, 1] and centred on 0.
sns.heatmap(spearman_corr, annot=True, vmin=-1, vmax=1, center=0)