In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, TargetEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

from utils import feature_engineering, additional_feature_engineering
# set the aesthetic style of the plots
# NOTE(review): set_style() is called without a style name, which appears to
# leave seaborn's current style unchanged -- confirm which style was intended.
sns.set_style()

# filter warning messages
# No category is given, so every warning is silenced from this point on,
# including deprecation warnings raised by libraries in later cells.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the preprocessed bank data written by the previous pipeline stage,
# then summarise its columns (dtype and non-null count per column).
df_credit = pd.read_csv(filepath_or_buffer='saved/preprocessed_bank_data.csv')
df_credit.info()


In [None]:
# Build the working feature frame from the loaded data using the project
# helper imported from utils.
# NOTE(review): later cells index X_processed['target_default'], so the helper
# is expected to keep the target column in its output -- confirm in utils.
X_processed = feature_engineering(df_credit)
# Print dtype and non-null count for each column of the resulting frame.
X_processed.info()

In [None]:
# Show the rows whose target is missing. Ending the cell with the bare
# expression lets Jupyter render the frame as a table instead of the plain-text
# dump produced by print(). These rows matter because a later cell casts
# 'target_default' to int.
X_processed[X_processed['target_default'].isnull()]

In [None]:
# Distinct (non-null) values per column, lowest-cardinality columns first.
X_processed.nunique(axis=0, dropna=True).sort_values(ascending=True)


In [None]:
import pandas as pd
import numpy as np

class ManualTargetEncoder:
    """
    Smoothed mean-target encoder for categorical columns.

    Each category is replaced by a blend of the category's mean target and the
    global target mean:

        (category_mean * category_count + global_mean * smoothing)
        / (category_count + smoothing)

    Values that have no learned encoding at transform time (categories not seen
    during fit, and missing values, which groupby skips by default) are encoded
    as the global target mean.
    """

    def __init__(self, smoothing=1.0):
        """
        Initialize the encoder.
        :param smoothing: Smoothing parameter to balance between category mean and global mean.
                          Larger values pull every category closer to the global mean.
        """
        self.smoothing = smoothing
        self.encodings = {}  # column name -> Series mapping category -> encoded value
        self.global_mean = None  # global mean of the target, set by fit()

    def fit(self, X, y):
        """
        Fit the encoder on the training data.
        :param X: DataFrame containing categorical columns.
        :param y: Target variable. A pandas Series is aligned with X on its index;
                  any other sequence (list, ndarray) is paired with X row by row.
        :return: The fitted encoder itself, so calls can be chained.
        """
        if not isinstance(y, pd.Series):
            # Give plain sequences X's index so the groupby below lines up by row.
            y = pd.Series(y, index=X.index)

        self.global_mean = y.mean()
        # Start from a clean slate so a re-fit on different columns does not
        # keep encodings learned by a previous fit.
        self.encodings = {}

        for col in X.columns:
            grouped = y.groupby(X[col])
            # Mean target and number of observations for each category
            category_means = grouped.mean()
            category_counts = grouped.count()
            # Apply smoothing: rare categories are shrunk towards the global mean
            self.encodings[col] = (
                category_means * category_counts + self.global_mean * self.smoothing
            ) / (category_counts + self.smoothing)

        return self

    def transform(self, X):
        """
        Transform the categorical columns using the learned encodings.
        :param X: DataFrame containing categorical columns (same column names as in fit).
        :return: Copy of X in which every column holds float encodings.
        :raises KeyError: If X contains a column the encoder was not fitted on.
        """
        X_transformed = X.copy()
        for col in X.columns:
            encoded = X[col].map(self.encodings[col])
            # Mapping a categorical column can yield a categorical result, and
            # fillna() on a categorical rejects values that are not existing
            # categories; casting to float first keeps the fallback fill safe.
            X_transformed[col] = encoded.astype(float).fillna(self.global_mean)
        return X_transformed

    def fit_transform(self, X, y):
        """
        Fit the encoder and transform the data in one step.
        :param X: DataFrame containing categorical columns.
        :param y: Target variable.
        :return: Transformed DataFrame.
        """
        return self.fit(X, y).transform(X)

In [None]:
import pickle
import category_encoders as ce
# The target is used as an integer class label; this cast fails if any
# 'target_default' value is still missing.
X_processed["target_default"] = X_processed["target_default"].astype(int)

# Split features from target. X is a separate frame, so the encoding written
# into X_processed below does not show up in X.
X = X_processed.drop(columns=["target_default"])
y = X_processed["target_default"]

# Every column that is not float64/int64 is treated as categorical.
categorical_cols = X.select_dtypes(exclude=['float64', 'int64']).columns.tolist()

# Replace each categorical column with its smoothed mean-target encoding.
# NOTE(review): the encoder is fitted on the whole dataset, before any
# train/test split, so target information leaks into these features --
# consider fitting on the training portion only.
encoder = ManualTargetEncoder(smoothing=1.0)
X_processed[categorical_cols] = encoder.fit_transform(X[categorical_cols], y)

pd.set_option('display.max_rows', None)  # None means unlimited rows
pd.set_option('display.max_columns', None) # None means unlimited columns
pd.set_option('display.width', None)      # None means auto-detect width
pd.set_option('display.max_colwidth', None) # None means unlimited column width
# Preview the encoded frame. X still holds the raw categories, so printing it
# would not show the result of the encoding.
print(X_processed.head(10))

# Save the fitted target encoder to disk (the file name still says "label_encoders").
# NOTE(review): ManualTargetEncoder is defined inside this notebook, so code that
# unpickles this file presumably needs the same class definition available -- confirm.
with open('saved/label_encoders.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [None]:
# Distinct (non-null) values per column after target encoding, lowest first.
X_processed.nunique(axis=0, dropna=True).sort_values(ascending=True)


In [None]:
# Sanity check: count the values that are still Python strings in columns that
# should now hold numeric target encodings. The element-wise test is applied
# column by column with Series.map because DataFrame.applymap is deprecated in
# recent pandas releases; casting to object first keeps the result a plain
# boolean column whatever the column's dtype.
non_numeric = X_processed[['state', 'real_state']].apply(
    lambda col: col.astype(object).map(lambda x: isinstance(x, str))
).sum()
print(non_numeric[non_numeric > 0])  # Show columns that still have categorical values


In [None]:
# Second feature-engineering pass (project helper imported from utils), applied
# to the frame whose categorical columns were target-encoded in an earlier cell.
# The result is bound to a new name rather than overwriting X_processed.
X_processed2 = additional_feature_engineering(X_processed)


In [None]:
# NOTE: everything in this cell is commented out, so no SMOTE resampling is
# applied when the notebook runs. The disabled code would oversample df_credit
# and write the result to 'saved/feature_engineered_data.csv'.
# import pandas as pd
# from imblearn.over_sampling import SMOTE


# # Separate features (X) and target (y)
# X = df_credit.drop('target_default', axis=1)
# y = df_credit['target_default']

# # Apply SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# # Create a new DataFrame with resampled data
# df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
# df_resampled['target_default'] = y_resampled

# Save the resampled data to a new CSV file
# df_resampled.to_csv('saved/feature_engineered_data.csv', index=False)

# print("SMOTE applied and saved to 'saved/feature_engineered_data.csv'")

In [None]:
# Persist the output of the last feature-engineering step. Writing df_credit
# here would save the frame as it was loaded from the preprocessed CSV, i.e.
# without the target encoding and the additional features, unless the helper
# functions happen to modify df_credit in place.
X_processed2.to_csv('saved/feature_engineered_data.csv', index=False)