In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


from utils import feature_engineering
# Apply an explicit seaborn style. Calling sns.set_style() with no argument
# only re-applies the rcParams that are already active, so it changes nothing.
sns.set_style("darkgrid")

# Lift pandas' display limits so frames are shown in full (None = no limit)
pd.set_option('display.max_rows', None)      # unlimited rows
pd.set_option('display.max_columns', None)   # unlimited columns
pd.set_option('display.width', None)         # auto-detect the output width
pd.set_option('display.max_colwidth', None)  # never truncate cell contents

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed bank data CSV from the saved/ directory
df_credit = pd.read_csv(filepath_or_buffer='saved/preprocessed_bank_data.csv')

In [None]:
# Run the project's feature_engineering helper (imported from utils) on the loaded frame
X_processed = df_credit.pipe(feature_engineering)

In [None]:
from utils import plot_credit_correlation

# Column groups handed to plot_credit_correlation, with 'RiskScore' as the last argument.
# NOTE(review): 'HomeOwnershipStatus' is listed under target_cols here, while the
# encoding cell further down gives it an explicit category order in
# feature_order_dict — confirm which grouping is intended.
ordinal_cols = [
    'EmploymentStatus',
    'EducationLevel',
]
target_cols = [
    'MaritalStatus',
    'LoanPurpose',
    'HomeOwnershipStatus',
]

plot_credit_correlation(df_credit, ordinal_cols, target_cols, 'RiskScore')


In [None]:
# Preview the first five rows (head() defaults to n=5)
X_processed.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from encoders import CustomTargetEncoder, CustomOrdinalEncoder

# Names of the categorical columns
c_columns = [
    'EmploymentStatus',
    'MaritalStatus',
    'HomeOwnershipStatus',
    'EducationLevel',
    'LoanPurpose',
]

# Category lists per column, in the order handed to CustomOrdinalEncoder
feature_order_dict = dict(
    EducationLevel=['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate'],
    EmploymentStatus=['Unemployed', 'Employed', 'Self-Employed'],
    HomeOwnershipStatus=['Other', 'Rent', 'Mortgage', 'Own'],
)

# Step 1: target-encode MaritalStatus and LoanPurpose, passing RiskScore as the target.
# NOTE(review): the encoder is fitted on the whole frame; if a train/test split
# happens later, confirm this does not leak target information.
# NOTE: X_processed is overwritten below, so re-running this cell feeds the
# already-encoded frame back into the encoders.
targetencoder = CustomTargetEncoder(
    columns_to_encode=['MaritalStatus', 'LoanPurpose'],
)
X_processed = targetencoder.fit_transform(
    X_processed,
    X_processed['RiskScore'],
)

# Step 2: ordinal-encode the columns listed in feature_order_dict
ordinalencoder = CustomOrdinalEncoder(feature_order_dict)
X_processed = ordinalencoder.fit_transform(X_processed)


In [None]:
import joblib

# Serialize both encoder objects to the saved/ directory with joblib.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(targetencoder, 'saved/targetencoder.joblib')
joblib.dump(ordinalencoder, 'saved/ordinalencoder.joblib')

In [None]:
# Spearman rank correlation between every pair of columns
correlation_matrix = X_processed.corr(method="spearman")

# Draw the annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(15, 18))  # adjust figure size as needed
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=.5,
    ax=ax,
)
ax.set_title("Correlation Matrix of Credit Features")
plt.show()

In [None]:
# Compare each feature with its log1p transform.
# Assumes every column of X_processed is numeric at this point — TODO confirm.

# Keep strictly positive entries only; zeros, negatives and NaN become NaN.
# Vectorized replacement for the deprecated element-wise DataFrame.applymap.
df = X_processed.where(X_processed > 0)

# log(1 + x) of every remaining entry (NaN stays NaN)
df_log = np.log1p(df)

# Pearson correlation between each original column and its log-transformed twin
correlation_results = {col: df[col].corr(df_log[col]) for col in df.columns}

# Rank the correlations from highest to lowest
ranked_correlations = pd.Series(correlation_results).sort_values(ascending=False)

# Leave the ranking as the cell's last expression so the notebook displays it
ranked_correlations

In [None]:
# Rows whose RiskScore is missing. Left as the cell's last expression so the
# notebook renders it as a table instead of printing plain text.
X_processed[X_processed['RiskScore'].isna()]

In [None]:
# Distinct-value count per column, smallest first
unique_counts = X_processed.nunique()
unique_counts.sort_values(ascending=True)

In [None]:
# Write the encoded feature table to saved/ without the index column
X_processed.to_csv(path_or_buf='saved/feature_engineered_data.csv', index=False)