In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


from utils import feature_engineering
# Plot styling.
# NOTE(review): set_style() is called without a style name; per the seaborn
# docs this appears to leave the current style parameters as they are --
# confirm whether a named style was intended.
sns.set_style()

# Lift every pandas display limit so frames are shown in full.
for display_option in (
    'display.max_rows',      # None -> unlimited rows
    'display.max_columns',   # None -> unlimited columns
    'display.width',         # None -> auto-detect the output width
    'display.max_colwidth',  # None -> never truncate cell contents
):
    pd.set_option(display_option, None)

# Silence all warning messages for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed bank data set from disk.
preprocessed_data_path = 'saved/preprocessed_bank_data.csv'
df_credit = pd.read_csv(preprocessed_data_path)

In [None]:
# Run the project's feature_engineering helper (imported from utils) on the
# loaded frame. The result, X_processed, is the frame every later cell works on.
X_processed = feature_engineering(df_credit)

In [None]:
from utils import plot_credit_correlation

# Column groups handed to the project's plot_credit_correlation helper.
ordinal_cols = [
    'EmploymentStatus',
    'EducationLevel',
]
target_cols = [
    'MaritalStatus',
    'LoanPurpose',
    'HomeOwnershipStatus',
]

# Plot the correlations using 'RiskScore' as the reference column.
plot_credit_correlation(df_credit, ordinal_cols, target_cols, 'RiskScore')


In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Categorical features that get one-hot encoded.
c_columns = [
    'EmploymentStatus',
    'MaritalStatus',
    'HomeOwnershipStatus',
    'EducationLevel',
    'LoanPurpose',
]

# Dense output, with the first category of every feature dropped.
# NOTE(review): with drop='first', handle_unknown='ignore' encodes unseen
# categories as all zeros, i.e. identically to the dropped category --
# confirm this is acceptable at inference time.
encoder = OneHotEncoder(
    sparse_output=False,
    drop='first',
    handle_unknown='ignore',
).fit(X_processed[c_columns])

# Names of the generated one-hot columns.
feature_names = encoder.get_feature_names_out(c_columns)
# During training
def transform_with_encoder(X, categorical_cols, other_cols, encoder):
    """One-hot encode the categorical columns of ``X`` and re-attach the rest.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing ``categorical_cols`` and, if given, ``other_cols``.
    categorical_cols : list of str
        Columns passed to ``encoder.transform`` and to
        ``encoder.get_feature_names_out``.
    other_cols : list, pandas.Index, numpy.ndarray or None
        Columns copied through unchanged. ``None`` or an empty collection
        means that only the encoded columns are returned.
    encoder : object
        Already-fitted encoder exposing ``transform`` and
        ``get_feature_names_out``. Its ``transform`` output must be something
        ``pd.DataFrame`` can consume with one column per feature name.

    Returns
    -------
    pandas.DataFrame
        ``other_cols`` (when present) followed by the encoded columns,
        sharing the index of ``X``.
    """
    # Apply the encoding to the categorical columns only.
    encoded_array = encoder.transform(X[categorical_cols])

    # Wrap the raw array in a frame; reusing X's index keeps the concat
    # below row-aligned.
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X.index
    )

    # Decide by length rather than truthiness: bool() of a pandas Index or a
    # multi-element ndarray raises ValueError, while lists, None and scalars
    # keep their previous behaviour.
    if other_cols is None:
        has_other_cols = False
    elif hasattr(other_cols, '__len__'):
        has_other_cols = len(other_cols) > 0
    else:
        has_other_cols = bool(other_cols)

    if not has_other_cols:
        return encoded_df

    # Pass-through columns first, encoded columns after.
    return pd.concat([X[other_cols], encoded_df], axis=1)

# Everything that is not one-hot encoded is passed through untouched.
other_columns = [name for name in X_processed.columns if name not in c_columns]

# Debug output: all columns, the categorical ones, then the pass-through ones.
for column_listing in (X_processed.columns, c_columns, other_columns):
    print(column_listing)

# Replace the categorical columns with their one-hot encoding.
# NOTE: X_processed is overwritten here, so the categorical columns are gone
# afterwards; re-running this cell requires re-running the
# feature_engineering cell first.
X_processed = transform_with_encoder(
    X_processed,
    c_columns,
    other_columns,
    encoder,
)

In [None]:
import pickle

# Persist the fitted one-hot encoder so the same encoding can be reused later.
with open('saved/encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# Quick look at the first encoded row.
first_row = X_processed.head(1)
print(first_row)

In [None]:
# Keep strictly positive values only; zeros, negatives and missing values
# become NaN and are therefore ignored by the pairwise correlation below.
# Vectorised replacement for the deprecated element-wise DataFrame.applymap;
# the float cast keeps boolean/nullable columns usable by np.log1p.
df = X_processed.where(X_processed > 0).astype(float)

# log(1 + x) of every feature, computed on the whole frame at once.
df_log = np.log1p(df)

# Correlation of each feature with its own log-transformed version.
# Columns with fewer than two distinct positive values yield NaN.
correlation_results = {col: df[col].corr(df_log[col]) for col in df.columns}

# Rank the correlations from highest to lowest. Nothing is displayed here;
# inspect `ranked_correlations` to see the ranking.
ranked_correlations = pd.Series(correlation_results).sort_values(ascending=False)

In [None]:
# Show the rows whose RiskScore is missing.
risk_score_missing = X_processed['RiskScore'].isnull()
print(X_processed[risk_score_missing])

In [None]:
# Number of distinct values per column, smallest first.
X_processed.nunique().sort_values(ascending=True)

In [None]:
# Write the encoded feature frame to disk without the index column.
feature_engineered_path = 'saved/feature_engineered_data.csv'
X_processed.to_csv(feature_engineered_path, index=False)