# Preprocessing Section

## Data Loading 

In [17]:
import sys
import os
import importlib

# Add project to path so we can import our modules
sys.path.append(os.path.abspath(".."))

# Import functionality from our source code
import src.data_loading.data_loader as data_loader
importlib.reload(data_loader)
from src.data_loading.data_loader import load_data, get_numerical_features, get_categorical_features


# Read the raw credit-score dataset through the project's loader.
df = load_data()

# Split the column names by kind once, so the following sections can
# address numerical and categorical features separately.
list_numerical_features, list_categorical_features = (
    get_numerical_features(df),
    get_categorical_features(df),
)

Loading data from c:\Users\Utente\Desktop\STUDIO\LUISS\ANNO_3\Advanced_Coding\Credit_Score_Classification\data\raw\credit_score_dataset.csv
Loaded dataset with 100000 rows and 27 columns


## Categorical features

In [18]:
import src.preprocessing.categorical.categorical_features as categorical_features
importlib.reload(categorical_features)
from src.preprocessing.categorical.categorical_features import *

# Print the distinct values of every categorical column (very long lists are
# shown as a sample only). The trailing ';' keeps the notebook from echoing
# the call's return value.
visualize_unique_categories(df, list_categorical_features);


Column 'ID' (90058 uniques):
the length of the list is too long to be printed entirely. Sample:
['0x1603', '0x1604', '0x1605', '0x1607', '0x1608', '0x1609', '0x160e', '0x160f', '0x1610', '0x1611']

Column 'Customer_ID' (12500 uniques):
the length of the list is too long to be printed entirely. Sample:
['CUS_0xd40', 'CUS_0x21b1', 'CUS_0x2dbc', 'CUS_0xb891', 'CUS_0x1cdb', 'CUS_0x95ee', 'CUS_0x284a', 'CUS_0x5407', 'CUS_0x4157', 'CUS_0xba08']

Column 'Month' (8 uniques):
['February', 'April', 'May', 'June', 'July', 'August', 'January', 'March']

Column 'Name' (10139 uniques):
the length of the list is too long to be printed entirely. Sample:
['Aaron Maashoh', 'Rick Rothackerj', 'Langep', 'Jasond', 'Deepaa', 'Np', 'Nadiaq', 'Annk', 'Charlie Zhur', 'Jamesj']

Column 'City' (4 uniques):
['Lonton', 'Standhampton', 'BadShire', 'ZeroVille']

Column 'Street' (31 uniques):
['Oxford Street', 'Old Street', 'Downing Street', 'Jubilee Street', 'Elm Street', 'Market Street', 'King Street', 'Abbey Driv

### Drop Identifier Columns

In [19]:
# Columns that only identify a record or a person; they are removed because
# they are pure identifiers and could add bias to the model.
IDENTIFIERS_TO_DROP = ['ID', 'Customer_ID', 'Name', 'SSN']

# Remove the identifier columns from the working frame.
df = df.drop(labels=IDENTIFIERS_TO_DROP, axis='columns')

### Handle "Street" feature

In [20]:
# NOTE(review): presumably re-inserts a missing space before the word
# 'Street' in values of the 'Street' column (e.g. 'OxfordStreet' ->
# 'Oxford Street') — confirm against add_space_before_word.
df = add_space_before_word(df, feature_column='Street', separator_word='Street')

### Handle "Credit History Age" feature

In [21]:
# Column summary while the values are still stored as text.
print("Credit_History_Age information before parsing:")
df["Credit_History_Age"].info()

# Visual separator between the two summaries.
print(f"\n{'-' * 100}\n")

# Parse every entry with parse_string_time_period (applied element-wise).
df["Credit_History_Age"] = df["Credit_History_Age"].apply(parse_string_time_period)

# Same summary again, to compare dtype and non-null count after parsing.
print("Credit_History_Age information after parsing:")
df["Credit_History_Age"].info()

Credit_History_Age information before parsing:
<class 'pandas.core.series.Series'>
RangeIndex: 100000 entries, 0 to 99999
Series name: Credit_History_Age
Non-Null Count  Dtype 
--------------  ----- 
81791 non-null  object
dtypes: object(1)
memory usage: 781.4+ KB

----------------------------------------------------------------------------------------------------

Credit_History_Age information after parsing:
<class 'pandas.core.series.Series'>
RangeIndex: 100000 entries, 0 to 99999
Series name: Credit_History_Age
Non-Null Count  Dtype  
--------------  -----  
81791 non-null  float64
dtypes: float64(1)
memory usage: 781.4 KB


### Handle "Type of Loan" feature - Converting into Multiple Binary Numeric Features

In [22]:
# Handling Type of Loan feature: replace the free-text 'Type_of_Loan' column
# with one numeric indicator column ('Has_<type>') per distinct loan type.

# Number of distinct raw values, taken before the missing entries are
# relabelled (nunique() does not count NaN).
original_count_unique_categories = df['Type_of_Loan'].nunique()

visualize_top_n_categories(df, column='Type_of_Loan', top_n=25);

# Give missing entries an explicit label so they are handled like any other value.
df['Type_of_Loan'] = df['Type_of_Loan'].fillna('Missing_Loan')

unique_loan_types_and_counts = get_unique_values_and_counts(df, column='Type_of_Loan')

# Print the discovered loan types and their frequencies
print(f"Discovered {len(unique_loan_types_and_counts)} unique values")
for unique_value, count in unique_loan_types_and_counts.items():
    print(f"- {unique_value}: {count:,} occurrences")


df = transform_to_binary_features(df, column='Type_of_Loan', unique_values=unique_loan_types_and_counts.keys(), feature_name='Loan')


# add "Loan" suffix to the column "Has_Not_Specified" for better readability
# (reassignment instead of inplace=True keeps the cell free of in-place mutation)
df = df.rename(columns={'Has_Not_Specified': 'Has_Not_Specified_Loan'})

# Verify the transformation.
# The count is interpolated into the f-string; the previous version passed
# `{original_count_unique_categories}` as a separate argument, which printed
# a one-element set such as `{6260}` instead of the number.
print(f"\nOriginal unique values: {original_count_unique_categories}")
print("\nBinary Features (numeric) created from the categories of 'Type of Loan':")
for col in [c for c in df.columns if c.startswith('Has_')]:
    print(f"- {col}: {df[col].sum():,} records - type: {df[col].dtype}")


# Drop the original Type_of_Loan column now that the indicators exist
df = df.drop(columns=['Type_of_Loan'])





Top 25 Categories (out of 6261 unique values):
Total records analyzed: 100000

Detailed breakdown:
 1. Missing                                                                20,312 records (20.31%)
 2. Not Specified                                                           1,272 records (1.27%)
 3. Credit-Builder Loan                                                     1,143 records (1.14%)
 4. Debt Consolidation Loan                                                 1,133 records (1.13%)
 5. Personal Loan                                                           1,128 records (1.13%)
 6. Student Loan                                                            1,117 records (1.12%)
 7. Payday Loan                                                             1,061 records (1.06%)
 8. Mortgage Loan                                                           1,058 records (1.06%)
 9. Auto Loan                                                               1,024 records (1.02%)
10. Home Equity L

### Handle object columns with problematic-numeric values and convert to numeric

In [23]:
import src.preprocessing.categorical.problematic_numeric_values as problematic_numeric_values
importlib.reload(problematic_numeric_values)
from src.preprocessing.categorical.problematic_numeric_values import convert_non_numeric_strings_to_nan, identify_problematic_characters,  remove_characters, convert_to_numeric

# Columns that hold numbers but are currently stored with dtype 'object'.
PROBLEMATIC_NUMERICAL_FEATURES = [
    'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment',
    'Changed_Credit_Limit', 'Outstanding_Debt', 'Amount_invested_monthly',
]

# Column summary before cleaning and conversion.
print("\nInfo for columns to be converted to numeric:", "-" * 50, sep="\n")
df[PROBLEMATIC_NUMERICAL_FEATURES].info()

# Step 1: strings that contain no digits at all become NaN.
df = convert_non_numeric_strings_to_nan(df, PROBLEMATIC_NUMERICAL_FEATURES)

# Step 2: find the stray characters embedded in otherwise numeric strings.
print("Identifying problematic characters...\n")
problematic_characters = identify_problematic_characters(df, PROBLEMATIC_NUMERICAL_FEATURES)
print(f"\nProblematic characters found: {problematic_characters}")

# Step 3: strip those characters from the affected columns.
print(f"\nRemoving problematic characters: {problematic_characters}")
df = remove_characters(df, PROBLEMATIC_NUMERICAL_FEATURES, problematic_characters)

# Step 4: run the detection again to report what is left after cleaning.
non_numeric_characters = identify_problematic_characters(df, PROBLEMATIC_NUMERICAL_FEATURES)
print(f"\nNon-numeric characters found after cleaning: {non_numeric_characters}")

# Step 5: cast the cleaned columns to a numeric dtype.
print("\nConverting to numeric...")
df = convert_to_numeric(df, PROBLEMATIC_NUMERICAL_FEATURES)

# Column summary after the conversion.
print("\nInfo for numeric converted columns:", "-" * 50, sep="\n")
df[PROBLEMATIC_NUMERICAL_FEATURES].info()


Info for columns to be converted to numeric:
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Annual_Income            89906 non-null  object
 1   Num_of_Loan              89809 non-null  object
 2   Num_of_Delayed_Payment   83782 non-null  object
 3   Changed_Credit_Limit     89933 non-null  object
 4   Outstanding_Debt         90037 non-null  object
 5   Amount_invested_monthly  85880 non-null  object
dtypes: object(6)
memory usage: 4.6+ MB
Identifying problematic characters...


Column 'Annual_Income': 6292 values with problematic characters
--------------------------------------------------
Example 1: '34847.84_'
Example 2: '30689.89_'
Example 3: '35547.71_'

Column 'Num_of_Loan': 4267 values with problematic characters
-------------------------------------------

### Remaining categorical features

In [24]:
# Recompute the categorical column list on the current state of the frame.
list_categorical_features = get_categorical_features(df)

# Report each remaining categorical column together with its dtype.
print("\nCategorical columns:")
for col, col_dtype in df[list_categorical_features].dtypes.items():
    print(f"{col}: {col_dtype}")


Categorical columns:
Month: object
City: object
Street: object
Occupation: object
Credit_Mix: object
Payment_of_Min_Amount: object


## Numeric Features (addressing inconsistencies)

In [25]:
import src.preprocessing.numeric_features as numeric_features
importlib.reload(numeric_features)
from src.preprocessing.numeric_features import *

# Recompute the numeric column list on the current state of the frame.
list_numerical_features = get_numerical_features(df)

# Report each numeric column together with its dtype.
print("\nList of Numeric columns:")
for col, col_dtype in df[list_numerical_features].dtypes.items():
    print(f"{col}: {col_dtype}")


List of Numeric columns:
Age: float64
Annual_Income: float64
Monthly_Inhand_Salary: float64
Num_Bank_Accounts: float64
Num_Credit_Card: float64
Interest_Rate: float64
Num_of_Loan: float64
Delay_from_due_date: float64
Num_of_Delayed_Payment: float64
Changed_Credit_Limit: float64
Num_Credit_Inquiries: float64
Outstanding_Debt: float64
Credit_Utilization_Ratio: float64
Credit_History_Age: float64
Amount_invested_monthly: float64
Credit_Score: int64
Has_Payday_Loan: int64
Has_Credit_Builder_Loan: int64
Has_Not_Specified_Loan: int64
Has_Home_Equity_Loan: int64
Has_Mortgage_Loan: int64
Has_Personal_Loan: int64
Has_Student_Loan: int64
Has_Debt_Consolidation_Loan: int64
Has_Auto_Loan: int64
Has_Missing_Loan: int64


### Applying Constraints (excluding loans-related binary columns previously created)

In [26]:
# pd.concat is needed for the final merge. No explicit pandas import is visible
# in the earlier cells, so import it here instead of depending on a name that
# may only be present because of one of the wildcard imports.
import pandas as pd

# Identify loan-related columns (names containing both 'Has' and 'Loan')
loan_columns = [col for col in df.columns if 'Has' in col and 'Loan' in col]
print(f"\nLoan-related columns identified ({len(loan_columns)}):")
# The header above ends with ':' — list the columns it announces.
for loan_column in loan_columns:
    print(f"- {loan_column}")

# Work on a copy so the constraint step operates on separate data from `df`
df_copy = df.copy()

# Create a separate dataframe for loan-related columns
# This preserves the original index for later merging if needed
df_loans = df_copy[loan_columns]

# Create a dataframe without loan-related columns
df_no_loans = df_copy.drop(columns=loan_columns)


# Define constraints by feature: allowed minimum / maximum (None = unbounded)
# and whether the value is expected to be an integer.
CONSTRAINTS = {
    'Age': {'min': 10, 'max': 120, 'integer': True},
    'Annual_Income': {'min': 0, 'max': None, 'integer': False}, 
    'Monthly_Inhand_Salary': {'min': 0, 'max': None, 'integer': False}, 
    'Num_Bank_Accounts': {'min': 0, 'max': 50, 'integer': True},
    'Num_Credit_Card': {'min': 0, 'max': 50, 'integer': True},
    'Interest_Rate': {'min': 0, 'max': 100, 'integer': False},
    'Num_of_Loan': {'min': 0, 'max': 50, 'integer': True},
    'Delay_from_due_date': {'min': 0, 'max': 180, 'integer': True},
    'Num_of_Delayed_Payment': {'min': 0, 'max': None, 'integer': True}, 
    'Changed_Credit_Limit': {'min': None, 'max': None, 'integer': False}, 
    'Num_Credit_Inquiries': {'min': 0, 'max': 50, 'integer': True},
    'Outstanding_Debt': {'min': 0, 'max': None, 'integer': False},
    'Credit_Utilization_Ratio': {'min': 0, 'max': 100, 'integer': False},
    'Credit_History_Age': {'min': 0, 'max': 110, 'integer': False},
    'Amount_invested_monthly': {'min': 0, 'max': None, 'integer': False}, 
    'Credit_Score': {'min': 0, 'max': 3, 'integer': True}
}

# TODO: CREATE A BEFORE/AFTER PLOTS VISUALIZATION REMOVE THE STATISTICS

# Descriptive statistics of every numeric column before the constraints
numeric_features_before = get_numerical_features(df_no_loans)
print("\nNumeric columns Statistics - Before Setting Constraints:\n")
for col in numeric_features_before:
    print(f"{col}: {df_no_loans[col].describe()}\n")
print("\n"+"-"*100+"\n")

print("Setting constraints for numeric features...\n")
df_no_loans = set_constraints_for_numeric_features(df_no_loans, numeric_features_before, CONSTRAINTS)


# Same statistics again so the effect of the constraints can be compared
numeric_features_after = get_numerical_features(df_no_loans)
print("\n"+"-"*100+"\n")
print("\nNumeric columns Statistics - After Setting Constraints:\n")
for col in numeric_features_after:
    print(f"{col}: {df_no_loans[col].describe()}\n")

# Merge back with the dataframe containing loans binary columns.
# NOTE(review): concat(axis=1) aligns the two frames on their index; this
# assumes set_constraints_for_numeric_features keeps every row and the
# original index — TODO confirm. The loan columns end up after all others.
df = pd.concat([df_no_loans, df_loans], axis=1)


Loan-related columns identified (10):

Numeric columns Statistics - Before Setting Constraints:

Age: count    85512.000000
mean       110.226845
std        684.907588
min       -500.000000
25%         24.000000
50%         33.000000
75%         42.000000
max       8698.000000
Name: Age, dtype: float64

Annual_Income: count    8.990600e+04
mean     1.765448e+05
std      1.429013e+06
min      7.005930e+03
25%      1.945305e+04
50%      3.757975e+04
75%      7.276004e+04
max      2.419806e+07
Name: Annual_Income, dtype: float64

Monthly_Inhand_Salary: count    76405.000000
mean      4185.789272
std       3178.560506
min        303.645417
25%       1625.023750
50%       3086.683333
75%       5940.317500
max      15204.633330
Name: Monthly_Inhand_Salary, dtype: float64

Num_Bank_Accounts: count    90167.000000
mean        17.105172
std        117.728215
min         -1.000000
25%          3.000000
50%          6.000000
75%          7.000000
max       1798.000000
Name: Num_Bank_Accounts, dt

## Reduced dataset (25 000 rows) for faster computation in next sections

In [27]:
# Draw a reproducible random subset of 25,000 rows (fixed random_state) and
# continue the notebook on that subset; the subset is also kept under its
# own name.
df = reduced_dataset_25k = df.sample(n=25_000, random_state=42)

# Summary of the reduced frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 25000 entries, 75721 to 11627
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Month                        22559 non-null  object 
 1   City                         22503 non-null  object 
 2   Street                       22513 non-null  object 
 3   Age                          20816 non-null  float64
 4   Occupation                   22511 non-null  object 
 5   Annual_Income                22479 non-null  float64
 6   Monthly_Inhand_Salary        19028 non-null  float64
 7   Num_Bank_Accounts            22191 non-null  float64
 8   Num_Credit_Card              21946 non-null  float64
 9   Interest_Rate                22131 non-null  float64
 10  Num_of_Loan                  21511 non-null  float64
 11  Delay_from_due_date          22398 non-null  float64
 12  Num_of_Delayed_Payment       20801 non-null  float64
 13  Changed_Credit_Li

## Missing Values Handling

### Handle Missing values for numeric features

In [28]:
import src.preprocessing.missing_values.knn_imputation as knn_imputation
importlib.reload(knn_imputation)
from src.preprocessing.missing_values.knn_imputation import *

import src.preprocessing.missing_values.missing_values_processing as missing_values
importlib.reload(missing_values)
from src.preprocessing.missing_values.missing_values_processing import *


# Missing-value statistics restricted to the numeric features.
missing_stats_for_num_features = analyze_missing_values(df[list_numerical_features])
print("\nMissing Value Statistics for Numerical features:\n")
print(missing_stats_for_num_features)

# The statistics table is indexed by column name; its index is the list of
# columns handed to the imputer.
# NOTE(review): assumes analyze_missing_values only returns rows for columns
# that actually contain missing values — confirm.
num_columns_with_missing = missing_stats_for_num_features.index.tolist()

print(f"\nApplying KNN imputation method for {len(num_columns_with_missing)} numeric columns\n")
# Only the imputed frame is kept; the second return value is discarded.
df, _ = apply_and_evaluate_knn_imputation(df=df, columns=num_columns_with_missing, plot=False)

# Per-column count of values that are still missing after imputation.
print(f"\nCheck remaining missing values:\n{df[list_numerical_features].isna().sum()}\n")

Total features with missing values: 15

Missing Value Statistics for Numerical features:

                          Missing Count  Missing Percentage Data Type
Monthly_Inhand_Salary              5972               23.89   float64
Credit_History_Age                 4547               18.19   float64
Num_of_Delayed_Payment             4199               16.80   float64
Age                                4184               16.74   float64
Amount_invested_monthly            3519               14.08   float64
Num_of_Loan                        3489               13.96   float64
Num_Credit_Inquiries               3264               13.06   float64
Num_Credit_Card                    3054               12.22   float64
Changed_Credit_Limit               2906               11.62   float64
Interest_Rate                      2869               11.48   float64
Num_Bank_Accounts                  2809               11.24   float64
Delay_from_due_date                2602               10.41   float64


### Handle Missing values for categorical features

##### TODO: consider creating (and applying) a separate class for hyperparameter tuning here

In [29]:

# Visualize the proportion of unique categories for each categorical feature
visualize_proportion_of_unique_categories(df, list_categorical_features)

# Snapshot of the frame before the categorical gaps are filled. In this cell
# it is only referenced by the commented-out before/after comparison plot.
df_before_missing = df.copy()

def fill_missing_with_category(df, feature_column, value_to_fill):
    """Return a copy of ``df`` with the gaps of one column filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``feature_column``. It is left untouched.
    feature_column : str
        Name of the column whose missing values are replaced.
    value_to_fill : object
        Value written wherever ``feature_column`` is missing.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except that the missing entries of
        ``feature_column`` hold ``value_to_fill``.

    Raises
    ------
    KeyError
        If ``feature_column`` is not a column of ``df``.
    """
    # Work on a copy: the previous version assigned into the caller's frame,
    # silently changing any other reference to it (e.g. a "before" snapshot
    # taken without .copy()).
    filled = df.copy()
    filled[feature_column] = filled[feature_column].fillna(value_to_fill)
    return filled

# Columns whose gaps are filled with a column-specific marker value.
for feature_name, marker_value in (('Credit_Mix', '_'), ('Payment_of_Min_Amount', 'NM')):
    df = fill_missing_with_category(df, feature_column=feature_name, value_to_fill=marker_value)

# Every gap still left in a categorical column becomes the category 'Unknown'.
df[list_categorical_features] = df[list_categorical_features].fillna('Unknown')

# TODO: improve the comparison plot, then re-enable it:
# plot_distribution_comparison_for_categorical(df_before_missing, df, list_categorical_features)





Column 'Month' has 9 unique categories:

All categories (proportion) for column 'Month':

February: 11.48%
June: 11.44%
April: 11.28%
January: 11.26%
March: 11.23%
July: 11.20%
August: 11.19%
May: 11.15%
Missing: 9.76%

----------------------------------------------------------------------------------------------------


Column 'City' has 5 unique categories:

All categories (proportion) for column 'City':

Standhampton: 47.55%
BadShire: 26.23%
Lonton: 15.85%
Missing: 9.99%
ZeroVille: 0.38%

----------------------------------------------------------------------------------------------------


Column 'Street' has 32 unique categories:

Showing top 10 most frequent categories (proportion) for column 'Street':

Missing: 9.95%
Quality Street: 5.36%
Jubilee Street: 5.12%
Memory Street: 5.04%
Fleet Street: 5.04%
Old Street: 5.01%
High Street: 4.97%
New Street: 4.93%
Grove Street: 4.90%
Imperial Street: 4.85%

----------------------------------------------------------------------------------

## Save Initial Processed Dataset (with 25k rows)

In [30]:
import pathlib

# The project root is taken to be the parent of the current working
# directory (assumes the notebook is run from a direct sub-folder of the project).
script_dir = pathlib.Path.cwd()
project_root = script_dir.parent

# Target folder: <project_root>/data/processed, created when it does not exist yet
save_dir = project_root.joinpath('data', 'processed')
save_dir.mkdir(parents=True, exist_ok=True)

# Write the current frame as CSV, without the index column
save_path = save_dir.joinpath('initial_processed_dataset_25k_rows.csv')
df.to_csv(save_path, index=False)
print(f"Saved Intermediate Dataset to {save_path}")

Saved Intermediate Dataset to c:\Users\Utente\Desktop\STUDIO\LUISS\ANNO_3\Advanced_Coding\Credit_Score_Classification\data\processed\initial_processed_dataset_25k_rows.csv


## Save Initial Processed Dataset (with 10k rows)

In [31]:
# Draw a reproducible random subset of 10,000 rows from the current frame
# (at this point the 25k-row subset, not the original dataset) and continue
# the notebook on it; the subset is also kept under its own name.
df = reduced_dataset_10k = df.sample(n=10_000, random_state=42)

import pathlib

# The project root is taken to be the parent of the current working
# directory (assumes the notebook is run from a direct sub-folder of the project).
script_dir = pathlib.Path.cwd()
project_root = script_dir.parent

# Target folder: <project_root>/data/processed, created when it does not exist yet
save_dir = project_root.joinpath('data', 'processed')
save_dir.mkdir(parents=True, exist_ok=True)

# Write the 10k-row frame as CSV, without the index column
save_path = save_dir.joinpath('initial_processed_dataset_10k_rows.csv')
df.to_csv(save_path, index=False)
print(f"Saved Intermediate Dataset to {save_path}")

Saved Intermediate Dataset to c:\Users\Utente\Desktop\STUDIO\LUISS\ANNO_3\Advanced_Coding\Credit_Score_Classification\data\processed\initial_processed_dataset_10k_rows.csv
