In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

import io
from openpyxl.drawing.image import Image
import sys
# Add the parent directory to the sys.path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from utils.eda import  categorical_univariate_summary,numerical_univariate_summary
from src.utils.feature_engineering import compute_ratio_columnwise

# Lift pandas' display limits so wide EDA tables are shown without "..." truncation.
pd.set_option("display.max_columns", None)  # Show all columns
# NOTE(review): with no row limit, displaying a whole frame prints every row;
# prefer .head() / .sample() when looking at large frames.
pd.set_option("display.max_rows", None)     # Show all rows (optional)

In [2]:
# Training data: load the training split of the application table.
# The path is relative to the notebook's working directory.
app_train = pd.read_csv('../data/raw/app_train.csv')

In [None]:
# Report the (rows, columns) shape of the training frame, then preview its first rows.
print(f"Training Data shape : {app_train.shape}")
app_train.head()

The training data has 215257 observations (each one a separate loan) and 122 features (variables) including the `TARGET` (the label we want to predict).

In [None]:
# Testing data features
# Load the test split from the same raw-data folder, print its (rows, columns)
# shape and preview the first rows.
app_test= pd.read_csv('../data/raw/app_test.csv')
print('Testing data shape: ', app_test.shape)
app_test.head()

# Exploratory Data Analysis

Exploratory Data Analysis (EDA) is an open-ended process where we calculate statistics and make figures to find trends, anomalies, patterns, or relationships within the data. The goal of EDA is to learn what our data can tell us. It generally starts out with a high level overview, then narrows in to specific areas as we find intriguing areas of the data. The findings may be interesting in their own right, or they can be used to inform our modeling choices, such as by helping us decide which features to use.

## Examine the Distribution of the Target Column

The target is what we are asked to predict: either a 0, meaning the loan was repaid on time, or a 1, indicating the client had payment difficulties, i.e. a late payment of more than X days on at least one of the first Y installments of the loan in our sample. We can first examine the number of loans falling into each category.

In [None]:
# Class balance of the training labels: for every TARGET value, the number of
# loans and the share of all loans (in percent), side by side in one table.
train_target = app_train['TARGET']
counts = train_target.value_counts()
percentages = train_target.value_counts(normalize=True) * 100
app_train_target = pd.concat(
    [counts.rename('Count'), percentages.rename('Percentage')],
    axis=1,
)
print(app_train_target)

In [None]:
# Class balance of the test labels, built the same way as for the training
# split: count and percentage share of every TARGET value.
# (relies on app_test carrying a TARGET column)
test_target = app_test['TARGET']
counts = test_target.value_counts()
percentages = test_target.value_counts(normalize=True) * 100
app_test_target = pd.concat(
    [counts.rename('Count'), percentages.rename('Percentage')],
    axis=1,
)
print(app_test_target)

From this information, we see this is an imbalanced class problem. There are far more loans that were repaid on time than loans that were not repaid.

In [None]:
# Preview the first rows of the training frame.
app_train.head()

In [None]:
# Number of each type of column
# (how many columns of the training frame share each pandas dtype)
app_train.dtypes.value_counts()

In [None]:
# Shows the Python type of the list of object-dtype column names.
# NOTE(review): looks like a leftover debugging probe; nothing else in view uses
# its result — consider removing.
type(app_train.select_dtypes(include=['object']).columns.tolist())

In [15]:
# Column dictionary: read the description file (latin1-encoded) and keep only
# the rows that document the application table.
data_description = pd.read_csv(
    '../data/raw/HomeCredit_columns_description.csv',
    encoding='latin1',
    usecols=['Table', 'Row', 'Description'],
)

# Map each application column name ('Row') to its textual description.
is_application_row = data_description['Table'] == 'application_{train|test}.csv'
data_dict = (
    data_description.loc[is_application_row, ['Row', 'Description']]
    .set_index('Row')['Description']
    .to_dict()
)

In [None]:
# Split the columns of app_train into a "categorical" group and a "numeric"
# group, which are then summarised separately.

# Part 1: every object-dtype column is categorical.
categorical_columns = list(app_train.select_dtypes(include=['object']).columns)

# Part 2: numeric columns with at most 21 distinct non-null values are treated
# as categorical too.
numeric_unique_counts = app_train.select_dtypes(include=['number']).nunique()
numeric_categorical_columns = numeric_unique_counts[numeric_unique_counts <= 21].index.tolist()

final_categorical_columns = categorical_columns + numeric_categorical_columns
print(len(final_categorical_columns))

# All remaining columns (in their original order) form the numeric group.
all_columns = app_train.columns.tolist()
categorical_lookup = set(final_categorical_columns)
numeric_columns = [col for col in all_columns if col not in categorical_lookup]
print(len(numeric_columns))
print(len(all_columns))




In [None]:
# Generate the summary and save to the specified path
# NOTE(review): `categorical_univariate_summary` is a project helper that is not
# visible here; presumably it summarises each listed column, uses `data_dict` for
# the column descriptions and writes its report under ../reports/ — confirm in
# utils/eda.
categorical_univariate_summary(app_train, final_categorical_columns,data_dict=data_dict, save_path="../reports/")


In [None]:
#Lets remove CODE_GENDER=XNA  rows

#Features not to use 
#['NAME_CONTRACT_TYPE','CODE_GENDER','NAME_FAMILY_STATUS','OCCUPATION_TYPE','HOUSETYPE_MODE',
#'FONDKAPREMONT_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE',]

# Categorical to binary encoding
#['FLAG_OWN_CAR','FLAG_OWN_REALTY',

# Convert categorical column to 'category' dtype
#df['Category'] = df['Category'].astype('category')
#['NAME_TYPE_SUITE','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_HOUSING_TYPE','WEEKDAY_APPR_PROCESS_START',
#'ORGANIZATION_TYPE',]

# Convert categorical column to label encoding


# numerical variables to use
#['CNT_CHILDREN','FLAG_MOBIL','FLAG_EMP_PHONE','FLAG_WORK_PHONE','FLAG_CONT_MOBILE','FLAG_EMAIL',
# 'CNT_FAM_MEMBERS','REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY','REG_REGION_NOT_LIVE_REGION',
#'REG_REGION_NOT_WORK_REGION','DEF_30_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE','FLAG_DOCUMENT_2','FLAG_DOCUMENT_3',
#'FLAG_DOCUMENT_6','FLAG_DOCUMENT_8','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',
#'AMT_REQ_CREDIT_BUREAU_QRT']

# OTHER FLAG_DOCUMENT VARIABLES WERE NOT SELECTED BECAUSE OF THEIR LOW VARIANCE


# Also check whether the categories are fixed or whether new categories can appear later; if they can, handle them
#['NAME_INCOME_TYPE',]

In [None]:
# Univariate summary of the columns placed in the numeric group.
# NOTE(review): `numerical_univariate_summary` is a project helper that is not
# visible here; presumably it writes its report under ../reports/ and uses
# `data_dict` for the column descriptions — confirm in utils/eda.
numerical_univariate_summary(app_train, numerical_columns=numeric_columns,data_dict=data_dict, save_path="../reports/")

#### Numerical variables to use and strategy
- The client income variable AMT_INCOME_TOTAL has outliers and cannot be used directly in absolute terms, so it
needs some feature engineering before it can be used.
- AMT_CREDIT also has some outliers and is skewed.

Feature Creation:
1. AMT_CREDIT/AMT_INCOME_TOTAL  -- for this make sure quality check that both should not null and AMT_INCOME_TOTAL is not zero
2. AMT_ANNUITY/AMT_INCOME_TOTAL --> decide how to handle if any of the null 
3. AMT_CREDIT/AMT_GOODS_PRICE --> how to handle if any null or missing
4. DAYS_BIRTH/-365 -->age of the client
5. DAYS_EMPLOYED/-365 if DAYS_EMPLOYED<0, otherwise np.nan (non-negative DAYS_EMPLOYED values are treated as missing) --> years of employment (employment_years)

# Direct to use
['REGION_POPULATION_RELATIVE','employment_years','DAYS_REGISTRATION','DAYS_ID_PUBLISH','HOUR_APPR_PROCESS_START','EXT_SOURCE_1',
'EXT_SOURCE_2','EXT_SOURCE_3','DAYS_LAST_PHONE_CHANGE','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_YEAR']

# Not to use directly
['AMT_CREDIT','AMT_INCOME_TOTAL','AMT_ANNUITY','AMT_GOODS_PRICE','DAYS_BIRTH','DAYS_EMPLOYED']

# New Features Created and to use
['AMT_CREDIT_AMT_INCOME_TOTAL_ratio','Client_Age','AMT_ANNUITY_AMT_INCOME_TOTAL_ratio','AMT_CREDIT_AMT_GOODS_PRICE_ratio','employment_age']

- IMP Notes: Before live prediction we need to set up proper feature validation and predict only when the input is valid.
This can be limited to the most significant (important) features, because they drive the decision, i.e. if the significant features are not available, do not make a prediction.


### Domain Knowledge Features

In [None]:
# Lets remove CODE_GENDER=XNA
# Keep every row whose CODE_GENDER is not the placeholder 'XNA'.
# `.copy()` turns the filtered result into an independent frame: the cells that
# follow add new columns to `app_train`, and assigning into a filtered slice is
# the pattern behind pandas' SettingWithCopyWarning (which the global warnings
# filter at the top of the notebook would hide).
app_train = app_train.loc[app_train['CODE_GENDER'] != 'XNA'].copy()
app_train.shape

In [5]:
# Credit-to-income ratio feature (AMT_CREDIT relative to AMT_INCOME_TOTAL).
# NOTE(review): `compute_ratio_columnwise` is a project helper that is not visible
# here; presumably it divides the first named column by the second and deals with
# null / zero denominators — confirm in src/utils/feature_engineering.
app_train['AMT_CREDIT_AMT_INCOME_TOTAL_ratio'] = compute_ratio_columnwise(app_train, 'AMT_CREDIT', 'AMT_INCOME_TOTAL')

In [6]:
# Annuity-to-income ratio feature (AMT_ANNUITY relative to AMT_INCOME_TOTAL).
# NOTE(review): how `compute_ratio_columnwise` treats missing values or a zero
# denominator is not visible here — confirm in src/utils/feature_engineering.
app_train['AMT_ANNUITY_AMT_INCOME_TOTAL_ratio'] = compute_ratio_columnwise(app_train, 'AMT_ANNUITY', 'AMT_INCOME_TOTAL')

In [7]:
# Credit-to-goods-price ratio feature (AMT_CREDIT relative to AMT_GOODS_PRICE).
# NOTE(review): how `compute_ratio_columnwise` treats missing values or a zero
# denominator is not visible here — confirm in src/utils/feature_engineering.
app_train['AMT_CREDIT_AMT_GOODS_PRICE_ratio'] = compute_ratio_columnwise(app_train, 'AMT_CREDIT', 'AMT_GOODS_PRICE')

In [8]:
# Client age in years: DAYS_BIRTH divided by -365, rounded to 2 decimals.
# (assumes DAYS_BIRTH is stored as a negative day count, so the result is positive — TODO confirm)
app_train["Client_Age"] = (app_train["DAYS_BIRTH"]/(-365)).round(2)

In [9]:
# Years in employment: DAYS_EMPLOYED / -365 (rounded to 2 decimals) where
# DAYS_EMPLOYED is negative; NaN for zero, positive or missing values.
days_employed = app_train['DAYS_EMPLOYED']
app_train['employment_years'] = (days_employed / -365).round(2).where(days_employed < 0)

In [10]:
# "employment_age" = Client_Age - employment_years, with sentinel codes for the
# rows where that difference is not used:
#   -1 -> employment_years is missing
#   -2 -> Client_Age is not greater than employment_years
# The conditions are checked in order, so the missing-value rule wins.
client_age = app_train["Client_Age"]
years_employed = app_train["employment_years"]
app_train["employment_age"] = np.select(
    [years_employed.isna(), client_age > years_employed],
    [-1, client_age - years_employed],
    default=-2,
)

In [11]:
# Credit-to-annuity ratio feature (AMT_CREDIT relative to AMT_ANNUITY).
# NOTE(review): how `compute_ratio_columnwise` treats missing values or a zero
# denominator is not visible here — confirm in src/utils/feature_engineering.
app_train['AMT_CREDIT_AMT_ANNUITY_ratio'] = compute_ratio_columnwise(app_train, 'AMT_CREDIT', 'AMT_ANNUITY')

In [None]:
# For every engineered feature, overlay its estimated density for the two
# TARGET classes (0 and 1), one panel per feature stacked in a single column.
engineered_features = [
    'AMT_CREDIT_AMT_GOODS_PRICE_ratio',
    'AMT_ANNUITY_AMT_INCOME_TOTAL_ratio',
    'AMT_CREDIT_AMT_INCOME_TOTAL_ratio',
    'employment_years',
    'employment_age',
    'AMT_CREDIT_AMT_ANNUITY_ratio',
]
target_curves = ((0, 'target == 0'), (1, 'target == 1'))

fig, axes = plt.subplots(len(engineered_features), 1, figsize=(12, 20))
for ax, feature in zip(axes, engineered_features):
    # one density curve per target class on the same axes
    for target_value, curve_label in target_curves:
        sns.kdeplot(
            app_train.loc[app_train['TARGET'] == target_value, feature],
            label=curve_label,
            ax=ax,
        )

    # Label the panel
    ax.set_title('Distribution of %s by Target Value' % feature)
    ax.set_xlabel('%s' % feature)
    ax.set_ylabel('Density')
    ax.legend()

fig.tight_layout(h_pad=2.5)

In [None]:
# Columns carried forward from the EDA: categorical inputs, numerical inputs,
# the label column and the row key.
categorical_features = [
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_HOUSING_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
]

numerical_features = [
    # low-cardinality numeric columns ("numerical variables to use" in the notes)
    'CNT_CHILDREN', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
    'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_8',
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT',
    # columns listed as "direct to use" in the notes
    'REGION_POPULATION_RELATIVE', 'employment_years', 'DAYS_REGISTRATION',
    'DAYS_ID_PUBLISH', 'HOUR_APPR_PROCESS_START',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    # newly created features
    'AMT_CREDIT_AMT_INCOME_TOTAL_ratio', 'Client_Age',
    'AMT_ANNUITY_AMT_INCOME_TOTAL_ratio',
    'AMT_CREDIT_AMT_GOODS_PRICE_ratio', 'employment_age',
]

Target = "TARGET"
primary_key = "SK_ID_CURR"

# Number of selected features (label and key not counted).
len(categorical_features)+len(numerical_features)

In [14]:
# Keep only the selected features plus the label and the row key.
# NOTE: this rebinds `app_train`, so every column that is not listed (for example
# AMT_CREDIT_AMT_ANNUITY_ratio) is no longer available in later cells.
app_train = app_train[categorical_features + numerical_features + [Target,primary_key]]

In [None]:
# (rows, columns) of the training frame.
app_train.shape

In [None]:
# Scratch example for checking how a categorical dtype survives being written
# to disk: a tiny frame whose 'Category' column uses the 'category' dtype.
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.

# Create a small DataFrame with a categorical column
df = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5],
    'Category': ['A', 'B', 'A', 'C', 'B']
})

# Convert 'Category' column to categorical type
df['Category'] = df['Category'].astype('category')

# Check data types
print("Original DataFrame:")
print(df.dtypes)


In [None]:
# Round-trip the scratch frame through CSV to see which dtypes come back.
# The file "test.csv" is written to the current working directory.
df.to_csv("test.csv", index=False)

# Read back from CSV
df_csv = pd.read_csv("test.csv")

print("\nDataFrame after loading CSV:")
print(df_csv.dtypes)  # 'Category' will be 'object' (string)


In [None]:
# Round-trip the same scratch frame through Parquet for comparison.
# The file "test.parquet" is written to the current working directory.
# NOTE(review): to_parquet / read_parquet need a Parquet engine (pyarrow or
# fastparquet) installed in the environment.
df.to_parquet("test.parquet")

# Read back from Parquet
df_parquet = pd.read_parquet("test.parquet")

print("\nDataFrame after loading Parquet:")
print(df_parquet.dtypes)  # 'Category' remains 'category'
