#### Testing the scripts

In [1]:
import warnings

import numpy as np
import pandas as pd

# Render every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Suppress all warnings for the remainder of the session.
warnings.simplefilter('ignore')

In [5]:
import os
from pathlib import Path

# Location of the raw credit-eligibility CSV.
# The original hard-coded path stays as the default so existing runs are
# unchanged; set the CREDIT_DATA_PATH environment variable to load the file
# from another location without editing the notebook.
DATA_PATH = Path(os.environ.get(
    "CREDIT_DATA_PATH",
    r"C:\Users\hp\OneDrive\Documents\GitHub\credit_line_eligibility\data\credit_eligibility.csv",
))
df0 = pd.read_csv(DATA_PATH)



In [6]:
# Value conversions: map the textual categories of each column to numbers.
# Values that are not listed in a mapping are left as they are.
value_maps = {
    'emp_length': {'10+ years': 10, '9 years': 9, '8 years': 8, '7 years': 7,
                   '6 years': 6, '5 years': 5, '4 years': 4, '3 years': 3,
                   '2 years': 2, '1 year': 1, '< 1 year': 0},
    'term': {' 36 months': 36, ' 60 months': 60},
    'loan_status': {'Fully Paid': 1, 'Charged Off': 0},
}
for column, mapping in value_maps.items():
    df0[column] = df0[column].replace(mapping)
print("Value conversions done")

# Null value handling: fill missing mort_acc values with the column median,
# then discard every row that still contains a null in any column.
if "mort_acc" in df0.columns and df0["mort_acc"].isna().any():
    df0["mort_acc"] = df0["mort_acc"].fillna(df0["mort_acc"].median())
df0 = df0.dropna()
print("Null value handling done")
print(df0.shape)


Value conversions done
Null value handling done
(370621, 27)


In [7]:
# Outlier handling: the steps below modify df1, an independent copy of df0.
df1 = df0.copy(deep=True)

def iqr_limits(series, multiplier=1.5):
    """
    Compute the IQR-based outlier fences of a column.

    Parameters:
    - series: Pandas Series whose 25th and 75th percentiles are used.
    - multiplier: Number of IQRs the fences lie beyond the quartiles (default: 1.5).

    Returns:
    - (lower_limit, upper_limit): Q1 - multiplier * IQR and Q3 + multiplier * IQR.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    interquartile_range = third_quartile - first_quartile
    return (
        first_quartile - multiplier * interquartile_range,
        third_quartile + multiplier * interquartile_range,
    )

# 1. IQR method: keep only the rows that lie inside the IQR fences of
#    every column listed in `cols`.
cols = ['loan_amnt', 'int_rate', 'installment']
mask = pd.Series(True, index=df1.index)

for col in cols:
    # Fences are computed on the same frame that is filtered. df1 has not
    # been modified yet at this point, so it still equals df0.
    ll, ul = iqr_limits(df1[col], multiplier=1.5)

    # between() is inclusive on both ends; a row stays True only while it is
    # inside the fences of every column processed so far.
    mask &= df1[col].between(ll, ul)

# Apply the mask to filter out rows with outliers. The explicit copy makes
# df1 an independent frame, so later column assignments do not write to a
# slice of the unfiltered data.
df1 = df1[mask].copy()

#2. Percentile trimming: rows outside the 1st-99th percentile range of a
#   column are removed (the values are not capped).
cols = ['open_acc', 'revol_util', 'total_acc']

for col in cols:
    # The percentiles are taken from df0, i.e. from the data as it was
    # before the IQR filtering above was applied to df1.
    lower = df0[col].quantile(0.01)  # 1st percentile
    upper = df0[col].quantile(0.99)  # 99th percentile

    # between() is inclusive on both ends.
    df1 = df1[df1[col].between(lower, upper)]
    
#3. Yeo-Johnson transformation
from sklearn.preprocessing import PowerTransformer
cols = ['annual_inc', 'revol_bal', 'dti']
pt = PowerTransformer(method='yeo-johnson')

# df1 is the result of boolean filtering; take an explicit copy so the
# column assignments below write to an independent frame, not a slice.
df1 = df1.copy()

for col in cols:
    # The same transformer object is re-fitted for every column, so after
    # the loop `pt` holds only the parameters fitted for the last column.
    # NOTE(review): the fit uses all rows of df1, before the train/test split
    # done later in the notebook - confirm this is acceptable for evaluation.
    df1[col] = pt.fit_transform(df1[[col]])

print("Outlier handling done")
print(df1.shape)

# Dropping columns: remove the listed columns from df1.
columns_to_drop = ['initial_list_status', 'emp_title', 'title', 'earliest_cr_line',
                   'issue_d', 'grade', 'sub_grade', 'installment', 'pub_rec_bankruptcies',
                   'application_type', 'address']
df1 = df1.drop(columns=columns_to_drop)

# Remove the rows whose home_ownership is ANY, NONE or OTHER (looked up by
# index label, then dropped).
ownership_mask = df1['home_ownership'].isin(['ANY', 'NONE', 'OTHER'])
df1 = df1.drop(index=df1.loc[ownership_mask].index)
print("Columns dropped")

# Frequency encoding: replace every category by the number of rows in df1
# that carry it.
cols = ['purpose', 'home_ownership', 'verification_status']
for col in cols:
    category_counts = df1[col].value_counts()
    df1[col] = df1[col].map(category_counts)
print("Frequency encoding done")

print(df1.shape)
df1.head()


Outlier handling done
(340998, 27)
Columns dropped
Frequency encoding done
(340875, 16)


Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc
0,10000.0,36,11.44,10.0,139281,1.208276,116355,1,2064,1.085578,16.0,0.0,1.437651,41.8,25.0,0.0
1,8000.0,36,11.99,4.0,171026,0.06201,116355,1,204620,0.618678,17.0,0.0,0.683437,53.3,27.0,3.0
2,15600.0,36,10.49,0.0,139281,-0.797087,115772,1,72545,-0.519602,13.0,0.0,0.079503,92.2,26.0,0.0
3,7200.0,36,6.49,6.0,139281,-0.318819,116355,1,72545,-2.126816,6.0,0.0,-0.742378,21.5,13.0,0.0
4,24375.0,60,17.27,9.0,171026,-0.280703,108748,0,72545,1.89159,13.0,0.0,0.930183,69.8,43.0,1.0


In [26]:
# Data splitting
from sklearn.model_selection import train_test_split

# From here on df0 refers to a copy of the preprocessed frame df1.
df0 = df1.copy()

# Features are every column except the target; both get a fresh 0..n-1 index.
X = df0.drop(columns=['loan_status']).reset_index(drop=True)
y = df0['loan_status'].reset_index(drop=True)

# Split data before standardization (80/20, stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print("Data split done")

# Standardization
from sklearn.preprocessing import StandardScaler

# The scaling statistics are learned from the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Standardization done")

# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Oversample the minority class in the TRAINING split only.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# The test split must keep its real samples and class distribution.
# Resampling it would add synthetic rows and make evaluation metrics
# unrepresentative. The *_test_resampled names are kept for compatibility
# and now refer to the untouched, scaled test data.
X_test_resampled, y_test_resampled = X_test_scaled, y_test
print("SMOTE done")


Data split done
Standardization done
SMOTE done
