In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc,
    ConfusionMatrixDisplay, RocCurveDisplay,
    precision_recall_curve,
    PrecisionRecallDisplay
)
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Load dataset.
# The CSV location defaults to the original local path but can be overridden
# with the LOANTAP_CSV environment variable, so the notebook also runs on
# machines where the file lives elsewhere.
DATA_PATH = os.environ.get(
    "LOANTAP_CSV",
    "D:\\LoanTap\\LoanTap\\artifacts\\data_ingestion\\LoanTap.csv",
)
data = pd.read_csv(DATA_PATH)
# Work on a copy so the raw frame stays available for reference.
df = data.copy()
df.head()

In [None]:
# Shape of the data as (number of rows, number of columns)
df.shape

In [None]:
## Basic info about the data: column names, non-null counts and dtypes
df.info()

In [None]:
# Basic summary statistics (count, mean, std, min, quartiles, max)
df.describe()

In [None]:
## Percentage of missing values in each column
df.isna().mean() * 100

About 9.5% of the values in the `mort_acc` column are missing. We will impute them later.

In [None]:
## EDA and feature engineering
# Split the columns into numerical and categorical groups for univariate analysis.
# Object-dtype columns are treated as categorical; everything else as numerical.

is_object_dtype = df.dtypes == object
numerical_columns = df.columns[~is_object_dtype].tolist()
categorical_columns = df.columns[is_object_dtype].tolist()



In [None]:
def plot_histogram_and_boxplot(df, columm):
    """Draw a histogram (with KDE overlay) and a boxplot of one column side by side.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    columm : name of the column to visualise (used as the x variable in both panels).

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 3))

    # Left panel: distribution of the values.
    sns.histplot(data=df, x=columm, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {columm}")

    # Right panel: spread and outliers of the same values.
    sns.boxplot(data=df, x=columm, ax=box_ax)
    box_ax.set_title(f"Boxplot of {columm}")

    plt.show()


In [None]:
# Univariate analysis: one histogram + boxplot figure per numerical column
for col in numerical_columns:
    plot_histogram_and_boxplot(df, col)

In [None]:
## Even though mort_acc and pub_rec_bankruptcies have a numerical dtype, they are
## treated as categorical features here, so add them to the categorical list.
## The membership check keeps this cell idempotent: re-running it does not
## append the same column names a second time.

for extra_col in ['mort_acc', 'pub_rec_bankruptcies']:
    if extra_col not in categorical_columns:
        categorical_columns.append(extra_col)

In [None]:
# Number of distinct categories in each categorical feature
for col, n_categories in df[categorical_columns].nunique().items():
    print(f"{col}-->{n_categories}")

In [None]:
# Compare the distinct values of `purpose` with those of `title`
print(df['purpose'].unique())
print(df['title'].unique())

`purpose` and `title` carry the same information.
Let's drop the `title` column, as it has many more categories, possibly due to manual typing errors.

In [None]:
# `title` duplicates `purpose`, so remove it from the working frame
df = df.drop('title', axis=1)

In [None]:
# Explicit list of the categorical features to plot against the target.
# This replaces the list built earlier from the column dtypes.
categorical_columns = [
    'term',
    'grade',
    'sub_grade',
    'emp_length',
    'home_ownership',
    'verification_status',
    'purpose',
    'initial_list_status',
    'application_type',
    'mort_acc',
    'pub_rec_bankruptcies',
]

In [None]:
## Univariate counts and counts split by the target
def plot_countplot(df, col, target):
    """Draw two percentage count plots of a categorical column side by side.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    col : name of the categorical column, shown on the y axis.
    target : name of the column used as ``hue`` in the right-hand panel.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    _, (overall_ax, by_target_ax) = plt.subplots(1, 2, figsize=(10, 3))

    # Left panel: overall share of each category.
    sns.countplot(data=df, y=col, stat='percent', ax=overall_ax)
    overall_ax.set_title(f"count plot of {col}")

    # Right panel: the same counts broken down by the target.
    sns.countplot(data=df, y=col, hue=target, stat='percent', ax=by_target_ax)
    by_target_ax.set_title(f"count plot of {col} w.r.t {target} ")

    plt.show()

    


In [None]:
# One pair of count plots per categorical feature, with 'loan_status' as the target
for col in categorical_columns:
    plot_countplot(df, col, 'loan_status')

In [None]:
## Bivariate analysis between numerical features and the target
def plot_kdeplot(df, col, target):
    """Draw kernel density estimates of a numerical column, one curve per target value.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    col : name of the numerical column, shown on the x axis.
    target : name of the column used as ``hue``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    _, density_ax = plt.subplots(figsize=(10, 3))
    sns.kdeplot(data=df, x=col, hue=target, ax=density_ax)
    density_ax.set_title(f"kde plot of {col} w.r.t {target}")
    plt.show()

In [None]:
# One KDE figure per numerical column, split by 'loan_status'
for col in numerical_columns:
    plot_kdeplot(df,col,'loan_status')

In [None]:
### Impute missing values in mort_acc
# Each missing value is replaced by the median mort_acc of the rows that share
# the same total_acc value.
median_mort_acc_by_total_acc = df.groupby('total_acc')['mort_acc'].transform('median')
df['mort_acc'] = df['mort_acc'].fillna(median_mort_acc_by_total_acc)


In [None]:
# Re-check the percentage of missing values per column after the imputation
df.isna().mean() * 100

In [None]:
# Frequency of each distinct emp_length value
df['emp_length'].value_counts()

In [None]:
## Drop the emp_title column, then drop every row that still contains a null value
df = df.drop(columns=['emp_title']).dropna()
# Confirm that no missing values remain
df.isna().mean() * 100


In [None]:
# Keep the last five characters of each address as the zip code
df['zip_code'] = df['address'].str[-5:]
df['zip_code']

In [None]:
# Take the two characters that sit just before the separator preceding the zip code.
# NOTE(review): this looks like a state abbreviation rather than a city — confirm.
df['city_code'] = df['address'].str[-8:-6]
df['city_code']

In [None]:
# The raw address is no longer needed once zip_code and city_code are extracted
df=df.drop(columns=['address'])
df.info()

In [None]:
# Drop the issue_d and earliest_cr_line columns; they are not used as features below
df=df.drop(columns=['issue_d', 'earliest_cr_line'])
df.head()

In [None]:
# Rebuild the categorical list from the object-dtype columns of the cleaned frame
categorical_columns = df.columns[df.dtypes == object].tolist()
# Number of distinct categories in each feature
for col, n_categories in df[categorical_columns].nunique().items():
    print(f"{col}-->{n_categories}")


In [None]:
# Chi-square test of independence between each categorical feature and the target.
# `chi2_contingency` is imported in the top import cell.
for col in categorical_columns:
    if col == 'loan_status':
        # Testing the target against itself carries no information.
        continue
    contingency_table = pd.crosstab(df[col], df['loan_status'])
    res = chi2_contingency(contingency_table.values)
    print(f"{col}: {res.pvalue}")
     


In [None]:
# Remove the city_code column because its chi-square p-value is > 0.05
df = df.drop(columns=['city_code'])

df.head()

In [None]:
# Keep the categorical list in sync with the frame. The membership check makes
# the cell safe to re-run (a bare list.remove raises ValueError the second time).
if 'city_code' in categorical_columns:
    categorical_columns.remove('city_code')
categorical_columns

In [None]:
# Strip leading/trailing whitespace from the term labels
df['term']=df['term'].str.strip()


In [None]:
# For each categorical column, print its distinct values and how many there are
for col in categorical_columns:
    print(f"{col}--> {df[col].unique()}--> {df[col].nunique()}")



In [None]:
## Encode grade and sub_grade with a label encoder because these features are ordinal;
## the remaining features are one-hot encoded.
# LabelEncoder assigns integer codes following the sorted order of the distinct values.
# NOTE: despite its generic name, `label_encoded_features` holds only the grade codes.
label_encoder_grade=LabelEncoder()
label_encoded_features=label_encoder_grade.fit_transform(df['grade'].values)

In [None]:
# Integer-encode sub_grade; codes follow the sorted order of the distinct values
label_encoder_subgrade=LabelEncoder()
label_encoded_subgrade=label_encoder_subgrade.fit_transform(df['sub_grade'].values)


In [None]:
# Integer-encode emp_length so that the codes follow tenure.
# A plain LabelEncoder.fit sorts the labels lexicographically (e.g. '10+ years'
# would come right after '1 year' and '< 1 year' would come last), which breaks
# the ordinal meaning of the codes. The labels are therefore ordered by the
# number of years they contain, with a '<' label placed before the plain label
# of the same number.
def _emp_length_sort_key(label):
    """Return a sort key that orders emp_length labels by tenure."""
    text = str(label).strip()
    digits = ''.join(ch for ch in text if ch.isdigit())
    # Labels without any digit are placed first.
    years = int(digits) if digits else -1
    return (years, not text.startswith('<'))


emp_length_order = sorted(df['emp_length'].unique(), key=_emp_length_sort_key)
emp_length_rank = {label: rank for rank, label in enumerate(emp_length_order)}

# Keep a LabelEncoder whose classes_ follow the tenure order, so that
# le_emp_length.inverse_transform maps the codes back to the right labels.
le_emp_length = LabelEncoder()
le_emp_length.classes_ = np.array(emp_length_order, dtype=object)

label_encoded_emp_length = np.array(
    [emp_length_rank[label] for label in df['emp_length'].values],
    dtype='int64',
)

In [None]:
# Shape of the emp_length array, i.e. the number of rows being encoded
df['emp_length'].values.shape

In [None]:
# One-hot encode the nominal features.
# `drop` names one category per input column to leave out, listed in the same
# order as the columns passed to fit_transform below.
# .toarray() converts the encoder output to a dense NumPy array.
ohe=OneHotEncoder(drop=['36 months','RENT','Not Verified','vacation','w','INDIVIDUAL','22690'])
ohe_encoded_features = ohe.fit_transform(df[['term','home_ownership','verification_status','purpose','initial_list_status','application_type','zip_code']]).toarray()

In [None]:
# Inspect the one-hot vector of the second row
ohe_encoded_features[1]

In [None]:
# Sanity check: decode a one-hot vector back into its original category labels.
# NOTE(review): this hard-coded 33-element vector looks like a pasted copy of a row of
# `ohe_encoded_features`; it fails if the number of encoded columns changes — confirm.
ohe.inverse_transform(np.array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]).reshape(1,-1))

In [None]:
# Names of the generated one-hot columns
ohe.get_feature_names_out()

In [None]:
# Wrap the one-hot array in a DataFrame with readable column names.
# The frame gets a default 0..n-1 index.
ohe_data=pd.DataFrame(ohe_encoded_features, columns=ohe.get_feature_names_out())
ohe_data.head()

In [None]:
# Collect the three label-encoded features in one DataFrame.
# The encoded arrays are already one-dimensional, so they are passed as they are.
le_data = pd.DataFrame(
    {
        'le_grade': label_encoded_features,
        'le_subgrade': label_encoded_subgrade,
        'le_emp_length': label_encoded_emp_length,
    }
)
le_data.head()



In [None]:
# Give df a fresh 0..n-1 index so its rows line up with ohe_data and le_data.
# drop=True discards the old index instead of inserting it as an 'index'
# column, which would otherwise end up in the feature matrix. Assigning the
# result (rather than inplace=True) also keeps the cell safe to re-run.
df = df.reset_index(drop=True)
final_data = pd.concat([df, ohe_data, le_data], axis=1)
final_data.shape

In [None]:
# Features: everything except the raw categorical columns (their encoded versions stay).
# NOTE(review): categorical_columns is built from the object-dtype columns, so it
# presumably contains 'loan_status' and the target is dropped from X here — confirm.
X= final_data.drop(columns=categorical_columns)
# Target kept as a one-column DataFrame
y = final_data[['loan_status']]

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# Binary target: 1 for 'Charged Off', 0 for every other status.
# It is derived from final_data instead of modifying y in place, which avoids
# assigning into a slice (SettingWithCopyWarning) and keeps the cell idempotent:
# re-running an in-place mapping would turn every label into 0.
y = (final_data[['loan_status']] == 'Charged Off').astype('int64')

In [None]:
# Stratified 80/20 split. A fixed random_state makes the split, and therefore
# every metric computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Fit the scaler on the training split only, then scale the training features
sc = StandardScaler()
x_train_scaled=sc.fit_transform(x_train)


In [None]:
# Fit a logistic regression on the scaled training data.
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D array
# that fit expects, which avoids scikit-learn's DataConversionWarning.
lr = LogisticRegression()
lr.fit(x_train_scaled, np.ravel(y_train))

In [None]:
# Scale the test features with the scaler fitted on the training split
x_test_scaled=sc.transform(x_test)


In [None]:
# Predict on the scaled test features and compute the accuracy
y_pred=lr.predict(x_test_scaled)
acc=accuracy_score(y_test,y_pred)

In [None]:
# Display the test-set accuracy
acc