### Data overview

In [None]:
# importing library
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [None]:
# loading data
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated paths were only valid on Windows.
df_application = pd.read_csv('Dataset 1/application_record.csv')
df_credit = pd.read_csv('Dataset 1/credit_record.csv')

In [None]:
df_application.sample(5)

In [None]:
df_credit.sample(5)

In [None]:
df_application.info()

In [None]:
df_credit.info()

In [None]:
df_application.shape

In [None]:
df_credit.shape

In [None]:
# Merging data: an inner join on ID keeps only the IDs that are present in
# both the application table and the credit-record table.
df = pd.merge(df_application, df_credit, on='ID', how='inner')

In [None]:
df

In [None]:
# Report the (rows, columns) of both inputs and of the merged frame.
shape_report = (
    ('Application Record data shape: ', df_application),
    ('Credit Record data shape: ', df_credit),
    ('Merged data shape: ', df),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

In [None]:
df.info()

In [None]:
# checking duplicate value
df.duplicated().sum()

In [None]:
# checking null value
df.isnull().sum()

In [None]:
df.isnull().sum()/df.shape[0]*100

In [None]:
df["OCCUPATION_TYPE"].value_counts()

In [None]:
df["OCCUPATION_TYPE"].unique()

In [None]:
# Dropping null values: removes, in place, every row of df that has a missing
# value in any column.
# NOTE(review): this mutates df, so cells above that displayed null counts
# show stale output once this cell has run.
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
# checking data after dropping null value
df.info()

In [None]:
df.describe().T.style.background_gradient(cmap='YlOrRd')

# EDA

In [None]:
# correlation matrix
# Pairwise correlations of all non-object columns, drawn as an annotated heatmap.
numeric_corr = df.select_dtypes(exclude='object').corr()
plt.figure(figsize=(25, 8))
sns.heatmap(numeric_corr, annot=True)

In [None]:
df["CODE_GENDER"].value_counts()

In [None]:
Categorical=df.select_dtypes(include='object').columns.tolist()
Categorical

In [None]:
df["FLAG_OWN_CAR"].value_counts()

In [None]:
# df["FLAG_OWN_CAR"].value_counts()

In [None]:
# Three side-by-side pie charts: gender, car ownership and realty ownership.
colors = ['#ff9999', '#66b3ff']
fig, axes = plt.subplots(1, 3)
fig.set_size_inches(14, 5)

# Settings shared by all three pies; explode pulls the second wedge outwards,
# so each column is expected to have exactly two categories.
pie_style = dict(explode=[0, 0.1], colors=colors, shadow=True)

g1 = df['CODE_GENDER'].value_counts().plot.pie(ax=axes[0], **pie_style)
g1.set_title("Customer Distribution by Gender")

g2 = df['FLAG_OWN_CAR'].value_counts().plot.pie(ax=axes[1], **pie_style)
g2.set_title("Car Ownership")

g3 = df['FLAG_OWN_REALTY'].value_counts().plot.pie(ax=axes[2], **pie_style)
g3.set_title("Realty Ownership")

plt.tight_layout()
plt.show()

In [None]:
# Female applicants who do not own a car and whose income exceeds 50,000.
# (plotly.express is already imported as px in the imports cell, so it is not
# re-imported here.)
# NOTE(review): DAYS_BIRTH is plotted as-is on the y-axis although the title
# says "AGE" — confirm that the raw day count is the intended measure.
female_no_car_mask = (
    (df["CODE_GENDER"] == "F")
    & (df["FLAG_OWN_CAR"] != "Y")
    & (df["AMT_INCOME_TOTAL"] > 50000)
)
df_ge = df[female_no_car_mask]

fig = px.scatter(df_ge, x="AMT_INCOME_TOTAL", y="DAYS_BIRTH", color="DAYS_BIRTH", size="AMT_INCOME_TOTAL",
                 hover_data=["FLAG_OWN_CAR", "CODE_GENDER"], size_max=30, title="FEMALE INCOME vs. AGE")
fig.show()

In [None]:
# Income vs. family size for female applicants without a car, income above
# 50,000 and more than one family member.
# The subset is rebuilt from df so this cell does not depend on whatever df_ge
# happened to hold after an earlier cell, and the title names the group so the
# chart cannot be confused with its male counterpart.
df_ge = df[
    (df["CODE_GENDER"] == "F")
    & (df["FLAG_OWN_CAR"] != "Y")
    & (df["AMT_INCOME_TOTAL"] > 50000)
    & (df["CNT_FAM_MEMBERS"] > 1)
]

fig = px.scatter(df_ge, x="AMT_INCOME_TOTAL", y="CNT_FAM_MEMBERS", color="CNT_FAM_MEMBERS", size="AMT_INCOME_TOTAL",
                 hover_data=["CNT_FAM_MEMBERS", "CODE_GENDER"], size_max=50, title="FEMALE INCOME vs. FAMILY SIZE")
fig.show()

In [None]:
# Male applicants who do not own a car and whose income exceeds 50,000,
# selected with a single combined boolean mask.
male_no_car_mask = (
    (df["CODE_GENDER"] == "M")
    & (df["FLAG_OWN_CAR"] != "Y")
    & (df["AMT_INCOME_TOTAL"] > 50000)
)
df_ge = df[male_no_car_mask]

fig = px.scatter(
    df_ge,
    x="AMT_INCOME_TOTAL",
    y="DAYS_BIRTH",
    color="DAYS_BIRTH",
    size="AMT_INCOME_TOTAL",
    hover_data=["FLAG_OWN_CAR", "CODE_GENDER"],
    size_max=30,
    title="MALE INCOME vs. AGE",
)
fig.show()

In [None]:
# Income vs. family size for male applicants without a car, income above
# 50,000 and more than one family member.
# The subset is rebuilt from df so this cell does not depend on whatever df_ge
# happened to hold after an earlier cell, and the title names the group so the
# chart cannot be confused with its female counterpart.
df_ge = df[
    (df["CODE_GENDER"] == "M")
    & (df["FLAG_OWN_CAR"] != "Y")
    & (df["AMT_INCOME_TOTAL"] > 50000)
    & (df["CNT_FAM_MEMBERS"] > 1)
]

fig = px.scatter(df_ge, x="AMT_INCOME_TOTAL", y="CNT_FAM_MEMBERS", color="CNT_FAM_MEMBERS", size="AMT_INCOME_TOTAL",
                 hover_data=["CNT_FAM_MEMBERS", "CODE_GENDER"], size_max=50, title="MALE INCOME vs. FAMILY SIZE")
fig.show()

In [None]:
df["NAME_INCOME_TYPE"].value_counts()

In [None]:
# Bar chart of how many rows fall into each NAME_INCOME_TYPE category.
income_type_counts = df["NAME_INCOME_TYPE"].value_counts()
plt.bar(income_type_counts.index, income_type_counts.values, color='Purple')

# Title and axis labels
plt.title("Value Counts of NAME_INCOME_TYPE")
plt.xlabel("Income Type")
plt.ylabel("Count")

# Tilt the category names so they do not overlap
plt.xticks(rotation=45)

plt.show()

In [None]:
df["NAME_EDUCATION_TYPE"].value_counts()

In [None]:
# Bar chart of how many rows fall into each NAME_EDUCATION_TYPE category.
education_type_counts = df["NAME_EDUCATION_TYPE"].value_counts()
plt.bar(education_type_counts.index, education_type_counts.values, color='green')

# Title and axis labels
plt.title("Value Counts of NAME_EDUCATION_TYPE")
plt.xlabel("EDUCATION Type")
plt.ylabel("Count")

# Tilt the category names so they do not overlap
plt.xticks(rotation=30)

plt.show()

In [None]:
df["NAME_FAMILY_STATUS"].value_counts()

In [None]:
# Horizontal bar plot of the number of rows per family status.
family_status_counts = df["NAME_FAMILY_STATUS"].value_counts()

plt.figure(figsize=(10, 8))
sns.barplot(x=family_status_counts.values, y=family_status_counts.index, palette='rocket')

plt.title('Counts of Different Family Statuses')
plt.xlabel('Count')
plt.ylabel('Family Status')
plt.show()

In [None]:
df["OCCUPATION_TYPE"].value_counts()

In [None]:
# Horizontal bar plot of the number of rows per occupation type.
plt.figure(figsize=(10, 8))

occupation_counts = df["OCCUPATION_TYPE"].value_counts()
sns.barplot(x=occupation_counts.values, y=occupation_counts.index, palette='Set2')

In [None]:
df["STATUS"].value_counts()

In [None]:
# Horizontal bar plot of the number of rows per raw STATUS code.
plt.figure(figsize=(10, 8))

status_counts = df["STATUS"].value_counts()
sns.barplot(x=status_counts.values, y=status_counts.index, palette='rocket')

In [None]:
# Collapse the raw STATUS codes into a binary label:
# "C", "X", "0", "1" become "Good_Debt"; "2", "3", "4", "5" become "Bad_Debt".
# The result is assigned back to the column. The previous form called
# replace(..., inplace=True) on the df['STATUS'] selection, which is chained
# assignment: pandas warns about it and, with Copy-on-Write enabled, it leaves
# df unchanged.
# Values outside the mapping are left as they are, so re-running the cell is
# harmless.
status_labels = {
    "C": "Good_Debt",
    "X": "Good_Debt",
    "0": "Good_Debt",
    "1": "Good_Debt",
    "2": "Bad_Debt",
    "3": "Bad_Debt",
    "4": "Bad_Debt",
    "5": "Bad_Debt",
}
df['STATUS'] = df['STATUS'].replace(status_labels)

In [None]:
df["STATUS"].value_counts()

In [None]:
sns.countplot (x='STATUS',data=df)

**The target classes are imbalanced.**

In [None]:
Numerical=df.select_dtypes(exclude='object').columns.tolist()
Numerical

In [None]:
# For each numerical column, show its value counts as a small table:
# the whole table when it has fewer than 10 rows, otherwise the first 5 rows.
for column_name in Numerical:
    print(column_name, 'column\'s count values : ')
    counts = df[column_name].value_counts()
    value_count_dict = {'value': list(counts.index), 'count': list(counts.values)}
    value_count_df = pd.DataFrame(value_count_dict)
    rows_to_show = 10 if value_count_df.shape[0] < 10 else 5
    display(value_count_df.head(rows_to_show))

In [None]:
# Count plots of the three contact flags, side by side.
# countplot(x=...) puts the flag values on the x-axis and the counts on the
# y-axis, so the x-axis is labelled with the column name and the y-axis with
# "Count" (the x-axis used to be labelled "Count").
fig, axes = plt.subplots(1,3)

g1=sns.countplot(x=df.FLAG_PHONE,linewidth=1, ax=axes[0],palette="Set2")
g1.set_title("Phone status")
g1.set_xlabel("FLAG_PHONE")
g1.set_ylabel("Count")

g2=sns.countplot(x=df.FLAG_EMAIL,linewidth=1.2, ax=axes[1],palette="Set2")
g2.set_title("Email Status")
g2.set_xlabel("FLAG_EMAIL")
g2.set_ylabel("Count")

g3=sns.countplot(x=df.FLAG_WORK_PHONE,linewidth=1.2, ax=axes[2],palette="Set2")
g3.set_title("FLAG_WORK_PHONE Status")
g3.set_xlabel("FLAG_WORK_PHONE")
g3.set_ylabel("Count")

fig.set_size_inches(14,5)

plt.tight_layout()


plt.show()

In [None]:
df["CNT_CHILDREN"].value_counts()

In [None]:

sns.boxplot( x=df["CNT_CHILDREN"])

In [None]:
df["CNT_FAM_MEMBERS"].value_counts()

In [None]:
sns.boxplot( x=df["CNT_FAM_MEMBERS"],palette="husl")

In [None]:
df["AMT_INCOME_TOTAL"].agg(["min" , 'max' , 'mean' , "std"])

In [None]:
fig = px.scatter(df, x="AMT_INCOME_TOTAL")
fig.show()

In [None]:
df["DAYS_BIRTH"]

In [None]:
df["DAYS_BIRTH"].agg(["min" , 'max'])

In [None]:


# Re-express DAYS_BIRTH as a calendar date: each value is wrapped in a
# pd.offsets.Day and added to the fixed reference date 2024-01-01.
# NOTE(review): this overwrites the column, so the cell cannot be re-run on
# its own output. It also builds one offset object per row, which may be slow
# on a large frame. The resulting dtype is presumably object — confirm before
# vectorising, because a later cell selects columns by dtype.
df['DAYS_BIRTH'] = datetime.strptime("2024-01-01", "%Y-%m-%d") + df['DAYS_BIRTH'].apply(pd.offsets.Day)

In [None]:
df['DAYS_BIRTH']

In [None]:
# Histogram of the DAYS_BIRTH column with matplotlib's default binning.
# NOTE(review): the axis is labelled "Year"; if the column holds full dates at
# this point, the bins are over dates rather than whole years — confirm.
plt.hist(df['DAYS_BIRTH'])
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Distribution of Birth Years')
plt.show()

In [None]:
df["DAYS_EMPLOYED"]

In [None]:
df["DAYS_EMPLOYED"].agg(["min" , 'max'])

In [None]:

# Re-express DAYS_EMPLOYED as a calendar date: each value is wrapped in a
# pd.offsets.Day and added to the fixed reference date 2024-01-01.
# NOTE(review): this overwrites the column, so the cell cannot be re-run on
# its own output. It also builds one offset object per row, which may be slow
# on a large frame. The resulting dtype is presumably object — confirm before
# vectorising, because a later cell selects columns by dtype.
df['DAYS_EMPLOYED'] = datetime.strptime("2024-01-01", "%Y-%m-%d") + df['DAYS_EMPLOYED'].apply(pd.offsets.Day)

In [None]:
df['DAYS_EMPLOYED'] 

In [None]:
df["DAYS_EMPLOYED"].agg(["min" , 'max'])

In [None]:
# Histogram of the DAYS_EMPLOYED column with matplotlib's default binning.
# NOTE(review): the axis is labelled "Year"; if the column holds full dates at
# this point, the bins are over dates rather than whole years — confirm.
plt.hist(df['DAYS_EMPLOYED'])
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Distribution of Employed Years')
plt.show()

**Right skewness**

In [None]:
df["MONTHS_BALANCE"]

In [None]:
# Flip the sign of MONTHS_BALANCE by multiplying the column by -1.
# NOTE(review): not idempotent — running this cell a second time restores the
# original sign.
df["MONTHS_BALANCE"]=df["MONTHS_BALANCE"]*-1

In [None]:
df["MONTHS_BALANCE"]

In [None]:
df["MONTHS_BALANCE"].agg(["min" , 'max'])

In [None]:
# Histogram of MONTHS_BALANCE drawn through pandas' Series.hist.
# No grid argument is passed, so the grid setting is left at its default.
ax = df["MONTHS_BALANCE"].hist(
                        xlabelsize=10, # Change size of labels on the x-axis
                        ylabelsize=12, # Change size of labels on the y-axis
                        bins=20, # Number of bins
                        edgecolor='black', # Color of the border
                        color='orange', # Color of the bins
                        rwidth=0.8 # Relative bar width, leaving space between bins
                       )

# Add a bold title ('\n' breaks the title onto two lines)
ax.set_title('Distribution of \nthe months length',
             weight='bold') 

# Add label names
ax.set_xlabel('MONTHS_BALANCE')
ax.set_ylabel('Frequency')

# Show the plot
plt.show()

**Left skewness**

In [None]:
df.info()

# Preprocessing

The visualizations reveal several issues in the data:
*   some columns are imbalanced, such as STATUS and the email flag (FLAG_EMAIL)
*   some columns have outliers, such as AMT_INCOME_TOTAL, CNT_FAM_MEMBERS and CNT_CHILDREN
*   some columns are skewed, such as DAYS_EMPLOYED
*   the data needs to be normalized


**handling outliers**

In [None]:
# Select the columns where you want to remove outliers
columns_to_filter = ['AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS', 'CNT_CHILDREN']

# Reference points per column: the 5th and the 90th percentile
lower_bounds = df[columns_to_filter].quantile(0.05)
upper_bounds = df[columns_to_filter].quantile(0.90)

# Widen both reference points by 5% of the distance between them
lower_bound_limits = lower_bounds - 0.05 * (upper_bounds - lower_bounds)
upper_bound_limits = upper_bounds + 0.05 * (upper_bounds - lower_bounds)

# A row is an outlier when any selected column falls outside its limits.
is_outlier = (
    (df[columns_to_filter] < lower_bound_limits) | (df[columns_to_filter] > upper_bound_limits)
).any(axis=1)

# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[~is_outlier].copy()


In [None]:
df.info()

In [None]:
sns.boxplot( data=df[["AMT_INCOME_TOTAL"]],palette="husl")

In [None]:
sns.boxplot( data=df[["CNT_FAM_MEMBERS","CNT_CHILDREN"]],palette="husl")

#### label_encoding

In [None]:
# Integer-encode every object-dtype column of df (the STATUS target included,
# if it is still of object dtype at this point).
lst = list(df.select_dtypes('object').columns)

# Keep each fitted encoder, keyed by column name, so the integer codes can be
# mapped back to the original labels later, e.g.
# label_encoders['STATUS'].classes_ shows which integer stands for which label.
label_encoders = {}

for i in lst:
    lb = LabelEncoder()
    df[i] = lb.fit_transform(df[i])
    label_encoders[i] = lb

In [None]:
df=df.drop(["FLAG_PHONE"], axis=1)

In [None]:
# Features: every column of df except the ID key and the STATUS target.
x = df.drop(["ID",'STATUS'], axis=1)
# Target: the STATUS column.
y = df['STATUS']

### oversampling data

In [None]:


# Oversample the minority class of y with SMOTE; x holds the features and y
# the target.
# random_state pins the synthetic samples so that re-running the notebook
# reproduces the same balanced data.
# NOTE(review): this resamples the complete dataset. Any train/test split taken
# from X_balance / Y_balance will contain synthetic points derived from rows on
# the other side of the split, which inflates test scores.
smote = SMOTE(random_state=16)
X_balance, Y_balance = smote.fit_resample(x, y)


In [None]:

# Split the original (not oversampled) data first, stratified on y so both
# folds keep the real class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=16, stratify=y
)

# Oversample the training fold only. Resampling before the split would let
# synthetic points interpolated from test rows leak into training (and the
# reverse), so the test scores would no longer reflect unseen, real data.
x_train, y_train = SMOTE(random_state=16).fit_resample(x_train, y_train)


In [None]:
# Skewness of every column of df, computed with pandas' DataFrame.skew and
# sorted from the largest value to the smallest. This is measured on df itself,
# not on the train/test splits.
old_skew = df.skew().sort_values(ascending=False)
old_skew

#### handling skewness 

In [None]:
# Reduce skewness with a Yeo-Johnson power transform.
# The transformer is fitted on the training features only, and the test
# features are mapped with those same fitted parameters. Calling fit_transform
# on x_test would refit on the test data, so train and test would be
# transformed differently and test information would leak into preprocessing.
power = PowerTransformer(method='yeo-johnson')
x_train = power.fit_transform(x_train)
x_test = power.transform(x_test)

In [None]:
# normalizer = StandardScaler()
# x_train = normalizer.fit_transform(x_train)
# x_test = normalizer.fit_transform(x_test)

### Modeling & evaluation
* LogisticRegression
* RandomForestClassifier
* XGBClassifier


**LogisticRegression**

In [None]:
# Fit a default logistic regression and report accuracy (in percent) on both
# splits; the test-set predictions are kept for the report cells below.
log_model = LogisticRegression().fit(x_train, y_train)

train_accuracy_pct = log_model.score(x_train, y_train) * 100
test_accuracy_pct = log_model.score(x_test, y_test) * 100
print('Logistic Model  Train Accuracy : ', train_accuracy_pct, '%')
print('Logistic Model Test Accuracy : ', test_accuracy_pct, '%')

prediction = log_model.predict(x_test)
      


In [None]:
print('\nClassification report:')      
print(classification_report(y_test, prediction))



In [None]:

# Confusion matrix of the logistic regression predictions on the test split.
conf_matrix = confusion_matrix(y_test, prediction)

# Draw it as a small annotated heatmap without a colour bar
plt.figure(figsize=(2, 2))
sns.set_style('white')
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
heatmap_ax.set_title('Confusion Matrix: Logistic Regression')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

In [None]:
# ROC curve of the logistic regression model (log_model) on the test split.
# plt, roc_curve and auc come from the imports cell at the top of the notebook.

# Calculate ROC curve from the scores in the second predict_proba column
# (presumably the encoded Good_Debt class — confirm against the label encoding)
fpr, tpr, thresholds = roc_curve(y_test, log_model.predict_proba(x_test)[:, 1])

# Calculate AUC
roc_auc = auc(fpr, tpr)

# Plot ROC curve; the dashed diagonal is the chance line
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

**RandomForestClassifier**

In [None]:
# Fit a random forest (fixed seed) and report train accuracy, test accuracy
# and F1 on the test split; y_predict is kept for the report cells below.
model = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_predict = model.predict(x_test)

train_accuracy = model.score(x_train, y_train)
print("RandomForest Train Accuracy = ", train_accuracy)
print("RandomForest Test Accuracy = ", accuracy_score(y_test, y_predict))
print("F1 = ", f1_score(y_test, y_predict))



In [None]:
      
print('\nClassification report:')      
print(classification_report(y_test, y_predict))

In [None]:

# Confusion matrix of the random forest predictions (y_predict) on the test split.
conf_matrix = confusion_matrix(y_test, y_predict)

# Plot confusion matrix
plt.figure(figsize=(2, 2))
sns.set_style('white')
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Purples', cbar=False)
# The title previously read "Logistic Regression", a copy-paste leftover.
plt.title('Confusion Matrix: Random Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# ROC curve of the random forest model (model) on the test split.
# plt, roc_curve and auc come from the imports cell at the top of the notebook.

# Calculate ROC curve from the scores in the second predict_proba column
# (presumably the encoded Good_Debt class — confirm against the label encoding)
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(x_test)[:, 1])

# Calculate AUC
roc_auc = auc(fpr, tpr)

# Plot ROC curve; the dashed diagonal is the chance line
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


**XGBoost**

In [None]:
# Fit an XGBoost classifier and report train accuracy, test accuracy and F1.
model2 = XGBClassifier()
model2.fit(x_train, y_train)
print(" XGB Train Accuracy = " ,model2.score(x_train, y_train))
# Predict with model2. The previous code called model.predict, i.e. the random
# forest, so the "XGB" test metrics were really the random forest's.
y_predict = model2.predict(x_test)
print(" XGB Test Accuracy = " , accuracy_score(y_test, y_predict))
print("F1 = " , f1_score(y_test, y_predict))

      


In [None]:
print('\nClassification report:')      
print(classification_report(y_test, y_predict))

In [None]:

# Confusion matrix of the XGBoost model on the test split. It is computed here
# from model2's own predictions; the cell used to plot whatever conf_matrix an
# earlier cell had left behind, which belonged to a different model.
conf_matrix = confusion_matrix(y_test, model2.predict(x_test))

# Plot confusion matrix
plt.figure(figsize=(2,2))
sns.set_style('white')
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='PuBuGn_r', cbar=False)
# The title previously read "Logistic Regression", a copy-paste leftover.
plt.title('Confusion Matrix: XGBoost')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# ROC curve of the XGBoost model (model2) on the test split.
# plt, roc_curve and auc come from the imports cell at the top of the notebook.

# Calculate ROC curve from the scores in the second predict_proba column
# (presumably the encoded Good_Debt class — confirm against the label encoding)
fpr, tpr, thresholds = roc_curve(y_test, model2.predict_proba(x_test)[:, 1])

# Calculate AUC
roc_auc = auc(fpr, tpr)

# Plot ROC curve; the dashed diagonal is the chance line
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Feature importances of the random forest (model).
feat_importances = model.feature_importances_

# Label each importance with the matching feature name. x is the feature frame
# the training data was derived from, so x.columns is in the order the model
# saw. The previous index, df.columns[1:-1], only matched as long as ID was
# the first column of df and STATUS the last.
feat_importances_series = pd.Series(feat_importances, index=x.columns)

# Sort feature importances in descending order
feat_importances_series = feat_importances_series.sort_values(ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x=feat_importances_series.values, y=feat_importances_series.index, palette='viridis')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importance Plot')
plt.show()

In [None]:
# pip install pycaret


In [None]:
# from pycaret.regression import *
# from pycaret.classification import *