In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [5]:
# Read both raw case-study workbooks; a1 and a2 are kept as the untouched originals.
a1, a2 = (
    pd.read_excel(workbook)
    for workbook in ("data/case_study1.xlsx", "data/case_study2.xlsx")
)

In [6]:
# Work on copies so the frames loaded from disk (a1, a2) are never modified.
df1, df2 = a1.copy(), a2.copy()

In [7]:
# (rows, columns) of each working frame
tuple(frame.shape for frame in (df1, df2))

((51336, 26), (51336, 62))

## Removing Null Values

In these DataFrames, null values are represented by the sentinel value -99999.<br>
We apply the following rules when dealing with null values:
- This is very sensitive data, so we do not want to make any assumptions about the missing values or impute them with other values. Hence:
- If more than 20% of the values in a particular column are missing, we drop that column.
- If less than 20% of the values in a column are missing, we remove the affected rows from the dataset.

In [113]:
# Percentage of -99999 (null sentinel) entries per df1 column,
# restricted to columns that contain at least one, largest first.
sentinel_share = (df1 == -99999).mean()
100 * sentinel_share[sentinel_share > 0].sort_values(ascending=False)

Age_Oldest_TL    0.077918
Age_Newest_TL    0.077918
dtype: float64

In [114]:
# Percentage of -99999 (null sentinel) entries per df2 column;
# null_cols keeps only the columns where that percentage is non-zero.
sentinel_pct = (df2 == -99999).mean() * 100
null_cols = sentinel_pct[sentinel_pct > 0]
null_cols.sort_values(ascending=False)

CC_utilization                  92.792582
PL_utilization                  86.557192
time_since_recent_deliquency    70.026882
max_delinquency_level           70.026882
time_since_first_deliquency     70.026882
max_unsec_exposure_inPct        45.149603
max_deliq_6mts                  25.109085
max_deliq_12mts                 21.100203
time_since_recent_enq           12.312997
enq_L3m                         12.312997
enq_L6m                         12.312997
enq_L12m                        12.312997
PL_enq                          12.312997
PL_enq_L12m                     12.312997
PL_enq_L6m                      12.312997
CC_enq_L12m                     12.312997
CC_enq_L6m                      12.312997
CC_enq                          12.312997
tot_enq                         12.312997
time_since_recent_payment        8.358657
pct_currentBal_all_TL            0.140252
dtype: float64

In [115]:
# removing rows from first dataset:
# keep a row only when neither age column holds the -99999 null sentinel
age_cols = ["Age_Oldest_TL", "Age_Newest_TL"]
df1 = df1.loc[~df1[age_cols].eq(-99999).any(axis=1)]

In [116]:
# removing columns from second dataset
# Rule stated in the section intro: drop a column when more than 20% of its
# values are the -99999 null sentinel. The cut-off is expressed as a fraction of
# the rows so it matches that rule for any row count (the previous fixed count
# of 10000 corresponds to roughly 19.5% of the 51336 rows loaded here).
NULL_SENTINEL = -99999
MAX_NULL_FRACTION = 0.20

# Share of sentinel entries per column, computed in one vectorised pass.
null_fraction = (df2 == NULL_SENTINEL).mean()
columns_to_drop = null_fraction[null_fraction > MAX_NULL_FRACTION].index.tolist()

# Reassign instead of mutating in place so the cell has no hidden side effects.
df2 = df2.drop(columns=columns_to_drop)

# Last expression of the cell, so the dropped column names are displayed.
columns_to_drop

In [122]:
# removing null rows:
# discard every row that still has the -99999 sentinel in any remaining column
has_sentinel = (df2 == -99999).any(axis=1)
df2 = df2.loc[~has_sentinel]

In [125]:
# Total NaN cells in each frame. This counts real NaN/None only;
# the -99999 sentinel is an ordinary number and is not counted here.
tuple(frame.isnull().sum().sum() for frame in (df1, df2))

(0, 0)

## Merging the two datasets

In [127]:
# checking for common columns (candidate join keys shared by both frames)
col1, col2 = set(df1.columns), set(df2.columns)
col1 & col2

{'PROSPECTID'}

In [129]:
# merging datasets on "PROSPECTID":
# inner join, so only prospects present in both cleaned frames are kept
df = df1.merge(df2, how="inner", on="PROSPECTID")

In [130]:
# Display the merged frame; the notebook renders a truncated preview
# (first and last rows, middle columns elided).
df

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,5,4,1,0,0,0.000,0.00,0.200,0.800,...,0.0,0.0,0.000,0.0,1,0,PL,PL,696,P2
1,2,1,0,1,0,0,0.000,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,8,0,8,1,0,0.125,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,1,0,ConsumerLoan,others,693,P2
3,5,3,2,1,0,0,0.000,0.00,0.333,0.667,...,0.0,0.0,0.000,0.0,0,0,AL,AL,753,P1
4,6,6,5,1,0,0,0.000,0.00,0.167,0.833,...,1.0,0.0,0.429,0.0,1,0,ConsumerLoan,PL,668,P3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42059,51332,3,0,3,1,0,0.333,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,650,P4
42060,51333,4,2,2,0,1,0.000,0.25,0.500,0.500,...,0.0,0.0,0.000,0.0,0,0,others,others,702,P1
42061,51334,2,1,1,1,1,0.500,0.50,0.500,0.500,...,1.0,0.0,1.000,0.0,0,0,ConsumerLoan,others,661,P3
42062,51335,2,1,1,0,0,0.000,0.00,0.500,0.500,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,others,686,P2


In [None]:
# 1:24