## Initial Modeling

In [1]:
import pandas as pd

In [53]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
%matplotlib inline

In [2]:
# Read the cleaned dataset from the Data_and_Cleaning folder (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer='Data_and_Cleaning/cleaned_df.csv')

In [3]:
# Preview the first five rows to check which columns came in from the CSV.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [6]:
# 'Unnamed: 0' is a leftover column from the CSV whose values mirror the row
# index in the preview above, so it carries no information and is dropped.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
# (`columns=` already selects the axis, so no separate `axis=1` is needed.)
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Display the frame (pandas truncates the middle rows) to confirm the column was dropped.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


## Feature Engineering

In [10]:
# Total number of past-due events: the 30-59 day, 60-89 day and 90+ day
# counters added together, row by row.
# NOTE(review): a few rows carry 98 in all three counters (total 294); that
# looks like a sentinel code rather than a real count — confirm in the
# cleaning step before trusting this feature for those rows.
df['Total_PastDues'] = (
    df['NumberOfTime30-59DaysPastDueNotWorse']
    .add(df['NumberOfTime60-89DaysPastDueNotWorse'])
    .add(df['NumberOfTimes90DaysLate'])
)

In [11]:
# Display the frame to check the new Total_PastDues column.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0,0
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0


In [13]:
# Interaction feature: revolving utilization multiplied by the borrower's age.
df['Util_by_Age'] = df['RevolvingUtilizationOfUnsecuredLines'].mul(df['age'])

# Show the frame with the new column.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,7.014293
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0,0,19.287113
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,3.009866
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,13.188787
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,0.000000


In [15]:
# Debt per open credit line:
#   (DebtRatio * MonthlyIncome) / NumberOfOpenCreditLinesAndLoans
#
# Some rows have zero open lines. Dividing by zero there yields NaN only when
# the numerator is also zero; a non-zero numerator yields +/-inf, which
# dropna() does not remove and which scikit-learn estimators refuse to fit on.
# Masking zero denominators to NaN first makes every zero-line row come out as
# NaN, so it is handled like any other missing value.
open_lines = df['NumberOfOpenCreditLinesAndLoans']
df['Debt_per_Line'] = df['DebtRatio'] * df['MonthlyIncome'] / open_lines.where(open_lines != 0)

# Show the frame with the new column.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697,563.322847
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041,79.219531
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845,129.457443
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,7.014293,23.792790
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731,226.425013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0,0,19.287113,196.370815
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,3.009866,118.193717
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,13.188787,1000.320859
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,0.000000,0.000000


In [18]:
# Scratch check of the denominator used by the per-individual features below:
# dependents + 1 (presumably the borrower plus dependents), so it is never zero
# when NumberOfDependents is non-negative.
df['NumberOfDependents'].add(1)

0         3.0
1         2.0
2         1.0
3         1.0
4         1.0
         ... 
120260    1.0
120261    1.0
120262    3.0
120263    1.0
120264    1.0
Name: NumberOfDependents, Length: 120265, dtype: float64

In [22]:
# Debt level: debt ratio multiplied by monthly income.
df['DebtLevel'] = df['DebtRatio'].mul(df['MonthlyIncome'])

# Show the frame with the new column.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line,DebtLevel
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697,563.322847,7323.197016
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041,79.219531,316.878123
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845,129.457443,258.914887
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,7.014293,23.792790,118.963951
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731,226.425013,1584.975094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0,0,19.287113,196.370815,1374.595707
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,3.009866,118.193717,472.774869
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,13.188787,1000.320859,4001.283436
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,0.000000,0.000000,0.000000


In [24]:
# Debt per individual: (DebtRatio * MonthlyIncome) / (NumberOfDependents + 1).
# The +1 keeps the denominator non-zero for rows with no dependents.
df['Debt_per_Individual'] = (
    (df['DebtRatio'] * df['MonthlyIncome'])
    .div(df['NumberOfDependents'] + 1)
)

# Show the frame with the new column.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line,DebtLevel,Debt_per_Individual
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697,563.322847,7323.197016,2441.065672
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041,79.219531,316.878123,158.439061
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845,129.457443,258.914887,258.914887
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,7.014293,23.792790,118.963951,118.963951
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731,226.425013,1584.975094,1584.975094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0,0,19.287113,196.370815,1374.595707,1374.595707
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,3.009866,118.193717,472.774869,472.774869
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,13.188787,1000.320859,4001.283436,1333.761145
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000


In [25]:
# Count missing values per column after the engineered features so far.
df.isna().sum(axis=0)

SeriousDlqin2yrs                          0
RevolvingUtilizationOfUnsecuredLines      0
age                                       0
NumberOfTime30-59DaysPastDueNotWorse      0
DebtRatio                                 0
MonthlyIncome                             0
NumberOfOpenCreditLinesAndLoans           0
NumberOfTimes90DaysLate                   0
NumberRealEstateLoansOrLines              0
NumberOfTime60-89DaysPastDueNotWorse      0
NumberOfDependents                        0
Total_PastDues                            0
Util_by_Age                               0
Debt_per_Line                           679
DebtLevel                                 0
Debt_per_Individual                       0
dtype: int64

In [28]:
# Monthly income per individual: MonthlyIncome / (NumberOfDependents + 1).
df['MonthlyIncome_per_Individual'] = df['MonthlyIncome'].div(df['NumberOfDependents'] + 1)

# Show the frame with the new column.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line,DebtLevel,Debt_per_Individual,MonthlyIncome_per_Individual
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697,563.322847,7323.197016,2441.065672,3040.000000
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041,79.219531,316.878123,158.439061,1300.000000
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845,129.457443,258.914887,258.914887,3042.000000
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,7.014293,23.792790,118.963951,118.963951,3300.000000
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731,226.425013,1584.975094,1584.975094,63588.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0,0,19.287113,196.370815,1374.595707,1374.595707,3400.000000
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,3.009866,118.193717,472.774869,472.774869,2100.000000
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,13.188787,1000.320859,4001.283436,1333.761145,1861.333333
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000,5716.000000


In [29]:
# Re-count missing values per column now that MonthlyIncome_per_Individual exists.
df.isna().sum(axis=0)

SeriousDlqin2yrs                          0
RevolvingUtilizationOfUnsecuredLines      0
age                                       0
NumberOfTime30-59DaysPastDueNotWorse      0
DebtRatio                                 0
MonthlyIncome                             0
NumberOfOpenCreditLinesAndLoans           0
NumberOfTimes90DaysLate                   0
NumberRealEstateLoansOrLines              0
NumberOfTime60-89DaysPastDueNotWorse      0
NumberOfDependents                        0
Total_PastDues                            0
Util_by_Age                               0
Debt_per_Line                           679
DebtLevel                                 0
Debt_per_Individual                       0
MonthlyIncome_per_Individual              0
dtype: int64

In [30]:
# Preview the first five rows with all engineered columns.
df.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line,DebtLevel,Debt_per_Individual,MonthlyIncome_per_Individual
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697,563.322847,7323.197016,2441.065672,3040.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041,79.219531,316.878123,158.439061,1300.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845,129.457443,258.914887,258.914887,3042.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,0,7.014293,23.79279,118.963951,118.963951,3300.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731,226.425013,1584.975094,1584.975094,63588.0


In [34]:
# Select every row that has at least one missing value in any column.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line,DebtLevel,Debt_per_Individual,MonthlyIncome_per_Individual
330,0,1.0,60,0,0.0,4200.0,0,0,0,0,0.0,0,59.999994,,0.0,0.0,4200.000000
746,0,1.0,34,1,0.0,5400.0,0,1,0,0,0.0,2,33.999997,,0.0,0.0,5400.000000
957,0,1.0,82,0,0.0,1400.0,0,0,0,0,0.0,0,81.999992,,0.0,0.0,1400.000000
1402,1,1.0,27,98,0.0,2700.0,0,98,0,98,0.0,294,26.999997,,0.0,0.0,2700.000000
1481,0,1.0,31,0,0.0,1200.0,0,0,0,0,0.0,0,30.999997,,0.0,0.0,1200.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119378,0,1.0,42,0,0.0,2400.0,0,0,0,0,0.0,0,41.999996,,0.0,0.0,2400.000000
119603,0,1.0,82,0,0.0,439.0,0,0,0,0,0.0,0,81.999992,,0.0,0.0,439.000000
119663,0,1.0,26,98,0.0,2000.0,0,98,0,98,0.0,294,25.999997,,0.0,0.0,2000.000000
119861,1,1.0,27,0,0.0,3258.0,0,0,0,0,0.0,0,26.999997,,0.0,0.0,3258.000000


In [37]:
# Drop every row that contains at least one missing value.
# The original index labels are kept (no reset), so gaps in the index are expected.
df = df.dropna(axis=0, how='any')

In [38]:
# Display the frame after the rows with missing values were dropped.
df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line,DebtLevel,Debt_per_Individual,MonthlyIncome_per_Individual
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697,563.322847,7323.197016,2441.065672,3040.000000
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041,79.219531,316.878123,158.439061,1300.000000
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845,129.457443,258.914887,258.914887,3042.000000
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,7.014293,23.792790,118.963951,118.963951,3300.000000
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731,226.425013,1584.975094,1584.975094,63588.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120260,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0,0,19.287113,196.370815,1374.595707,1374.595707,3400.000000
120261,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,3.009866,118.193717,472.774869,472.774869,2100.000000
120262,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,13.188787,1000.320859,4001.283436,1333.761145,1861.333333
120263,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000,5716.000000


In [39]:
# Confirm that no column has missing values left after dropna().
df.isna().sum(axis=0)

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
Total_PastDues                          0
Util_by_Age                             0
Debt_per_Line                           0
DebtLevel                               0
Debt_per_Individual                     0
MonthlyIncome_per_Individual            0
dtype: int64

## Modeling

In [41]:
# Look at the target column on its own.
df.loc[:, 'SeriousDlqin2yrs']

0         1
1         0
2         0
3         0
4         0
         ..
120260    0
120261    0
120262    0
120263    0
120264    0
Name: SeriousDlqin2yrs, Length: 119586, dtype: int64

In [42]:
# Class balance of the target: number of rows per label.
df.loc[:, 'SeriousDlqin2yrs'].value_counts()

0    111410
1      8176
Name: SeriousDlqin2yrs, dtype: int64

In [44]:
# Target vector: the SeriousDlqin2yrs label.
y = df.loc[:, 'SeriousDlqin2yrs']

In [51]:
# Feature matrix: every column except the target.
X = df.drop(columns=['SeriousDlqin2yrs'])

In [52]:
# Preview the first five feature rows to confirm the target is no longer included.
X.head(n=5)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Total_PastDues,Util_by_Age,Debt_per_Line,DebtLevel,Debt_per_Individual,MonthlyIncome_per_Individual
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,2,34.475697,563.322847,7323.197016,2441.065672,3040.0
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,38.286041,79.219531,316.878123,158.439061,1300.0
2,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,2,25.010845,129.457443,258.914887,258.914887,3042.0
3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,0,7.014293,23.79279,118.963951,118.963951,3300.0
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1,44.454731,226.425013,1584.975094,1584.975094,63588.0


In [54]:
# Hold out a test set. test_size=0.25 is scikit-learn's default, spelled out
# here; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the target is imbalanced (8176 positives out of 119586 rows in
# the value_counts output); consider stratify=y to keep the class ratio equal
# in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=55,
)

In [None]:
# Baseline logistic regression with the liblinear solver and no intercept term.
# C is the inverse regularization strength, so C=1e12 makes the penalty negligible.
logreg = LogisticRegression(solver='liblinear', C=1e12, fit_intercept=False)

# Train on the training split; fit() returns the estimator, which the cell displays.
logreg.fit(X_train, y_train)

In [None]:
# Same liblinear model, but class_weight='balanced' reweights the classes
# inversely to their frequency in y_train to compensate for the rare positive class.
logreg = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
    class_weight='balanced',
)

# Train on the training split; fit() returns the estimator, which the cell displays.
logreg.fit(X_train, y_train)

In [None]:
# Same model specification, fitted with the stochastic 'sag' solver.
# random_state pins the data shuffling that sag performs, so refitting gives
# the same coefficients (55 is the seed already used for the train/test split).
# NOTE(review): scikit-learn documents that sag/saga only converge quickly when
# features are on a similar scale, and X_train is unscaled here. Consider
# fitting the already-imported StandardScaler on X_train (and applying it to
# X_test) before relying on this model — TODO confirm.
logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='sag', random_state=55)

# Fit the model
logreg.fit(X_train, y_train)

In [None]:
# Same model specification, fitted with the stochastic 'saga' solver.
# random_state pins the data shuffling that saga performs, so refitting gives
# the same coefficients (55 is the seed already used for the train/test split).
# NOTE(review): scikit-learn documents that sag/saga only converge quickly when
# features are on a similar scale, and X_train is unscaled here. Consider
# fitting the already-imported StandardScaler on X_train (and applying it to
# X_test) before relying on this model — TODO confirm.
logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='saga', random_state=55)

# Fit the model
logreg.fit(X_train, y_train)