# IIMT2641 Assignment 4
## Sibo Ding
## Spring 2023

In [1]:
import pandas as pd
import statsmodels.api as sm

## Load the Data

In [2]:
# Read the loan dataset from the working directory.
loans = pd.read_csv("Loans.csv")

# Quick look at the raw table: leading rows, dimensions, and column labels.
print('First 5 rows:\n', loans.head(5))
print('\nNumber of observations and variables:', loans.shape)
print('\nNames of variables:', loans.columns)

# Store the target column as a categorical/factor variable.
loans = loans.astype({'NotFullyPaid': 'category'})

First 5 rows:
    CreditPolicy  Purpose.CC  Purpose.DC  Purpose.Edu  Purpose.MP  Purpose.SB  \
0             1           0           1            0           0           0   
1             1           1           0            0           0           0   
2             1           0           1            0           0           0   
3             1           0           1            0           0           0   
4             1           1           0            0           0           0   

   IntRate  Installment  LogAnnualInc    Dti  Fico  DaysWithCrLine  RevolBal  \
0   0.1189       829.10     11.350407  19.48   737     5639.958333     28854   
1   0.1071       228.22     11.082143  14.29   707     2760.000000     33623   
2   0.1357       366.86     10.373491  11.63   682     4710.000000      3511   
3   0.1008       162.34     11.350407   8.10   712     2699.958333     33667   
4   0.1426       102.92     11.299732  14.97   667     4066.000000      4740   

   RevolUtil  InqLast6m

## Train-test Split

In [3]:
# Randomly split the dataset: 70% in the training set, 30% in the test set.
from sklearn.model_selection import train_test_split

# Independent variables used by the models.
x = loans[['CreditPolicy', 'Purpose.CC', 'Purpose.DC', 'Purpose.Edu', 'Purpose.MP',
       'Purpose.SB', 'IntRate', 'Installment', 'LogAnnualInc', 'Dti', 'Fico',
       'DaysWithCrLine', 'RevolBal', 'RevolUtil', 'InqLast6mths', 'Delinq2yrs',
       'PubRec']]
# Dependent variable.
y = loans['NotFullyPaid']

# A fixed random_state keeps the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=12)

# Sanity check: the two parts must account for every row of the dataset.
assert len(y_train) + len(y_test) == len(loans)

# Plain string labels: there is nothing to interpolate, so no f-prefix is needed.
print('Length of training set:', len(y_train))
print('Length of test set:', len(y_test))

Length of training set: 6704
Length of test set: 2874


## Baseline Model Accuracy

In [4]:
# Baseline accuracy: share of test rows with NotFullyPaid == 0, i.e. the
# accuracy obtained by predicting class 0 for every loan.
n_class0_test = len(y_test[y_test == 0])
n_class0_test / len(y_test)

0.8302018093249826

## Logistic Regression

In [5]:
# Design matrix for the full model: every training predictor plus an explicit
# intercept column (sm.Logit does not add one by itself).
# A dedicated name is used instead of `x`, so the feature frame `x` created in
# the split cell is not overwritten and this cell can be re-run on its own.
x_train_const = sm.add_constant(x_train)

# Fit the logistic regression by maximum likelihood and show the full summary.
model1 = sm.Logit(y_train, x_train_const).fit()
print(model1.summary())

Optimization terminated successfully.
         Current function value: 0.403446
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:           NotFullyPaid   No. Observations:                 6704
Model:                          Logit   Df Residuals:                     6686
Method:                           MLE   Df Model:                           17
Date:                Wed, 12 Apr 2023   Pseudo R-squ.:                 0.06776
Time:                        22:22:41   Log-Likelihood:                -2704.7
converged:                       True   LL-Null:                       -2901.3
Covariance Type:            nonrobust   LLR p-value:                 4.969e-73
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              9.7875      1.578      6.201      0.000       6.694      12.881
CreditPolicy     

In [6]:
# Coefficients whose p-value is below the 5% level.
# Note: `const` shows up in the result, but it is the intercept,
# not an independent variable.
is_significant = model1.pvalues < 0.05
model1.params.loc[is_significant]

const           9.787485
CreditPolicy   -0.294438
Purpose.CC     -0.560265
Purpose.DC     -0.387778
Purpose.SB      0.427391
Installment     0.001255
LogAnnualInc   -0.460106
Fico           -0.009860
RevolBal        0.000004
InqLast6mths    0.096129
PubRec          0.236448
dtype: float64

## Differences between Two Logits

In [7]:
# Difference in predicted log-odds between two loans that are identical except
# for Fico (700 for the first, 710 for the second): only the Fico term remains.
fico_a, fico_b = 700, 710
coef_fico = model1.params['Fico']
coef_fico * (fico_a - fico_b)

0.09859609729070407

## Predict the Test Set

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Build the test design matrix with the same layout as the training one.
# has_constant='add' always inserts the intercept column; the default ('skip')
# would silently omit it if any test column happened to be constant, leaving
# the columns misaligned with the fitted model.
# A dedicated name is used so the feature frame `x` from the split cell is
# not overwritten.
x_test_const = sm.add_constant(x_test, has_constant='add')

# Predicted probability of NotFullyPaid == 1 for each test loan.
predict_test1 = model1.predict(x_test_const)

# Confusion matrix for out-of-sample prediction at threshold value 0.5
# (rows: actual class, columns: predicted class).
threshold = 0.5
predict_class1 = (predict_test1 > threshold).astype(int)
confusion_matrix(y_test, predict_class1)

array([[2371,   15],
       [ 473,   15]], dtype=int64)

In [9]:
# Out-of-sample accuracy: share of test loans whose predicted class
# matches the actual class.
accuracy_score(y_true=y_test, y_pred=predict_class1)

0.8302018093249826

In [10]:
# Baseline model accuracy, repeated here for comparison with the logistic
# regression: share of test rows with NotFullyPaid == 0.
n_class0_test = len(y_test[y_test == 0])
n_class0_test / len(y_test)

0.8302018093249826

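The logistic regression model is exactly as accurate as the baseline model (83.02% on the test set). At a threshold of 0.5 it predicts almost every loan as fully paid, and its 15 true positives are offset by 15 false positives.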

## Logistic Regression Using `IntRate`

In [11]:
# Design matrix for the single-predictor model: intercept plus `IntRate`.
# A dedicated name is used instead of `x`, so the feature frame `x` created in
# the split cell is not overwritten and this cell can be re-run on its own.
x_train_intrate = sm.add_constant(x_train['IntRate'])

# Fit the bivariate logistic regression and show the full summary.
model2 = sm.Logit(y_train, x_train_intrate).fit()
print(model2.summary())

Optimization terminated successfully.
         Current function value: 0.421564
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           NotFullyPaid   No. Observations:                 6704
Model:                          Logit   Df Residuals:                     6702
Method:                           MLE   Df Model:                            1
Date:                Wed, 12 Apr 2023   Pseudo R-squ.:                 0.02589
Time:                        22:22:42   Log-Likelihood:                -2826.2
converged:                       True   LL-Null:                       -2901.3
Covariance Type:            nonrobust   LLR p-value:                 1.534e-34
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.6464      0.170    -21.475      0.000      -3.979      -3.314
IntRate       15.5286      1.

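`IntRate` is significant in this model at the 0.1% level, but it is not significant at the 5% level in the first model.  
The likely reason is multicollinearity: `IntRate` is probably correlated with other independent variables in the first model, so the information it carries on its own in the second model is already explained by those variables in the first model.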

## Predict the Test Set

In [12]:
# Test design matrix for the `IntRate`-only model.
# has_constant='add' always inserts the intercept column; the default ('skip')
# would silently omit it if `IntRate` happened to be constant in the test set,
# leaving the columns misaligned with the fitted model.
# A dedicated name is used so the feature frame `x` from the split cell is
# not overwritten.
x_test_intrate = sm.add_constant(x_test['IntRate'], has_constant='add')
predict_test2 = model2.predict(x_test_intrate)

# Highest predicted probability
predict_test2.max()

0.4127090431130393

In [13]:
# Number of test loans predicted not to be paid back in full,
# i.e. with predicted probability above the 0.5 threshold.
above_threshold = predict_test2 > 0.5
above_threshold.sum()

0