In [70]:
# imports
import csv
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
import sklearn.model_selection as ms
from sklearn.metrics import f1_score
from dateutil.parser import parse
from datetime import datetime
import statsmodels.api as sm
from scipy import stats

## Part 1: Data Extraction and Cleaning

We'll begin by importing the data and doing some basic information extraction and cleaning. 
This helps prepare our dataset for analysis later on, in Part 2: Data Wrangling and Analysis.

The data file, whose contents are described in the provided file _Data Dictionary.xlsx_, is:

* `train.csv` - data on auto-loans, in comma-separated value (CSV) format

## Step 1.1: Importing CSV Data

The first task will be to import tabular data.

In [25]:
# load data
train_x = pd.read_csv("../Credit-Scoring/train.csv")
train_x = train_x.iloc[0:5000]

In [26]:
# observe what data looks like
train_x.head(10)

Unnamed: 0,UNIQUEID,DISBURSED_AMOUNT,ASSET_COST,LTV,BRANCH_ID,SUPPLIER_ID,MANUFACTURER_ID,CURRENT_PINCODE_ID,DATE_OF_BIRTH,EMPLOYMENT_TYPE,...,SEC_SANCTIONED_AMOUNT,SEC_DISBURSED_AMOUNT,PRIMARY_INSTAL_AMT,SEC_INSTAL_AMT,NEW_ACCTS_IN_LAST_SIX_MONTHS,DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS,AVERAGE_ACCT_AGE,CREDIT_HISTORY_LENGTH,NO_OF_INQUIRIES,LOAN_DEFAULT
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-1984,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-1985,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-1985,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-1993,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-1977,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1
5,518279,54513,61900,89.66,67,22807,45,1501,08-09-1990,Self employed,...,0,0,1347,0,0,0,1yrs 9mon,2yrs 0mon,0,0
6,529269,46349,61500,76.42,67,22807,45,1502,01-06-1988,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
7,510278,43894,61900,71.89,67,22807,45,1501,04-10-1989,Salaried,...,0,0,0,0,0,0,0yrs 2mon,0yrs 2mon,0,0
8,490213,53713,61973,89.56,67,22807,45,1497,15-11-1991,Self employed,...,0,0,0,0,0,0,4yrs 8mon,4yrs 8mon,1,0
9,510980,52603,61300,86.95,67,22807,45,1492,01-06-1968,Salaried,...,0,0,2608,0,0,0,1yrs 7mon,1yrs 7mon,0,0


In [27]:
# Identify number of missing values in different columns
train_x.isnull().sum()

UNIQUEID                                 0
DISBURSED_AMOUNT                         0
ASSET_COST                               0
LTV                                      0
BRANCH_ID                                0
SUPPLIER_ID                              0
MANUFACTURER_ID                          0
CURRENT_PINCODE_ID                       0
DATE_OF_BIRTH                            0
EMPLOYMENT_TYPE                        144
DISBURSAL_DATE                           0
STATE_ID                                 0
EMPLOYEE_CODE_ID                         0
MOBILENO_AVL_FLAG                        0
AADHAR_FLAG                              0
PAN_FLAG                                 0
VOTERID_FLAG                             0
DRIVING_FLAG                             0
PASSPORT_FLAG                            0
PERFORM_CNS_SCORE                        0
PERFORM_CNS_SCORE_DESCRIPTION            0
PRI_NO_OF_ACCTS                          0
PRI_ACTIVE_ACCTS                         0
PRI_OVERDUE

## Part 1.2 Handling Missing Data

Since our only missing data __'EMPLOYMENT _ TYPE'__ is categorical, we assign a unique 'missing' label to rows missing that value.

In [28]:
# fill missing categorical variables with new label 'Missing'
categorical_missing_vars = ['EMPLOYMENT_TYPE']
for category in categorical_missing_vars:
    train_x[category] = train_x[category].fillna('Missing')

## Part 1.3 Wrangling Zip-Codes

Because there are so many values for __'CURRENT_PINCODE_ID'__, to ensure that enough samples belong to each category, we will bin them by dropping the first 2 digits, leaving only the area code.

In [29]:
train_x['CURRENT_PINCODE_ID'] = train_x['CURRENT_PINCODE_ID'].apply(lambda x: x//100)
train_x[['CURRENT_PINCODE_ID']].head()

Unnamed: 0,CURRENT_PINCODE_ID
0,14
1,15
2,14
3,15
4,14


## Part 1.4 Converting Birthdays to Age

To make __DATE_OF_BIRTH__ a useable numeric feature, we will convert birthdays to age in years

In [30]:
def age(x):
    birthday = parse(x)
    now = datetime.now()
    return (now - birthday).days // 365
train_x['DATE_OF_BIRTH'] = train_x['DATE_OF_BIRTH'].apply(age)
train_x['DATE_OF_BIRTH'].describe()

count    5000.00000
mean       35.15460
std         9.89472
min        18.00000
25%        27.00000
50%        33.00000
75%        42.00000
max        65.00000
Name: DATE_OF_BIRTH, dtype: float64

## Part 1.5 Parsing Months from Months & Years Strings

__AVERAGE_ACCT_AGE__ and __CREDIT_HISTORY_LENGTH__ were both in the form _4yrs 8mon to number_
We parse these strings and convert them to numeric months

In [31]:
toParse = ['AVERAGE_ACCT_AGE', 'CREDIT_HISTORY_LENGTH']

def monthsFromString(string):
    split = string.split('yrs')
    yearsToMonths = int(split[0]) * 12
    months = int(split[1].split(' ')[1].split('mon')[0])
    return yearsToMonths + months

for field in toParse:
    train_x[field] = train_x[field].apply(monthsFromString)

train_x['AVERAGE_ACCT_AGE'].describe()

count    5000.000000
mean        8.602200
std        15.310272
min         0.000000
25%         0.000000
50%         0.000000
75%        12.000000
max       154.000000
Name: AVERAGE_ACCT_AGE, dtype: float64

## Part 1.6 Converting Categorical Variables to One-hot Encodings

Categorical variables will be converted to one-hot encodings to allow for regression. 
The following were one-hot encoded:

__['BRANCH_ID', 'SUPPLIER_ID', 'MANUFACTURER_ID', 'STATE_ID', 'EMPLOYMENT_TYPE', 'STATE_ID', 'PERFORM_CNS_SCORE_DESCRIPTION']__

Some categorical variables were already one-hot encodeded:

__['AADHAR_FLAG', 'PAN_FLAG', 'VOTERID_FLAG', 'DRIVING_FLAG', 'PASSPORT_FLAG']__

In [32]:
# replace non-binary categorical variables with one-hot encodings
categorical_vars = ['BRANCH_ID', 'SUPPLIER_ID', 'MANUFACTURER_ID', 'STATE_ID', 'EMPLOYMENT_TYPE', 'PERFORM_CNS_SCORE_DESCRIPTION', 'CURRENT_PINCODE_ID']
for category in categorical_vars:
    train_x = pd.concat([train_x, pd.get_dummies(train_x[category], drop_first=True)], axis=1)
    train_x.drop([category], axis=1, inplace=True)
train_x.shape

(5000, 372)

## Part 1.7 Non-Predictive Attributes Dropped

We finally drop columns that won't be predictive or take on only one value, including:

__['DISBURSAL_DATE', 'EMPLOYEE_CODE_ID', 'MOBILENO_AVL_FLAG', 'UNIQUEID', 'EMPLOYEE_CODE_ID']__

This concludes data cleaning.

In [33]:
train_x = train_x.drop(columns = ['DISBURSAL_DATE', 'MOBILENO_AVL_FLAG', 'UNIQUEID', 'EMPLOYEE_CODE_ID'])

In [34]:
# visualize cleaned data
train_x.head(10)

Unnamed: 0,DISBURSED_AMOUNT,ASSET_COST,LTV,DATE_OF_BIRTH,AADHAR_FLAG,PAN_FLAG,VOTERID_FLAG,DRIVING_FLAG,PASSPORT_FLAG,PERFORM_CNS_SCORE,...,61,62,63,64,65,66,67,68,70,71
0,50578,58400,89.55,35,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47145,65550,73.23,34,1,0,0,0,0,598,...,0,0,0,0,0,0,0,0,0,0
2,53278,61360,89.63,33,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,57513,66113,88.48,25,1,0,0,0,0,305,...,0,0,0,0,0,0,0,0,0,0
4,52378,60300,88.39,41,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,54513,61900,89.66,28,1,0,0,0,0,825,...,0,0,0,0,0,0,0,0,0,0
6,46349,61500,76.42,31,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,43894,61900,71.89,30,1,0,0,0,0,17,...,0,0,0,0,0,0,0,0,0,0
8,53713,61973,89.56,27,1,0,0,0,0,718,...,0,0,0,0,0,0,0,0,0,0
9,52603,61300,86.95,51,0,0,1,0,0,818,...,0,0,0,0,0,0,0,0,0,0


## Part 2: Unfair Model Building



In [35]:
# move target to a separate column, and drop it from x variables
train_x = train_x.dropna()
train_y = train_x['LOAN_DEFAULT']
train_x = train_x.drop(columns="LOAN_DEFAULT")

In [36]:
train_y.head(10)

0    0
1    1
2    0
3    1
4    1
5    0
6    0
7    0
8    0
9    0
Name: LOAN_DEFAULT, dtype: int64

In [37]:
train_y.describe()

count    5000.000000
mean        0.224400
std         0.417228
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: LOAN_DEFAULT, dtype: float64

In [39]:
# train/test split
X_train, X_test, y_train, y_test = ms.train_test_split(train_x, train_y, test_size=0.2, random_state = 195)
print(X_train.shape)
print(X_test.shape)

(4000, 367)
(1000, 367)


In [87]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_wo_var = sel.fit_transform(X_train)
print(X_wo_var.columns)
X_test_wo_var = X_test[[X_wo_var.columns]]
reg = LogisticRegression(solver='liblinear', max_iter=1000, penalty='l1').fit(X_wo_var, y_train)
print(f'Train accuracy: {reg.score(X_wo_var, y_train)}, Test accuracy: {reg.score(X_test_wo_var, y_test)}')
print(f'Train f1: {f1_score(y_train, reg.predict(X_wo_var))}, Test f1: {f1_score(y_test, reg.predict(X_test_wo_var))}')

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [82]:
logit_model=sm.Logit(y_train,X_train)
result=logit_model.fit(maxiter=1000, method='bfgs')
print(result.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


         Current function value: 0.693147
         Iterations: 0
         Function evaluations: 16
         Gradient evaluations: 4
                           Logit Regression Results                           
Dep. Variable:           LOAN_DEFAULT   No. Observations:                 4000
Model:                          Logit   Df Residuals:                     3691
Method:                           MLE   Df Model:                          308
Date:                Thu, 01 Aug 2019   Pseudo R-squ.:                 -0.2933
Time:                        18:15:04   Log-Likelihood:                -2772.6
converged:                      False   LL-Null:                       -2143.7
                                        LLR p-value:                     1.000
                                                              coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------

In [None]:
pred = reg.predict(X_test)
pd.Series(pred).describe()

In [78]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print(f'Train accuracy: {gbc.score(X_train, y_train)}, Test accuracy: {gbc.score(X_test, y_test)}')
print(f'Train f1: {f1_score(y_train, gbc.predict(X_train))}, Test f1: {f1_score(y_test, gbc.predict(X_test))}')

Train accuracy: 0.791, Test accuracy: 0.787
Train f1: 0.1606425702811245, Test f1: 0.06167400881057269


In [79]:
from sklearn.svm import SVC
clf_3 = SVC(kernel='rbf', 
            class_weight='balanced', # penalize
            probability=True,
            gamm='scale')
clf_3.fit(X_train, y_train)
print(f'Train accuracy: {clf_3.score(X_train, y_train)}, Test accuracy: {clf_3.score(X_test, y_test)}')
print(f'Train f1: {f1_score(y_train, clf_3.predict(X_train))}, Test f1: {f1_score(y_test, clf_3.predict(X_test))}')



Train accuracy: 0.996, Test accuracy: 0.784
Train f1: 0.9912758996728462, Test f1: 0.009174311926605505


In [None]:
X_test.shape

In [None]:
pd.Series(clf_3.predict(X_test)).describe()