In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
Train = pd.read_csv('training_logistic.csv')
Test = pd.read_csv('test_logistic.csv')

In [3]:
from sklearn.linear_model import LogisticRegression

In [4]:
LogisticRegression

sklearn.linear_model.logistic.LogisticRegression

In [5]:
BaseLogisticModel = LogisticRegression()

In [6]:
#Date columns were imported as strings - change them to dates
for d in ['DOB_clean', 'Lead_Creation_Date_clean']:
    Train[d] = pd.to_datetime(Train[d])
    Test[d] = pd.to_datetime(Test[d])

In [7]:
Train.head()

Unnamed: 0.1,Unnamed: 0,ID,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Approved,DOB_clean,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
0,0,APPC90493171225,2000.0,0.0,,,,,0,1979-07-23,...,0,0,0,1,0,0,0,0,0,0
1,1,APPD40611263344,3500.0,0.0,20000.0,2.0,13.25,953.0,0,1986-12-07,...,0,0,0,0,0,0,0,0,1,0
2,2,APPE70289249423,2250.0,0.0,45000.0,4.0,,,0,1982-12-10,...,0,0,0,1,0,0,0,0,0,0
3,3,APPF80273865537,3500.0,0.0,92000.0,5.0,,,0,1989-01-30,...,0,0,0,1,0,0,0,1,0,0
4,4,APPG60994436641,10000.0,2500.0,50000.0,2.0,,,0,1985-04-19,...,0,0,0,1,0,0,0,0,1,0


In [8]:
Train.dtypes

Unnamed: 0                           int64
ID                                  object
Monthly_Income                     float64
Existing_EMI                       float64
Loan_Amount                        float64
Loan_Period                        float64
Interest_Rate                      float64
EMI                                float64
Approved                             int64
DOB_clean                   datetime64[ns]
Lead_Creation_Date_clean    datetime64[ns]
Gender_Male                          int64
Gender_nan                           int64
City_Category_B                      int64
City_Category_C                      int64
City_Category_nan                    int64
Employer_Category1_B                 int64
Employer_Category1_C                 int64
Employer_Category1_nan               int64
Primary_Bank_Type_P                  int64
Primary_Bank_Type_nan                int64
Contacted_Y                          int64
Contacted_nan                        int64
Source_Cate

In [9]:
import datetime

In [10]:
#Need to change dates to ordinals to include them in regression
for d in ['DOB_clean', 'Lead_Creation_Date_clean']:
    Train[d] = map(datetime.date.toordinal, Train[d])
    Test[d] = map(datetime.date.toordinal, Test[d])

In [11]:
Train.head()

Unnamed: 0.1,Unnamed: 0,ID,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Approved,DOB_clean,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
0,0,APPC90493171225,2000.0,0.0,,,,,0,722653,...,0,0,0,1,0,0,0,0,0,0
1,1,APPD40611263344,3500.0,0.0,20000.0,2.0,13.25,953.0,0,725347,...,0,0,0,0,0,0,0,0,1,0
2,2,APPE70289249423,2250.0,0.0,45000.0,4.0,,,0,723889,...,0,0,0,1,0,0,0,0,0,0
3,3,APPF80273865537,3500.0,0.0,92000.0,5.0,,,0,726132,...,0,0,0,1,0,0,0,1,0,0
4,4,APPG60994436641,10000.0,2500.0,50000.0,2.0,,,0,724750,...,0,0,0,1,0,0,0,0,1,0


Now we must figure out what to do with missing values. Options include dropping all rows with a missing value (undesirable because that's a high percentage of the data set), filling in with the median/mode (could cause bias if the NaNs are correlated with other variables), and interpolating using other variables (more computationally expensive). But we can try all three.

In [12]:
#clean up "Train" a bit more
Train.index = Train['ID']
Train.drop(['ID', 'Unnamed: 0'], axis=1, inplace=True)

In [13]:
Train.head()

Unnamed: 0_level_0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Approved,DOB_clean,Lead_Creation_Date_clean,Gender_Male,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APPC90493171225,2000.0,0.0,,,,,0,722653,736160,0,...,0,0,0,1,0,0,0,0,0,0
APPD40611263344,3500.0,0.0,20000.0,2.0,13.25,953.0,0,725347,736149,1,...,0,0,0,0,0,0,0,0,1,0
APPE70289249423,2250.0,0.0,45000.0,4.0,,,0,723889,736164,1,...,0,0,0,1,0,0,0,0,0,0
APPF80273865537,3500.0,0.0,92000.0,5.0,,,0,726132,736154,1,...,0,0,0,1,0,0,0,1,0,0
APPG60994436641,10000.0,2500.0,50000.0,2.0,,,0,724750,736165,1,...,0,0,0,1,0,0,0,0,1,0


## Logistic regression with nulls removed

In [14]:
#Training data with nulls dropped
Train_NoNA = Train.dropna()
Train_NoNA.shape

(22256, 37)

In [15]:
Train_NoNA.head()

Unnamed: 0_level_0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Approved,DOB_clean,Lead_Creation_Date_clean,Gender_Male,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APPD40611263344,3500.0,0.0,20000.0,2.0,13.25,953.0,0,725347,736149,1,...,0,0,0,0,0,0,0,0,1,0
APPK80327232033,7500.0,0.0,130000.0,5.0,14.85,3082.0,0,720345,736147,1,...,0,0,0,1,0,0,0,0,1,0
APPL20820172146,3000.0,0.0,30000.0,3.0,18.25,1088.0,0,726783,736148,0,...,0,0,0,1,0,0,0,0,0,0
APPM30977401211,2500.0,0.0,66000.0,5.0,20.0,1749.0,0,721781,736147,1,...,0,0,0,1,0,0,0,1,0,0
APPP50632654511,2700.0,460.0,20000.0,5.0,18.0,508.0,0,724146,736154,1,...,0,0,0,1,0,0,0,1,0,0


In [16]:
X_NoNA = Train_NoNA.drop('Approved', axis=1)
y_NoNA = Train_NoNA['Approved']

In [17]:
LogisticModel_NoNA = BaseLogisticModel

In [18]:
BaseLogisticModel

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
LogisticModel_NoNA.fit(X_NoNA, y_NoNA)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
Test.head()

Unnamed: 0.1,Unnamed: 0,ID,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,DOB_clean,Lead_Creation_Date_clean,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
0,0,APPA70109647212,2150.0,0.0,10000.0,3.0,20.0,372.0,725891,736150,...,0,0,0,1,0,0,1,0,0,0
1,1,APPB10687939341,4200.0,0.0,69000.0,5.0,24.0,1985.0,723374,736146,...,0,0,0,1,0,0,0,1,0,0
2,2,APPC80449411414,1000.0,0.0,,,,,726790,736146,...,0,0,0,1,0,0,0,0,0,0
3,3,APPD30665094501,1465.0,0.0,,,,,727486,736146,...,0,0,1,0,0,0,0,0,0,0
4,4,APPE80379821637,2340.0,500.0,10000.0,2.0,,,726001,736146,...,0,0,0,1,0,0,0,0,0,0


In [21]:
#clean up "Test" a bit more
Test.index = Test['ID']
Test.drop(['ID', 'Unnamed: 0'], axis=1, inplace=True)
Test.head()

Unnamed: 0_level_0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,DOB_clean,Lead_Creation_Date_clean,Gender_Male,Gender_nan,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APPA70109647212,2150.0,0.0,10000.0,3.0,20.0,372.0,725891,736150,1,0,...,0,0,0,1,0,0,1,0,0,0
APPB10687939341,4200.0,0.0,69000.0,5.0,24.0,1985.0,723374,736146,1,0,...,0,0,0,1,0,0,0,1,0,0
APPC80449411414,1000.0,0.0,,,,,726790,736146,0,0,...,0,0,0,1,0,0,0,0,0,0
APPD30665094501,1465.0,0.0,,,,,727486,736146,0,0,...,0,0,1,0,0,0,0,0,0,0
APPE80379821637,2340.0,500.0,10000.0,2.0,,,726001,736146,1,0,...,0,0,0,1,0,0,0,0,0,0


In [22]:
#Test data with nulls dropped
Test_NoNA = Test.dropna()
Test_NoNA.shape

(9640, 36)

In [23]:
Results_NoNA = LogisticModel_NoNA.predict_proba(Test_NoNA)
Results_NoNA.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [24]:
Results_NoNA

array([[ 0.98549696,  0.01450304],
       [ 0.97174889,  0.02825111],
       [ 0.97978073,  0.02021927],
       ..., 
       [ 0.97436658,  0.02563342],
       [ 0.98209869,  0.01790131],
       [ 0.98063441,  0.01936559]])

In [25]:
Results_NoNA_2 = LogisticModel_NoNA.predict(Test_NoNA)
Results_NoNA_2

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [26]:
del Results_NoNA_2

In [27]:
Results_NoNA = np.array([x[1] for x in Results_NoNA])

In [31]:
Soln_Logistic_NoNA = pd.DataFrame(Test_NoNA.index, Results_NoNA, columns=['ID', 'Approved'])
Soln_Logistic_NoNA.head()

Unnamed: 0,ID,Approved
0.014503,APPA70109647212,
0.028251,APPB10687939341,
0.020219,APPK30106042233,
0.015746,APPN80816878501,
0.020812,APPR10725816917,


In [32]:
Soln_Logistic_NoNA.Approved = Soln_Logistic_NoNA.index

In [33]:
Soln_Logistic_NoNA.head()

Unnamed: 0,ID,Approved
0.014503,APPA70109647212,0.014503
0.028251,APPB10687939341,0.028251
0.020219,APPK30106042233,0.020219
0.015746,APPN80816878501,0.015746
0.020812,APPR10725816917,0.020812


In [34]:
Soln_Logistic_NoNA.index = range(len(Soln_Logistic_NoNA.index))
Soln_Logistic_NoNA.head()

Unnamed: 0,ID,Approved
0,APPA70109647212,0.014503
1,APPB10687939341,0.028251
2,APPK30106042233,0.020219
3,APPN80816878501,0.015746
4,APPR10725816917,0.020812


In [35]:
Soln_Logistic_NoNA.to_csv('soln_logistic_noNA.csv', index=False)

Well, this doesn't work because we are required to have all the rows from the test data included!

## Logistic regression with nulls replaced by medians/modes

In [37]:
Train_FillNA = Train.copy()

In [38]:
Train_FillNA.dtypes

Monthly_Income              float64
Existing_EMI                float64
Loan_Amount                 float64
Loan_Period                 float64
Interest_Rate               float64
EMI                         float64
Approved                      int64
DOB_clean                     int64
Lead_Creation_Date_clean      int64
Gender_Male                   int64
Gender_nan                    int64
City_Category_B               int64
City_Category_C               int64
City_Category_nan             int64
Employer_Category1_B          int64
Employer_Category1_C          int64
Employer_Category1_nan        int64
Primary_Bank_Type_P           int64
Primary_Bank_Type_nan         int64
Contacted_Y                   int64
Contacted_nan                 int64
Source_Category_B             int64
Source_Category_C             int64
Source_Category_D             int64
Source_Category_E             int64
Source_Category_F             int64
Source_Category_G             int64
Source_Category_nan         

In [42]:
Train_FillNA.describe()

Unnamed: 0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Approved,DOB_clean,Lead_Creation_Date_clean,Gender_Male,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
count,69713.0,69662.0,42004.0,42004.0,22276.0,22276.0,69713.0,69713.0,69713.0,69713.0,...,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0
mean,5622.283,360.928751,39429.982859,3.890629,19.21357,1101.466242,0.014631,725988.727382,736193.889074,0.57305,...,0.0,0.028044,0.023238,0.825987,0.061653,0.191686,0.11008,0.171388,0.192504,0.0
std,174767.1,2288.517927,30727.59599,1.167491,5.847136,752.661394,0.120073,11955.597818,26.931771,0.494639,...,0.0,0.165098,0.15066,0.379124,0.240526,0.39363,0.312992,0.376851,0.394269,0.0
min,0.0,0.0,5000.0,1.0,11.99,118.0,0.0,1.0,736146.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1650.0,0.0,20000.0,3.0,15.25,649.0,0.0,724153.0,736171.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.0,0.0,30000.0,4.0,18.0,941.0,0.0,725656.0,736195.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4000.0,350.0,50000.0,5.0,20.0,1295.0,0.0,726830.0,736218.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,38383840.0,545436.5,300000.0,6.0,37.0,13556.0,1.0,754956.0,736237.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [43]:
Test_FillNA = Test.copy()
Test_FillNA.describe()

Unnamed: 0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,DOB_clean,Lead_Creation_Date_clean,Gender_Male,Gender_nan,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
count,30037.0,30005.0,18166.0,18166.0,9652.0,9652.0,30037.0,30037.0,30037.0,30037.0,...,30037.0,30037.0,30037.0,30037.0,30037.0,30037.0,30037.0,30037.0,30037.0,30037.0
mean,3977.139,348.90906,39482.990201,3.903116,19.280537,1094.914836,726026.165762,736194.004461,0.57276,0.0,...,0.0,0.026634,0.025069,0.832507,0.05643,0.196158,0.107567,0.171588,0.193861,0.0
std,23289.6,1000.816847,30527.865594,1.15082,5.882246,727.452303,9017.03094,26.806399,0.494686,0.0,...,0.0,0.161013,0.156338,0.373422,0.230755,0.397096,0.309838,0.377029,0.395328,0.0
min,0.0,0.0,5000.0,1.0,11.99,118.0,1.0,736146.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1650.0,0.0,20000.0,3.0,15.25,625.0,724142.0,736171.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.0,0.0,30000.0,4.0,18.0,946.0,725651.0,736195.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4000.0,350.0,50000.0,5.0,20.0,1291.0,726829.0,736217.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3500000.0,43000.0,300000.0,6.0,37.0,6979.0,754956.0,736237.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [48]:
for col in ['Monthly_Income', 'Existing_EMI', 'Loan_Amount', 'Loan_Period', 'Interest_Rate', 'EMI']:
    Train_FillNA[col].fillna(value=Train_FillNA[col].median(), inplace=True)
    Test_FillNA[col].fillna(value=Test_FillNA[col].median(), inplace=True)

In [49]:
Train_FillNA.describe(include=['float64'])

Unnamed: 0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI
count,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0
mean,5622.283,360.664706,35681.82405,3.934101,18.387782,992.275171
std,174767.1,2287.70147,24293.802244,0.907812,3.353297,431.98581
min,0.0,0.0,5000.0,1.0,11.99,118.0
25%,1650.0,0.0,30000.0,4.0,18.0,941.0
50%,2500.0,0.0,30000.0,4.0,18.0,941.0
75%,4000.0,350.0,36000.0,4.0,18.0,941.0
max,38383840.0,545436.5,300000.0,6.0,37.0,13556.0


In [50]:
Test_FillNA.describe(include=['float64'])

Unnamed: 0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI
count,30037.0,30037.0,30037.0,30037.0,30037.0,30037.0
mean,3977.139,348.537348,35735.193262,3.941406,18.411484,993.851849
std,23289.6,1000.348335,24189.146047,0.896213,3.387529,418.176494
min,0.0,0.0,5000.0,1.0,11.99,118.0
25%,1650.0,0.0,30000.0,4.0,18.0,946.0
50%,2500.0,0.0,30000.0,4.0,18.0,946.0
75%,4000.0,350.0,36000.0,4.0,18.0,946.0
max,3500000.0,43000.0,300000.0,6.0,37.0,6979.0


In [52]:
LogisticModel_FillNA = BaseLogisticModel

In [53]:
X_FillNA = Train_FillNA.drop('Approved', axis=1)
y_FillNA = Train_FillNA['Approved']

In [54]:
LogisticModel_FillNA.fit(X_FillNA, y_FillNA)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
Results_FillNA = LogisticModel_FillNA.predict_proba(Test_FillNA)
Results_FillNA

array([[ 0.99086735,  0.00913265],
       [ 0.97953727,  0.02046273],
       [ 0.98779568,  0.01220432],
       ..., 
       [ 0.98695363,  0.01304637],
       [ 0.98539397,  0.01460603],
       [ 0.9870042 ,  0.0129958 ]])

In [57]:
Soln_LogReg_FillNA = pd.Series([x[1] for x in Results_FillNA])
Soln_LogReg_FillNA

0        0.009133
1        0.020463
2        0.012204
3        0.012136
4        0.010853
5        0.013274
6        0.020829
7        0.013943
8        0.009784
9        0.011955
10       0.013396
11       0.012092
12       0.012734
13       0.010224
14       0.012817
15       0.013484
16       0.024837
17       0.013999
18       0.012174
19       0.009860
20       0.012351
21       0.009756
22       0.015099
23       0.009921
24       0.014116
25       0.010934
26       0.012782
27       0.009241
28       0.012116
29       0.015669
           ...   
30007    0.015219
30008    0.015987
30009    0.012099
30010    0.013333
30011    0.020929
30012    0.017546
30013    0.012715
30014    0.012303
30015    0.012654
30016    0.013320
30017    0.010493
30018    0.012334
30019    0.012271
30020    0.009913
30021    0.012128
30022    0.012286
30023    0.015451
30024    0.012266
30025    0.012015
30026    0.013784
30027    0.014775
30028    0.012155
30029    0.012115
30030    0.012149
30031    0

In [62]:
DF_LogReg_FillNA = pd.DataFrame({'ID': Test.index, 'Approved': Soln_LogReg_FillNA})
DF_LogReg_FillNA.head()

Unnamed: 0,Approved,ID
0,0.009133,APPA70109647212
1,0.020463,APPB10687939341
2,0.012204,APPC80449411414
3,0.012136,APPD30665094501
4,0.010853,APPE80379821637


In [64]:
DF_LogReg_FillNA = DF_LogReg_FillNA[['ID', 'Approved']]
DF_LogReg_FillNA.head()

Unnamed: 0,ID,Approved
0,APPA70109647212,0.009133
1,APPB10687939341,0.020463
2,APPC80449411414,0.012204
3,APPD30665094501,0.012136
4,APPE80379821637,0.010853


In [66]:
DF_LogReg_FillNA.to_csv('soln_logistic_fillNA.csv', index=False)

## Logistic regression using linear regression to fill in nulls

In [67]:
Train_RegNA = Train.copy()

In [68]:
Train_RegNA.columns

Index([u'Monthly_Income', u'Existing_EMI', u'Loan_Amount', u'Loan_Period',
       u'Interest_Rate', u'EMI', u'Approved', u'DOB_clean',
       u'Lead_Creation_Date_clean', u'Gender_Male', u'Gender_nan',
       u'City_Category_B', u'City_Category_C', u'City_Category_nan',
       u'Employer_Category1_B', u'Employer_Category1_C',
       u'Employer_Category1_nan', u'Primary_Bank_Type_P',
       u'Primary_Bank_Type_nan', u'Contacted_Y', u'Contacted_nan',
       u'Source_Category_B', u'Source_Category_C', u'Source_Category_D',
       u'Source_Category_E', u'Source_Category_F', u'Source_Category_G',
       u'Source_Category_nan', u'Employer_Category2_2.0',
       u'Employer_Category2_3.0', u'Employer_Category2_4.0',
       u'Employer_Category2_nan', u'Var1_2.0', u'Var1_4.0', u'Var1_7.0',
       u'Var1_10.0', u'Var1_nan'],
      dtype='object')

In [69]:
from sklearn.linear_model import LinearRegression

In [79]:
Train_RegNA_dummies = Train_RegNA.drop(['Monthly_Income', 'Existing_EMI', 'Loan_Amount', 'Loan_Period',
                                      'Interest_Rate', 'EMI', 'Approved'], axis=1)

In [78]:
Train_RegNA.describe(include=['float64'])

Unnamed: 0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI
count,69713.0,69662.0,42004.0,42004.0,22276.0,22276.0
mean,5622.283,360.928751,39429.982859,3.890629,19.21357,1101.466242
std,174767.1,2288.517927,30727.59599,1.167491,5.847136,752.661394
min,0.0,0.0,5000.0,1.0,11.99,118.0
25%,1650.0,0.0,20000.0,3.0,15.25,649.0
50%,2500.0,0.0,30000.0,4.0,18.0,941.0
75%,4000.0,350.0,50000.0,5.0,20.0,1295.0
max,38383840.0,545436.5,300000.0,6.0,37.0,13556.0


In [80]:
Test_RegNA = Test.copy()

In [97]:
def regress_fill(df, varlist):
    linmod = {}
    dfc = df.copy()
    dum = df.drop(varlist + ['Approved'], axis=1)
    for v in varlist:
        X = dfc.dropna(subset=[v]).drop(varlist + ['Approved'], axis=1)
        y = dfc[v].dropna()
        linmod[v] = LinearRegression()
        linmod[v].fit(X, y)
        result = linmod[v].predict(dum)
        for n in range(len(dfc[v])):
            if pd.isnull(dfc[v][n]):
                dfc[v][n] = result[n]
    return dfc

In [98]:
Train_RegNA = regress_fill(Train, ['Monthly_Income', 'Existing_EMI', 'Loan_Amount', 'Loan_Period',
                                      'Interest_Rate', 'EMI'])
#takes too long

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [96]:
Train_RegNA.head(10)

Unnamed: 0_level_0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Approved,DOB_clean,Lead_Creation_Date_clean,Gender_Male,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APPC90493171225,2000.0,0.0,,,,,0,722653,736160,0,...,0,0,0,1,0,0,0,0,0,0
APPD40611263344,3500.0,0.0,20000.0,2.0,13.25,953.0,0,725347,736149,1,...,0,0,0,0,0,0,0,0,1,0
APPE70289249423,2250.0,0.0,45000.0,4.0,,,0,723889,736164,1,...,0,0,0,1,0,0,0,0,0,0
APPF80273865537,3500.0,0.0,92000.0,5.0,,,0,726132,736154,1,...,0,0,0,1,0,0,0,1,0,0
APPG60994436641,10000.0,2500.0,50000.0,2.0,,,0,724750,736165,1,...,0,0,0,1,0,0,0,0,1,0
APPI90914237819,7000.0,0.0,,,,,0,726094,736146,0,...,0,0,0,1,0,0,0,0,0,0
APPK80327232033,7500.0,0.0,130000.0,5.0,14.85,3082.0,0,720345,736147,1,...,0,0,0,1,0,0,0,0,1,0
APPL20820172146,3000.0,0.0,30000.0,3.0,18.25,1088.0,0,726783,736148,0,...,0,0,0,1,0,0,0,0,0,0
APPM30977401211,2500.0,0.0,66000.0,5.0,20.00,1749.0,0,721781,736147,1,...,0,0,0,1,0,0,0,1,0,0
APPN60714112334,2500.0,0.0,,,,,0,726814,736147,0,...,0,0,0,1,0,0,0,0,0,0


In [99]:
#More dealing with dates
Train.describe()

Unnamed: 0,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Approved,DOB_clean,Lead_Creation_Date_clean,Gender_Male,...,Source_Category_nan,Employer_Category2_2.0,Employer_Category2_3.0,Employer_Category2_4.0,Employer_Category2_nan,Var1_2.0,Var1_4.0,Var1_7.0,Var1_10.0,Var1_nan
count,69713.0,69662.0,42004.0,42004.0,22276.0,22276.0,69713.0,69713.0,69713.0,69713.0,...,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0
mean,5622.283,360.928751,39429.982859,3.890629,19.21357,1101.466242,0.014631,725988.727382,736193.889074,0.57305,...,0.0,0.028044,0.023238,0.825987,0.061653,0.191686,0.11008,0.171388,0.192504,0.0
std,174767.1,2288.517927,30727.59599,1.167491,5.847136,752.661394,0.120073,11955.597818,26.931771,0.494639,...,0.0,0.165098,0.15066,0.379124,0.240526,0.39363,0.312992,0.376851,0.394269,0.0
min,0.0,0.0,5000.0,1.0,11.99,118.0,0.0,1.0,736146.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1650.0,0.0,20000.0,3.0,15.25,649.0,0.0,724153.0,736171.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.0,0.0,30000.0,4.0,18.0,941.0,0.0,725656.0,736195.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4000.0,350.0,50000.0,5.0,20.0,1295.0,0.0,726830.0,736218.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,38383840.0,545436.5,300000.0,6.0,37.0,13556.0,1.0,754956.0,736237.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [100]:
for d in ['DOB_clean', 'Lead_Creation_Date_clean']:
    Train[d] = map(datetime.date.fromordinal, Train[d])
    Test[d] = map(datetime.date.fromordinal, Test[d])

In [101]:
Train['DOB_clean'].describe()

count          69713
unique         10760
top       1982-01-11
freq             253
Name: DOB_clean, dtype: object

In [102]:
max(Train['DOB_clean'])

datetime.date(2067, 12, 31)

In [103]:
min(Train['DOB_clean'])

datetime.date(1, 1, 1)