## House and document features from logistic regression

Use logistic regression to predict target with only house or document features. The predictions are saved to disk to be merged to main table later.

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

import gc

import matplotlib.pyplot as plt
import seaborn as sns

import os
print(os.listdir("../input"))
    
gc.enable()

['download_command.txt', 'sample_submission.csv.zip', 'installments_payments.csv.zip', 'application_test.csv.zip', 'HomeCredit_columns_description.csv', 'previous_application.csv.zip', 'bureau_balance.csv.zip', 'POS_CASH_balance.csv.zip', 'credit_card_balance.csv.zip', 'application_train.csv.zip', 'bureau.csv.zip']


### House features

Read house features from main train/test table.

In [2]:
data = pd.read_csv('../input/application_train.csv.zip')
test = pd.read_csv('../input/application_test.csv.zip')

rejected_features = ['AMT_GOODS_PRICE',
                     'APARTMENTS_AVG','APARTMENTS_MEDI',
                     'BASEMENTAREA_AVG','BASEMENTAREA_MODE','COMMONAREA_AVG','COMMONAREA_MODE',
                     'ELEVATORS_AVG','ELEVATORS_MEDI','ENTRANCES_AVG','ENTRANCES_MEDI','FLOORSMAX_AVG','FLOORSMAX_MEDI',
                     'FLOORSMIN_AVG','FLOORSMIN_MEDI','LANDAREA_AVG','LANDAREA_MODE',
                     'LIVINGAPARTMENTS_AVG','LIVINGAPARTMENTS_MEDI',
                     'LIVINGAREA_AVG','LIVINGAREA_MODE',
                     'NONLIVINGAPARTMENTS_AVG','NONLIVINGAPARTMENTS_MEDI',
                     'NONLIVINGAREA_AVG','NONLIVINGAREA_MODE','OBS_60_CNT_SOCIAL_CIRCLE',
                     'REGION_RATING_CLIENT_W_CITY','YEARS_BEGINEXPLUATATION_AVG','YEARS_BEGINEXPLUATATION_MEDI',
                     'YEARS_BUILD_AVG','YEARS_BUILD_MEDI']

for f_ in rejected_features:
    del data[f_]
    del test[f_]
    
gc.collect()

217

Create per person house features (living area per person, number of floors per person, etc.)

In [3]:
data['AGE'] = - (data['DAYS_BIRTH']/365.25).astype('int32')
data['house_person'] = 1
data['house_person'].loc[data['NAME_HOUSING_TYPE']=='With parents'] +=2
data['house_person'].loc[(data['NAME_FAMILY_STATUS']=='Married')|(data['NAME_FAMILY_STATUS']=='Civil marriage')] +=1
data['house_person'].loc[data['AGE']<55] += data['CNT_CHILDREN']

test['AGE'] = - (test['DAYS_BIRTH']/365.25).astype('int32')
test['house_person'] = 1
test['house_person'].loc[test['NAME_HOUSING_TYPE']=='With parents'] +=2
test['house_person'].loc[(test['NAME_FAMILY_STATUS']=='Married')|(test['NAME_FAMILY_STATUS']=='Civil marriage')] +=1
test['house_person'].loc[test['AGE']<55] += test['CNT_CHILDREN']

house = [f_ for f_ in data.columns if ('AVG' in f_) | ('MEDI' in f_) | ('MODE' in f_) & (not 'YEARS' in f_)]
for f_ in house:
    if data[f_].dtype != 'object':
        print (f_)
        data[f_+'_PP'] = data[f_]/data['house_person']
        test[f_+'_PP'] = test[f_]/test['house_person']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


APARTMENTS_MODE
ELEVATORS_MODE
ENTRANCES_MODE
FLOORSMAX_MODE
FLOORSMIN_MODE
LIVINGAPARTMENTS_MODE
NONLIVINGAPARTMENTS_MODE
BASEMENTAREA_MEDI
COMMONAREA_MEDI
LANDAREA_MEDI
LIVINGAREA_MEDI
NONLIVINGAREA_MEDI
TOTALAREA_MODE


In [4]:
house = [f_ for f_ in data.columns if ('AVG' in f_) | ('MEDI' in f_) | ('MODE' in f_) ]
print (len(house),house)
train_house = data.loc[data[house].isnull().sum(axis=1)<len(house)][['REGION_POPULATION_RELATIVE','NAME_HOUSING_TYPE']+house]
test_house = test.loc[test[house].isnull().sum(axis=1)<len(house)][['REGION_POPULATION_RELATIVE','NAME_HOUSING_TYPE']+house]
y = data['TARGET'].loc[data[house].isnull().sum(axis=1)<len(house)]
train_ID = data['SK_ID_CURR'].loc[data[house].isnull().sum(axis=1)<len(house)]
test_ID = test['SK_ID_CURR'].loc[test[house].isnull().sum(axis=1)<len(house)]
train_house.head()

32 ['APARTMENTS_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MODE', 'BASEMENTAREA_MEDI', 'COMMONAREA_MEDI', 'LANDAREA_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'APARTMENTS_MODE_PP', 'ELEVATORS_MODE_PP', 'ENTRANCES_MODE_PP', 'FLOORSMAX_MODE_PP', 'FLOORSMIN_MODE_PP', 'LIVINGAPARTMENTS_MODE_PP', 'NONLIVINGAPARTMENTS_MODE_PP', 'BASEMENTAREA_MEDI_PP', 'COMMONAREA_MEDI_PP', 'LANDAREA_MEDI_PP', 'LIVINGAREA_MEDI_PP', 'NONLIVINGAREA_MEDI_PP', 'TOTALAREA_MODE_PP']


Unnamed: 0,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LIVINGAPARTMENTS_MODE,...,FLOORSMAX_MODE_PP,FLOORSMIN_MODE_PP,LIVINGAPARTMENTS_MODE_PP,NONLIVINGAPARTMENTS_MODE_PP,BASEMENTAREA_MEDI_PP,COMMONAREA_MEDI_PP,LANDAREA_MEDI_PP,LIVINGAREA_MEDI_PP,NONLIVINGAREA_MEDI_PP,TOTALAREA_MODE_PP
0,0.018801,House / apartment,0.0252,0.9722,0.6341,0.0,0.069,0.0833,0.125,0.022,...,0.0833,0.125,0.022,0.0,0.0369,0.0144,0.0375,0.0193,0.0,0.0149
1,0.003541,House / apartment,0.0924,0.9851,0.804,0.0806,0.0345,0.2917,0.3333,0.079,...,0.14585,0.16665,0.0395,0.0,0.02645,0.0304,0.0066,0.0279,0.005,0.0357
12,0.031329,House / apartment,0.084,0.9811,,0.0,0.2069,0.1667,,,...,0.08335,,,,,,0.00685,0.0396,0.0,0.0306
13,0.016612,House / apartment,0.1502,0.9806,0.7452,0.1611,0.1379,0.3333,0.375,0.1313,...,0.1111,0.125,0.043767,0.0,0.032433,0.0195,0.031567,0.0474,0.0,0.047233
14,0.010006,House / apartment,0.3561,0.9985,0.9804,0.4028,0.1724,0.6667,0.7083,0.3113,...,0.33335,0.35415,0.15565,0.00975,0.06675,0.0575,0.08945,0.1921,0.0511,0.19055


Create one hot encoding for some features.

In [5]:
train_house_size = train_house.shape[0]
test_house_size = test_house.shape[0]
combined = pd.concat([train_house, test_house], axis=0)
for f_ in combined.columns:
    if combined[f_].dtype == 'object':
        combined[f_].fillna(combined[f_].mode(), inplace=True)
    else:
        combined[f_].fillna(combined[f_].median(), inplace=True)
combined = pd.concat([combined, pd.get_dummies(combined['REGION_POPULATION_RELATIVE'], prefix='REGION')], axis=1)
for f_ in ['FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE','NAME_HOUSING_TYPE']:
    combined = pd.concat([combined, pd.get_dummies(combined[f_], prefix=f_)], axis=1)
    del combined[f_]
train_house = combined.iloc[:train_house_size,:]
test_house = combined.iloc[-test_house_size:,:]

#### Training using current target.

In [6]:
from sklearn.linear_model import LogisticRegression

train_x = train_house
train_y = y
test_x = test_house
print(train_x.shape, test_x.shape)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=546789)
oof_preds = np.zeros(train_x.shape[0])
sub_preds = np.zeros(test_x.shape[0])
feature_importance_df = pd.DataFrame()

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    trn_x, val_x = train_x.iloc[trn_idx], train_x.iloc[val_idx]
    trn_y, val_y = train_y.iloc[trn_idx], train_y.iloc[val_idx]
     
    model = LogisticRegression(C=0.05)
    model.fit(trn_x, trn_y)
    
    trn_y_pred = model.predict_proba(trn_x)[:,1]
    oof_preds[val_idx] = model.predict_proba(val_x)[:,1]
    sub_preds += model.predict_proba(test_x)[:,1] / folds.n_splits
    print('Fold %2d AUC : (train): %.6f, (val): %.6f' % (n_fold + 1, 
        roc_auc_score(trn_y, trn_y_pred), roc_auc_score(val_y, oof_preds[val_idx])))
    
print('Full AUC score %.6f' % roc_auc_score(train_y, oof_preds)) 

(161756, 130) (26535, 130)




Fold  1 AUC : (train): 0.590448, (val): 0.575626




Fold  2 AUC : (train): 0.589088, (val): 0.581632




Fold  3 AUC : (train): 0.587221, (val): 0.586963




Fold  4 AUC : (train): 0.589171, (val): 0.578094




Fold  5 AUC : (train): 0.590127, (val): 0.576567
Full AUC score 0.579710


Save predictions to disk.

In [7]:
train_house_score = pd.DataFrame({'house_score':oof_preds}, index=train_ID)
test_house_score = pd.DataFrame({'house_score':sub_preds}, index=test_ID)
train_house_score.to_csv('../output/train_house_score.csv')
test_house_score.to_csv('../output/test_house_score.csv')

del combined, trn_x, trn_y, train_x, train_y, test_x
del train_house, test_house
gc.collect()

161

#### Another training ues credit score as target

In [8]:
house = [f_ for f_ in data.columns if ('AVG' in f_) | ('MEDI' in f_) | ('MODE' in f_) ]
combined = pd.concat([data,test],axis=0,sort=False)
combined_house = combined.loc[combined[house].isnull().sum(axis=1)<47][['REGION_POPULATION_RELATIVE','NAME_HOUSING_TYPE']+house]
y = combined[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].loc[combined[house].isnull().sum(axis=1)<47].mean(axis=1)
combined_ID = combined['SK_ID_CURR'].loc[combined[house].isnull().sum(axis=1)<47]

combined_house = combined_house.loc[y.notna()]
combined_ID = combined_ID.loc[y.notna()]
y = y.loc[y.notna()]
print (y.shape, combined_house.shape, combined_ID.shape)
combined_house.head()

(356076,) (356076, 34) (356076,)


Unnamed: 0,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LIVINGAPARTMENTS_MODE,...,FLOORSMAX_MODE_PP,FLOORSMIN_MODE_PP,LIVINGAPARTMENTS_MODE_PP,NONLIVINGAPARTMENTS_MODE_PP,BASEMENTAREA_MEDI_PP,COMMONAREA_MEDI_PP,LANDAREA_MEDI_PP,LIVINGAREA_MEDI_PP,NONLIVINGAREA_MEDI_PP,TOTALAREA_MODE_PP
0,0.018801,House / apartment,0.0252,0.9722,0.6341,0.0,0.069,0.0833,0.125,0.022,...,0.0833,0.125,0.022,0.0,0.0369,0.0144,0.0375,0.0193,0.0,0.0149
1,0.003541,House / apartment,0.0924,0.9851,0.804,0.0806,0.0345,0.2917,0.3333,0.079,...,0.14585,0.16665,0.0395,0.0,0.02645,0.0304,0.0066,0.0279,0.005,0.0357
2,0.010032,House / apartment,,,,,,,,,...,,,,,,,,,,
3,0.008019,House / apartment,,,,,,,,,...,,,,,,,,,,
4,0.028663,House / apartment,,,,,,,,,...,,,,,,,,,,


In [9]:
combined = combined_house
for f_ in combined.columns:
    if combined[f_].dtype == 'object':
        combined[f_].fillna(combined[f_].mode(), inplace=True)
    else:
        combined[f_].fillna(combined[f_].median(), inplace=True)
combined = pd.concat([combined, pd.get_dummies(combined['REGION_POPULATION_RELATIVE'], prefix='REGION')], axis=1)
for f_ in ['FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE','NAME_HOUSING_TYPE']:
    combined = pd.concat([combined, pd.get_dummies(combined[f_], prefix=f_)], axis=1)
    del combined[f_]

combined.head()

Unnamed: 0,REGION_POPULATION_RELATIVE,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LIVINGAPARTMENTS_MODE,NONLIVINGAPARTMENTS_MODE,...,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents
0,0.018801,0.0252,0.9722,0.6341,0.0,0.069,0.0833,0.125,0.022,0.0,...,1,0,1,0,0,1,0,0,0,0
1,0.003541,0.0924,0.9851,0.804,0.0806,0.0345,0.2917,0.3333,0.079,0.0,...,0,0,1,0,0,1,0,0,0,0
2,0.010032,0.084,0.9816,0.7648,0.0,0.1379,0.1667,0.2083,0.0771,0.0,...,0,0,0,0,0,1,0,0,0,0
3,0.008019,0.084,0.9816,0.7648,0.0,0.1379,0.1667,0.2083,0.0771,0.0,...,0,0,0,0,0,1,0,0,0,0
4,0.028663,0.084,0.9816,0.7648,0.0,0.1379,0.1667,0.2083,0.0771,0.0,...,0,0,0,0,0,1,0,0,0,0


This time we use Ridge because credit scores are continuous...

In [10]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

folds = KFold(n_splits=5, shuffle=True, random_state=546789)

train_x = combined
train_y = y

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x)):
    trn_x, val_x = train_x.iloc[trn_idx], train_x.iloc[val_idx]
    trn_y, val_y = train_y.iloc[trn_idx], train_y.iloc[val_idx]
     
    model = Ridge()    
    model.fit(trn_x, trn_y)
    
    trn_y_pred = model.predict(trn_x)
    val_y_pred = model.predict(val_x)
    print('Fold %2d r2_score : (train): %.6f, (val): %.6f' % (n_fold + 1, 
        r2_score(trn_y, trn_y_pred), r2_score(val_y, val_y_pred)))

model.fit(train_x, y)
pred_y = model.predict(train_x)
print('Full r2_score %.6f' % r2_score(train_y, pred_y)) 

Fold  1 r2_score : (train): 0.106178, (val): 0.112055
Fold  2 r2_score : (train): 0.108160, (val): 0.104122
Fold  3 r2_score : (train): 0.108014, (val): 0.104745
Fold  4 r2_score : (train): 0.107465, (val): 0.106880
Fold  5 r2_score : (train): 0.107725, (val): 0.105824
Full r2_score 0.107432


Save predictions to disk.

In [11]:
house_ex = pd.DataFrame({'house_score':pred_y}, index=combined_ID)
house_ex.to_csv('../output/house_ex.csv')

del combined, combined_house, train_x, train_y
gc.collect()

7

### Document features

Read document features from main train/test table.

In [12]:
doc = [f_ for f_ in data.columns if ('FLAG_DOCUMENT' in f_) ]
train_x = data[doc]
test_x = test[doc]
train_y = data['TARGET']
train_ID = data['SK_ID_CURR']
test_ID = test['SK_ID_CURR']
train_x.head()

Unnamed: 0,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


Training with logistic regression.

In [14]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=546789)
oof_preds = np.zeros(train_x.shape[0])
sub_preds = np.zeros(test_x.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    trn_x, val_x = train_x.iloc[trn_idx], train_x.iloc[val_idx]
    trn_y, val_y = train_y.iloc[trn_idx], train_y.iloc[val_idx]
     
    model = LogisticRegression(C=0.1)
    model.fit(trn_x, trn_y)
    
    trn_y_pred = model.predict_proba(trn_x)[:,1]
    oof_preds[val_idx] = model.predict_proba(val_x)[:,1]
    sub_preds += model.predict_proba(test_x)[:,1] / folds.n_splits
    print('Fold %2d AUC : (train): %.6f, (val): %.6f' % (n_fold + 1, 
        roc_auc_score(trn_y, trn_y_pred), roc_auc_score(val_y, oof_preds[val_idx])))
    
print('Full AUC score %.6f' % roc_auc_score(train_y, oof_preds)) 



Fold  1 AUC : (train): 0.545659, (val): 0.541470




Fold  2 AUC : (train): 0.544123, (val): 0.547888




Fold  3 AUC : (train): 0.544387, (val): 0.546828




Fold  4 AUC : (train): 0.544828, (val): 0.544968




Fold  5 AUC : (train): 0.545709, (val): 0.541570
Full AUC score 0.543417


Save predictions to disk.

In [16]:
train_doc_score = pd.DataFrame({'doc_score':oof_preds}, index=train_ID)
test_doc_score = pd.DataFrame({'doc_score':sub_preds}, index=test_ID)
train_doc_score.to_csv('../output/train_doc_score.csv')
test_doc_score.to_csv('../output/test_doc_score.csv')

del trn_x, trn_y, train_x, train_y, test_x
gc.collect()

90