In [1]:
import pandas as pd
import numpy as np
#from sklearn.feature_selection import VarianceThreshold
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
#import matplotlib.pyplot as plt
#import seaborn as sns
%matplotlib inline

In [2]:
sipp_2018 = pd.read_csv('../data/interim/sipp2018_person/sipp2018_person.csv')

In [3]:
# collapse eeduc categories

m1 = {5:[31,32,33,34,35,36,37,38], 
      6:[39],
      7:[40,41,42],
      8:[45],
      9:[43],
      10:[44,46],
     }
m2 = {v: k for k,vv in m1.items() for v in vv}
m2
sipp_2018['EEDUC_X'] = (sipp_2018
                        .EEDUC
                        .map(m2)
                        .astype('category')
                       )
sipp_2018.EEDUC_X.value_counts()

7     14293
6     14176
9      9524
5      6000
10     4955
8       756
Name: EEDUC_X, dtype: int64

In [4]:
# Initialize Independent and Target Features 
y = sipp_2018.EOWN_ST
X = sipp_2018.drop(['SSUID', 'PNUM', 'EOWN_ST'], axis='columns')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=23)

In [6]:
# Return df with all features with % missing values more than 70%
desc = X_train.describe().T
desc = (desc
        .assign(percent_missing = 1 - (desc['count'] / len(X_train)))
        .percent_missing
        .sort_values(ascending=False)
        .to_frame()
        .query('percent_missing > 0')
       )
desc

Unnamed: 0,percent_missing
TWKHRS5,1.000000
EJB6_WSHMSAT,1.000000
EJB6_JBORSE,1.000000
EJB5_PROPB,1.000000
EDDELAY,1.000000
...,...
ETRANS_MNYN,0.001207
TEHC_REGION,0.001207
EOTHAS_MNYN,0.001207
EFOOD_MNYN,0.001207


In [7]:
# Read Metadata
sipp_dict_1 = pd.read_csv('../data/raw/sipp_2018/sippdict_1_of_2.csv')
sipp_dict_2 = pd.read_csv('../data/raw/sipp_2018/sippdict_2_of_2.csv')
sipp_dict = (pd.concat([sipp_dict_1, sipp_dict_2])
             .set_index('Variable')
             [['Description', 'Topic','Response Code']]
            )
# Inner Join High NaN with metadata dict
high_nan_metadata = pd.merge(sipp_dict, desc, left_index=True, right_index=True, how='inner')
high_nan_metadata.to_csv('../data/interim/sipp2018_person/FOR_REVIEW_high_NaN_features.csv', index=True)


In [8]:
high_nan_features = list(desc.T.columns)

# Transform train set
X_train = X_train.drop(high_nan_features, axis='columns')
print(f'X_train shape: {X_train.shape}')

# Transform test set
X_test = X_test.drop(high_nan_features, axis='columns')
print(f'X_test shape: {X_test.shape}')

X_train shape: (39763, 291)
X_test shape: (9941, 291)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [10]:
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier())

In [11]:
sel.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True,  True,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
        True, False,  True, False, False,  True, False,  True, False,
        True, False,  True,  True,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,

In [12]:
select_features = X_train.columns[(sel.get_support())]

In [13]:
select_features

Index(['EOWN_MM', 'EOWN_MF', 'EOWN_IRAKEO', 'TINC_BANK', 'TVAL_BANK',
       'TINC_STMF', 'TVAL_STMF', 'TVAL_BOND', 'TINC_AST', 'TVAL_RET',
       'TVAL_HOME', 'TEQ_HOME', 'TEQ_VEH', 'TVAL_RMU', 'TVAL_AST', 'TNETWORTH',
       'THINC_BANK', 'THVAL_BANK', 'THINC_STMF', 'THVAL_STMF', 'THINC_AST',
       'THVAL_RET', 'THEQ_VEH', 'THVAL_RMU', 'THVAL_AST', 'THNETWORTH',
       'TAGE_EHC', 'TPPRPINC', 'TPTOTINC', 'TFINCPOV', 'TFCYINCPOV',
       'TFCYINCPOVT2', 'THCYINCPOV', 'THCYINCPOVT2'],
      dtype='object')

In [43]:
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

In [71]:
feature_importances = pd.DataFrame(rf.feature_importances_, 
                                   index=X_train.columns, columns=['importance'])
f1 = feature_importances.sort_values('importance',ascending=False)
f1.assign(cumsum = f1.cumsum(axis=0)).head(15)

Unnamed: 0,importance,cumsum
TVAL_STMF,0.204536,0.204536
TINC_STMF,0.134824,0.33936
THVAL_STMF,0.113538,0.452899
THINC_STMF,0.044274,0.497173
EOWN_MF,0.040589,0.537762
TINC_AST,0.031747,0.569509
TPPRPINC,0.027203,0.596712
TNETWORTH,0.020664,0.617376
THINC_AST,0.019441,0.636818
TVAL_BANK,0.012895,0.649713


In [16]:
y_train.value_counts(normalize=True)

2    0.884289
1    0.115711
Name: EOWN_ST, dtype: float64

In [21]:
y_pred = rf.predict(X_train)

In [59]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_train, y_pred)
print(cm)

[[ 4601     0]
 [    0 35162]]


In [55]:
accuracy_score(y_train, y_pred)

1.0

In [47]:
y_pred_test = rf.predict(X_test)


In [61]:
cm2 = confusion_matrix(y_test, pd.Series(y_pred_test))
print(cm2)

[[1125   51]
 [ 226 8539]]


In [62]:
accuracy_score(y_test, y_pred_test)

0.9721356000402374

In [73]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           1       0.83      0.96      0.89      1176
           2       0.99      0.97      0.98      8765

    accuracy                           0.97      9941
   macro avg       0.91      0.97      0.94      9941
weighted avg       0.97      0.97      0.97      9941

