In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [116]:
sipp_2018 = pd.read_csv('../data/interim/sipp2018_person/sipp2018_person.csv')

In [114]:
# collapse eeduc categories

m1 = {5:[31,32,33,34,35,36,37,38], 
      6:[39],
      7:[40,41,42],
      8:[45],
      9:[43, 44, 46],
     }
m2 = {v: k for k,vv in m1.items() for v in vv}
m2
sipp_2018['EEDUC_X'] = (sipp_2018
                        .EEDUC
                        .map(m2)
                        .astype('category')
                       )
sipp_2018.EEDUC_X.value_counts()

9    14479
7    14293
6    14176
5     6000
8      756
Name: EEDUC_X, dtype: int64

In [122]:
stock_value_features = list(sipp_2018.filter(like='_STMF').columns)
id_features = ['SSUID', 'PNUM', 'EOWN_ST']
drop_features = id_features + stock_value_features

# Initialize Independent and Target Features 
y = sipp_2018.EOWN_ST
X = sipp_2018.drop(drop_features, axis='columns')

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=23)

In [124]:
train_sample_weight = X_train.WPFINWGT

In [125]:
# Return df with all features with % missing values more than 70%
desc = X_train.describe().T
desc = (desc
        .assign(percent_missing = 1 - (desc['count'] / len(X_train)))
        .percent_missing
        .sort_values(ascending=False)
        .to_frame()
        .query('percent_missing > 0')
       )
desc

Unnamed: 0,percent_missing
EJB6_PFTLOSS,1.000000
EJB6_JBORSE,1.000000
EJB6_WSHMMON,1.000000
EJB6_TYPPAY4,1.000000
RPAR,1.000000
...,...
ETRANS_MNYN,0.001207
EOTHAS_MNYN,0.001207
EFOOD_MNYN,0.001207
TEHC_REGION,0.001207


In [126]:
high_nan_features = list(desc.T.columns)

# Transform train set
X_train = X_train.drop(high_nan_features, axis='columns')
print(f'X_train shape: {X_train.shape}')

# Transform test set
X_test = X_test.drop(high_nan_features, axis='columns')
print(f'X_test shape: {X_test.shape}')

X_train shape: (39763, 287)
X_test shape: (9941, 287)


In [127]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [120]:
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier())

In [128]:
sel.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True,  True,  True, False, False, False,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True, False, False, False,
       False, False, False,  True,  True,  True,  True, False, False,
       False,  True,

In [129]:
select_features = X_train.columns[(sel.get_support())]

In [130]:
len(select_features)

74

In [131]:
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train, sample_weight=train_sample_weight)

In [132]:
feature_importances = pd.DataFrame(rf.feature_importances_, 
                                   index=X_train.columns, columns=['importance'])
f1 = feature_importances.sort_values('importance',ascending=False)
f1.assign(cumsum = f1.cumsum(axis=0)).head()

Unnamed: 0,importance,cumsum
TINC_AST,0.075685,0.075685
TPPRPINC,0.062754,0.138439
EOWN_MF,0.039833,0.178272
THINC_AST,0.03689,0.215162
TVAL_AST,0.027236,0.242398


In [105]:
y_train.value_counts(normalize=True)

2    0.884289
1    0.115711
Name: EOWN_ST, dtype: float64

In [133]:
y_pred = rf.predict(X_train)

In [134]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_train, y_pred)
print(cm)

[[ 4598     3]
 [    1 35161]]


In [135]:
accuracy_score(y_train, y_pred)

0.9998994039685134

In [136]:
y_pred_test = rf.predict(X_test)


In [137]:
cm2 = confusion_matrix(y_test, pd.Series(y_pred_test))
print(cm2)

[[ 566  610]
 [ 194 8571]]


In [138]:
accuracy_score(y_test, y_pred_test)

0.9191228246655266

In [139]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           1       0.74      0.48      0.58      1176
           2       0.93      0.98      0.96      8765

    accuracy                           0.92      9941
   macro avg       0.84      0.73      0.77      9941
weighted avg       0.91      0.92      0.91      9941

