In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
#from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
#from sklearn.ensemble import VotingClassifier

In [22]:
train = pd.read_csv("trainingData.csv")

In [23]:
train.head()

Unnamed: 0,testindex,outcome,distance,visit_count,claims_daysaway,tier,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address
0,0,0,0.071912,0,,2.0,0,0,0,0,0,1,0,0
1,2,0,0.448447,0,,1.0,0,0,1,1,1,0,0,0
2,5,0,0.345793,0,,1.0,0,1,0,0,0,1,0,0
3,6,0,1.857032,0,,1.0,0,0,1,1,1,1,0,0
4,7,1,7.22206,0,,1.0,0,0,1,1,0,0,0,0


In [24]:
train.drop(['testindex','outcome'], axis =1).describe()

Unnamed: 0,distance,visit_count,claims_daysaway,tier,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address
count,166573.0,166573.0,29417.0,166054.0,166573.0,166573.0,166573.0,166573.0,166573.0,166573.0,166573.0,166573.0
mean,2.818446,1.063732,140.967434,1.790068,0.18793,0.442449,0.331812,0.373062,0.390003,0.508822,0.064884,0.062753
std,5.636061,3.217193,126.741228,0.971943,0.390657,0.496678,0.470865,0.48362,0.487752,0.499924,0.246323,0.242519
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.47426,0.0,49.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.065102,0.0,105.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2.227714,0.0,191.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
max,50.0,108.0,749.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166573 entries, 0 to 166572
Data columns (total 14 columns):
testindex            166573 non-null int64
outcome              166573 non-null int64
distance             166573 non-null float64
visit_count          166573 non-null int64
claims_daysaway      29417 non-null float64
tier                 166054 non-null float64
fqhc                 166573 non-null int64
pcp_lookback         166573 non-null int64
family_assignment    166573 non-null int64
kid                  166573 non-null int64
is_ped               166573 non-null int64
same_gender          166573 non-null int64
same_language        166573 non-null int64
same_address         166573 non-null int64
dtypes: float64(3), int64(11)
memory usage: 17.8 MB


In [26]:
#populating Nan value with mean
train['tier'].fillna(round(train['tier'].mean(),2),inplace=True)

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166573 entries, 0 to 166572
Data columns (total 14 columns):
testindex            166573 non-null int64
outcome              166573 non-null int64
distance             166573 non-null float64
visit_count          166573 non-null int64
claims_daysaway      29417 non-null float64
tier                 166573 non-null float64
fqhc                 166573 non-null int64
pcp_lookback         166573 non-null int64
family_assignment    166573 non-null int64
kid                  166573 non-null int64
is_ped               166573 non-null int64
same_gender          166573 non-null int64
same_language        166573 non-null int64
same_address         166573 non-null int64
dtypes: float64(3), int64(11)
memory usage: 17.8 MB


In [28]:
X = train.drop(['testindex','outcome','claims_daysaway'], axis=1)
y = train['outcome']

In [29]:
print(X.shape)
print(y.shape)

(166573, 11)
(166573,)


# Scaling

## StandardScaler

In [30]:
# scaling 3 features

sc = StandardScaler()
X['scaled_distance']= sc.fit_transform(X['distance'].values.reshape(-1,1))
X['scaled_visit_count'] = sc.fit_transform(X['visit_count'].values.reshape(-1,1))
X['scaled_tier'] = sc.fit_transform(X['tier'].values.reshape(-1,1))




In [31]:
# dropping unscaled columns
X = X.drop(['distance','visit_count','tier'],axis=1)

In [32]:
X.columns

Index(['fqhc', 'pcp_lookback', 'family_assignment', 'kid', 'is_ped',
       'same_gender', 'same_language', 'same_address', 'scaled_distance',
       'scaled_visit_count', 'scaled_tier'],
      dtype='object')

In [33]:
# highly imbalanced
y.value_counts()

0    159730
1      6843
Name: outcome, dtype: int64

### train test split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [35]:
print(X_train.shape)
print(y_train.shape)

(149915, 11)
(149915,)


# Hypertuning 

In [None]:
#Use large max_bin (may be slower)
#Use small learning_rate with large num_iterations
#Use large num_leaves (may cause over-fitting)
#Use bigger training data
#Try dart

In [36]:
lgbm = LGBMClassifier( 
    boosting_type="gbdt",
    is_unbalance=True, 
    random_state=10, 
    n_estimators=100,
    num_leaves=30, 
    max_depth=8,
    feature_fraction=0.5,  
    bagging_fraction=0.8, 
    bagging_freq=15, 
    learning_rate=0.01,    
)

In [37]:
params_opt = {'n_estimators':range(200, 600, 80), 'num_leaves':range(20,100,20), 'max_depth':range(3,9,2)}
gridSearchCV = GridSearchCV(estimator = lgbm, 
    param_grid = params_opt, 
    scoring='roc_auc',
    n_jobs=4,
    iid=False, 
    verbose=1,
    cv=3)
gridSearchCV.fit(X_train,y_train)
gridSearchCV.best_params_, gridSearchCV.best_score_

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 12.5min
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed: 108.5min finished


({'max_depth': 7, 'n_estimators': 520, 'num_leaves': 40}, 0.8211326377623914)

In [38]:
gridSearchCV.best_params_, gridSearchCV.best_score_

({'max_depth': 7, 'n_estimators': 520, 'num_leaves': 40}, 0.8211326377623914)

### LGBMClassifier

In [39]:
lgbm = LGBMClassifier( 
    boosting_type="gbdt",
    is_unbalance=True, 
    random_state=10, 
    n_estimators=520,
    num_leaves=40, 
    max_depth=7,
    feature_fraction=0.5,  
    bagging_fraction=0.8, 
    bagging_freq=15, 
    learning_rate=0.01,    
)

In [40]:
lgbm.fit(X_train,y_train)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=15, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
        importance_type='split', is_unbalance=True, learning_rate=0.01,
        max_depth=7, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=520, n_jobs=-1, num_leaves=40,
        objective=None, random_state=10, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [41]:
y_pred = lgbm.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = lgbm.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[13246  2728]
 [  222   462]]
              precision    recall  f1-score   support

           0       0.98      0.83      0.90     15974
           1       0.14      0.68      0.24       684

   micro avg       0.82      0.82      0.82     16658
   macro avg       0.56      0.75      0.57     16658
weighted avg       0.95      0.82      0.87     16658

auc score : 0.8214332848627558


### Stack Ensemble

In [132]:
clf1 = LGBMClassifier(random_state=0)
clf2 = CatBoostClassifier(random_state=0)
clf3 = XGBClassifier(random_state=0)
votf = VotingClassifier(estimators=[('lgb', clf1),('cat', clf2),('xgb', clf3)], voting='soft')
votf.fit(X_train, y_train)

Learning rate set to 0.069165
0:	learn: 0.5901213	total: 124ms	remaining: 2m 3s
1:	learn: 0.5006134	total: 266ms	remaining: 2m 12s
2:	learn: 0.4372946	total: 428ms	remaining: 2m 22s
3:	learn: 0.3813333	total: 545ms	remaining: 2m 15s
4:	learn: 0.3404567	total: 680ms	remaining: 2m 15s
5:	learn: 0.3085829	total: 813ms	remaining: 2m 14s
6:	learn: 0.2788907	total: 1.06s	remaining: 2m 30s
7:	learn: 0.2541682	total: 1.24s	remaining: 2m 34s
8:	learn: 0.2336540	total: 1.43s	remaining: 2m 37s
9:	learn: 0.2174125	total: 1.59s	remaining: 2m 37s
10:	learn: 0.2036643	total: 1.81s	remaining: 2m 42s
11:	learn: 0.1933775	total: 1.92s	remaining: 2m 38s
12:	learn: 0.1845092	total: 2.05s	remaining: 2m 35s
13:	learn: 0.1768787	total: 2.19s	remaining: 2m 33s
14:	learn: 0.1705776	total: 2.32s	remaining: 2m 32s
15:	learn: 0.1669138	total: 2.45s	remaining: 2m 30s
16:	learn: 0.1622696	total: 2.57s	remaining: 2m 28s
17:	learn: 0.1588575	total: 2.7s	remaining: 2m 27s
18:	learn: 0.1551035	total: 2.86s	remaining: 2

158:	learn: 0.1300505	total: 24.3s	remaining: 2m 8s
159:	learn: 0.1300374	total: 24.4s	remaining: 2m 8s
160:	learn: 0.1300321	total: 24.6s	remaining: 2m 8s
161:	learn: 0.1300181	total: 24.7s	remaining: 2m 7s
162:	learn: 0.1300086	total: 24.9s	remaining: 2m 7s
163:	learn: 0.1300055	total: 25s	remaining: 2m 7s
164:	learn: 0.1299842	total: 25.2s	remaining: 2m 7s
165:	learn: 0.1299722	total: 25.3s	remaining: 2m 7s
166:	learn: 0.1299595	total: 25.4s	remaining: 2m 6s
167:	learn: 0.1299524	total: 25.6s	remaining: 2m 6s
168:	learn: 0.1299369	total: 25.8s	remaining: 2m 6s
169:	learn: 0.1299282	total: 25.9s	remaining: 2m 6s
170:	learn: 0.1299017	total: 26s	remaining: 2m 6s
171:	learn: 0.1298865	total: 26.1s	remaining: 2m 5s
172:	learn: 0.1298677	total: 26.3s	remaining: 2m 5s
173:	learn: 0.1298599	total: 26.5s	remaining: 2m 5s
174:	learn: 0.1298412	total: 26.6s	remaining: 2m 5s
175:	learn: 0.1298010	total: 26.8s	remaining: 2m 5s
176:	learn: 0.1297950	total: 26.9s	remaining: 2m 5s
177:	learn: 0.12

315:	learn: 0.1278483	total: 48.2s	remaining: 1m 44s
316:	learn: 0.1278418	total: 48.3s	remaining: 1m 44s
317:	learn: 0.1278345	total: 48.4s	remaining: 1m 43s
318:	learn: 0.1278184	total: 48.6s	remaining: 1m 43s
319:	learn: 0.1278063	total: 48.7s	remaining: 1m 43s
320:	learn: 0.1277989	total: 48.8s	remaining: 1m 43s
321:	learn: 0.1277849	total: 49s	remaining: 1m 43s
322:	learn: 0.1277730	total: 49.1s	remaining: 1m 42s
323:	learn: 0.1277684	total: 49.3s	remaining: 1m 42s
324:	learn: 0.1277525	total: 49.4s	remaining: 1m 42s
325:	learn: 0.1277420	total: 49.5s	remaining: 1m 42s
326:	learn: 0.1277285	total: 49.6s	remaining: 1m 42s
327:	learn: 0.1277171	total: 49.8s	remaining: 1m 41s
328:	learn: 0.1277103	total: 49.9s	remaining: 1m 41s
329:	learn: 0.1277053	total: 50.1s	remaining: 1m 41s
330:	learn: 0.1276950	total: 50.2s	remaining: 1m 41s
331:	learn: 0.1276785	total: 50.4s	remaining: 1m 41s
332:	learn: 0.1276681	total: 50.5s	remaining: 1m 41s
333:	learn: 0.1276596	total: 50.7s	remaining: 1m

471:	learn: 0.1262227	total: 1m 11s	remaining: 1m 20s
472:	learn: 0.1262156	total: 1m 12s	remaining: 1m 20s
473:	learn: 0.1262057	total: 1m 12s	remaining: 1m 20s
474:	learn: 0.1261952	total: 1m 12s	remaining: 1m 20s
475:	learn: 0.1261913	total: 1m 12s	remaining: 1m 19s
476:	learn: 0.1261818	total: 1m 12s	remaining: 1m 19s
477:	learn: 0.1261725	total: 1m 12s	remaining: 1m 19s
478:	learn: 0.1261640	total: 1m 12s	remaining: 1m 19s
479:	learn: 0.1261625	total: 1m 13s	remaining: 1m 19s
480:	learn: 0.1261555	total: 1m 13s	remaining: 1m 19s
481:	learn: 0.1261401	total: 1m 13s	remaining: 1m 18s
482:	learn: 0.1261373	total: 1m 13s	remaining: 1m 18s
483:	learn: 0.1261226	total: 1m 13s	remaining: 1m 18s
484:	learn: 0.1261086	total: 1m 13s	remaining: 1m 18s
485:	learn: 0.1260938	total: 1m 13s	remaining: 1m 18s
486:	learn: 0.1260866	total: 1m 14s	remaining: 1m 17s
487:	learn: 0.1260763	total: 1m 14s	remaining: 1m 17s
488:	learn: 0.1260650	total: 1m 14s	remaining: 1m 17s
489:	learn: 0.1260558	total:

626:	learn: 0.1249055	total: 1m 34s	remaining: 56s
627:	learn: 0.1248957	total: 1m 34s	remaining: 55.9s
628:	learn: 0.1248764	total: 1m 34s	remaining: 55.8s
629:	learn: 0.1248709	total: 1m 34s	remaining: 55.7s
630:	learn: 0.1248545	total: 1m 35s	remaining: 55.6s
631:	learn: 0.1248489	total: 1m 35s	remaining: 55.5s
632:	learn: 0.1248421	total: 1m 35s	remaining: 55.3s
633:	learn: 0.1248355	total: 1m 35s	remaining: 55.1s
634:	learn: 0.1248261	total: 1m 35s	remaining: 55s
635:	learn: 0.1248184	total: 1m 35s	remaining: 54.8s
636:	learn: 0.1248082	total: 1m 35s	remaining: 54.7s
637:	learn: 0.1248034	total: 1m 36s	remaining: 54.5s
638:	learn: 0.1247978	total: 1m 36s	remaining: 54.4s
639:	learn: 0.1247960	total: 1m 36s	remaining: 54.2s
640:	learn: 0.1247931	total: 1m 36s	remaining: 54.1s
641:	learn: 0.1247824	total: 1m 36s	remaining: 53.9s
642:	learn: 0.1247758	total: 1m 36s	remaining: 53.8s
643:	learn: 0.1247715	total: 1m 37s	remaining: 53.6s
644:	learn: 0.1247600	total: 1m 37s	remaining: 53.

783:	learn: 0.1236691	total: 1m 59s	remaining: 33s
784:	learn: 0.1236546	total: 1m 59s	remaining: 32.8s
785:	learn: 0.1236477	total: 1m 59s	remaining: 32.7s
786:	learn: 0.1236458	total: 2m	remaining: 32.5s
787:	learn: 0.1236367	total: 2m	remaining: 32.3s
788:	learn: 0.1236313	total: 2m	remaining: 32.2s
789:	learn: 0.1236271	total: 2m	remaining: 32s
790:	learn: 0.1236230	total: 2m	remaining: 31.9s
791:	learn: 0.1236146	total: 2m	remaining: 31.7s
792:	learn: 0.1236071	total: 2m	remaining: 31.6s
793:	learn: 0.1236001	total: 2m 1s	remaining: 31.4s
794:	learn: 0.1235898	total: 2m 1s	remaining: 31.3s
795:	learn: 0.1235814	total: 2m 1s	remaining: 31.1s
796:	learn: 0.1235690	total: 2m 1s	remaining: 30.9s
797:	learn: 0.1235645	total: 2m 1s	remaining: 30.8s
798:	learn: 0.1235569	total: 2m 1s	remaining: 30.6s
799:	learn: 0.1235538	total: 2m 1s	remaining: 30.5s
800:	learn: 0.1235470	total: 2m 2s	remaining: 30.3s
801:	learn: 0.1235436	total: 2m 2s	remaining: 30.2s
802:	learn: 0.1235385	total: 2m 2s

940:	learn: 0.1225493	total: 2m 22s	remaining: 8.94s
941:	learn: 0.1225438	total: 2m 22s	remaining: 8.79s
942:	learn: 0.1225413	total: 2m 22s	remaining: 8.64s
943:	learn: 0.1225282	total: 2m 23s	remaining: 8.49s
944:	learn: 0.1225198	total: 2m 23s	remaining: 8.34s
945:	learn: 0.1225125	total: 2m 23s	remaining: 8.19s
946:	learn: 0.1225058	total: 2m 23s	remaining: 8.03s
947:	learn: 0.1224924	total: 2m 23s	remaining: 7.88s
948:	learn: 0.1224888	total: 2m 23s	remaining: 7.73s
949:	learn: 0.1224819	total: 2m 23s	remaining: 7.58s
950:	learn: 0.1224748	total: 2m 24s	remaining: 7.42s
951:	learn: 0.1224684	total: 2m 24s	remaining: 7.27s
952:	learn: 0.1224535	total: 2m 24s	remaining: 7.12s
953:	learn: 0.1224465	total: 2m 24s	remaining: 6.97s
954:	learn: 0.1224375	total: 2m 24s	remaining: 6.82s
955:	learn: 0.1224269	total: 2m 24s	remaining: 6.66s
956:	learn: 0.1224220	total: 2m 24s	remaining: 6.51s
957:	learn: 0.1224192	total: 2m 25s	remaining: 6.36s
958:	learn: 0.1224131	total: 2m 25s	remaining:

VotingClassifier(estimators=[('lgb', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [133]:
y_pred = votf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = votf.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[15936    38]
 [  601    83]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     15974
           1       0.69      0.12      0.21       684

   micro avg       0.96      0.96      0.96     16658
   macro avg       0.82      0.56      0.59     16658
weighted avg       0.95      0.96      0.95     16658

auc score : 0.8213027730734959


## XGBClassifier

In [22]:
clf = XGBClassifier()
grid = GridSearchCV(clf,
                    params, n_jobs=-1,
                    scoring="roc_auc",
                    cv=3)

grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

Best: 0.819249 using {'learning_rate': 0.1, 'n_estimators': 200, 'scale_pos_weight': 15}


In [23]:
xgb = XGBClassifier(random_state=0, scale_pos_weight=15, n_estimators=200 )
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=15, seed=None,
       silent=True, subsample=1)

In [24]:
y_pred = xgb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = xgb.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[14209  1765]
 [  279   405]]
              precision    recall  f1-score   support

           0       0.98      0.89      0.93     15974
           1       0.19      0.59      0.28       684

   micro avg       0.88      0.88      0.88     16658
   macro avg       0.58      0.74      0.61     16658
weighted avg       0.95      0.88      0.91     16658

auc score : 0.8212534421797996


### RandomForestClassifier

In [93]:
class_weight={0:1,1:500}
rf = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight=class_weight)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 500},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [94]:
y_pred = rf.predict(X_test)

In [95]:
### confusion matrix and classification report

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

[[46899  1020]
 [ 1658   395]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     47919
           1       0.28      0.19      0.23      2053

   micro avg       0.95      0.95      0.95     49972
   macro avg       0.62      0.59      0.60     49972
weighted avg       0.94      0.95      0.94     49972



In [96]:
from sklearn.metrics import roc_curve

y_pred_prob = rf.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))


auc score : 0.6819380736328811


### Oversampling using SMOTE and RandomClassifier

In [97]:
smote = SMOTE()
X_train_sample, y_train_sample = smote.fit_sample(X_train, y_train)

In [98]:
class_weight={0:1,1:500}
rf1 = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight=class_weight)
rf1.fit(X_train_sample, y_train_sample)



RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 500},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [100]:
y_pred = rf1.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = rf.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[39877  8042]
 [ 1112   941]]
              precision    recall  f1-score   support

           0       0.97      0.83      0.90     47919
           1       0.10      0.46      0.17      2053

   micro avg       0.82      0.82      0.82     49972
   macro avg       0.54      0.65      0.53     49972
weighted avg       0.94      0.82      0.87     49972

auc score : 0.6819380736328811


In [102]:
undersample = RandomUnderSampler()
X_train_under, y_train_under = undersample.fit_sample(X_train, y_train)

In [106]:
rf2 = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight='balanced')
rf2.fit(X_train_under, y_train_under)



RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [120]:
y_pred = rf2.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = rf2.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[34485 13434]
 [  669  1384]]
              precision    recall  f1-score   support

           0       0.98      0.72      0.83     47919
           1       0.09      0.67      0.16      2053

   micro avg       0.72      0.72      0.72     49972
   macro avg       0.54      0.70      0.50     49972
weighted avg       0.94      0.72      0.80     49972

auc score : 0.758768747273201


## Unknown TEST dataset

In [53]:
test = pd.read_csv("scoringData.csv")
test['tier'].unique()

array([ 1.,  3.,  2.,  4., nan])

In [54]:
# populating null values
test['tier']= test['tier'].fillna(round(test['tier'].mean(),2))

In [55]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71388 entries, 0 to 71387
Data columns (total 13 columns):
testindex            71388 non-null int64
distance             71388 non-null float64
visit_count          71388 non-null int64
claims_daysaway      12783 non-null float64
tier                 71388 non-null float64
fqhc                 71388 non-null int64
pcp_lookback         71388 non-null int64
family_assignment    71388 non-null int64
kid                  71388 non-null int64
is_ped               71388 non-null int64
same_gender          71388 non-null int64
same_language        71388 non-null int64
same_address         71388 non-null int64
dtypes: float64(3), int64(10)
memory usage: 7.1 MB


In [56]:
X = test.drop(['testindex','claims_daysaway'], axis=1)

In [57]:
print(X.shape)

(71388, 11)


#### scaling dataset 

In [58]:
X['scaled_distance']= sc.fit_transform(X['distance'].values.reshape(-1,1))
X['scaled_visit_count'] = sc.fit_transform(X['visit_count'].values.reshape(-1,1))
X['scaled_tier'] = sc.fit_transform(X['tier'].values.reshape(-1,1))



In [59]:
# dropping non scaled columns now
X = X.drop(['distance','visit_count','tier'],axis=1)

In [47]:
X.head()

Unnamed: 0,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address,scaled_distance,scaled_visit_count,scaled_tier
0,1,0,0,1,1,1,0,0,-0.454209,-0.339068,-0.813053
1,0,0,1,1,1,0,0,0,-0.110327,-0.339068,1.244346
2,0,0,0,0,0,0,0,0,0.653442,-0.339068,0.215646
3,1,0,1,0,0,1,0,0,-0.480768,-0.339068,-0.813053
4,0,0,1,1,1,0,0,0,-0.405588,-0.339068,1.244346


In [60]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71388 entries, 0 to 71387
Data columns (total 11 columns):
fqhc                  71388 non-null int64
pcp_lookback          71388 non-null int64
family_assignment     71388 non-null int64
kid                   71388 non-null int64
is_ped                71388 non-null int64
same_gender           71388 non-null int64
same_language         71388 non-null int64
same_address          71388 non-null int64
scaled_distance       71388 non-null float64
scaled_visit_count    71388 non-null float64
scaled_tier           71388 non-null float64
dtypes: float64(3), int64(8)
memory usage: 6.0 MB


In [65]:
y_test_pred = rf1.predict(X)

In [66]:
pd.value_counts(pd.Series(y_test_pred))

0    57045
1    14343
dtype: int64

In [125]:
### Generating output file using XGB model

In [121]:
y_xgb_pred = xgb.predict(X)

In [124]:
X.head()

Unnamed: 0,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address,scaled_distance,scaled_visit_count,scaled_tier
0,1,0,0,1,1,1,0,0,-0.454209,-0.339068,-0.814388
1,0,0,1,1,1,0,0,0,-0.110327,-0.339068,1.246391
2,0,0,0,0,0,0,0,0,0.653442,-0.339068,0.216002
3,1,0,1,0,0,1,0,0,-0.480768,-0.339068,-0.814388
4,0,0,1,1,1,0,0,0,-0.405588,-0.339068,1.246391


In [122]:
pd.value_counts(pd.Series(y_xgb_pred))

0    70970
1      418
dtype: int64

In [123]:
pd.DataFrame({'testindex': test['testindex'], 'Predicted':y_xgb_pred}).to_csv("output_xgb.csv", index=False)