In [111]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [4]:
## Models to try 
# SVC with balanced 
# SMOTE with SVC
# GridSearchCV with SVC
# Random Classifier
# Boosting -- XGBoost

In [109]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/51/c1/198915b13e98b62a98f48309c41012638464651da755d941f4abe384c012/xgboost-0.82-py2.py3-none-win_amd64.whl (7.7MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.82


In [5]:
train = pd.read_csv("trainingData.csv")

In [6]:
train.head()

Unnamed: 0,testindex,outcome,distance,visit_count,claims_daysaway,tier,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address
0,0,0,0.071912,0,,2.0,0,0,0,0,0,1,0,0
1,2,0,0.448447,0,,1.0,0,0,1,1,1,0,0,0
2,5,0,0.345793,0,,1.0,0,1,0,0,0,1,0,0
3,6,0,1.857032,0,,1.0,0,0,1,1,1,1,0,0
4,7,1,7.22206,0,,1.0,0,0,1,1,0,0,0,0


In [7]:
train.drop(['testindex','outcome'], axis =1).describe()

Unnamed: 0,distance,visit_count,claims_daysaway,tier,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address
count,166573.0,166573.0,29417.0,166054.0,166573.0,166573.0,166573.0,166573.0,166573.0,166573.0,166573.0,166573.0
mean,2.818446,1.063732,140.967434,1.790068,0.18793,0.442449,0.331812,0.373062,0.390003,0.508822,0.064884,0.062753
std,5.636061,3.217193,126.741228,0.971943,0.390657,0.496678,0.470865,0.48362,0.487752,0.499924,0.246323,0.242519
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.47426,0.0,49.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.065102,0.0,105.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2.227714,0.0,191.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
max,50.0,108.0,749.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166573 entries, 0 to 166572
Data columns (total 14 columns):
testindex            166573 non-null int64
outcome              166573 non-null int64
distance             166573 non-null float64
visit_count          166573 non-null int64
claims_daysaway      29417 non-null float64
tier                 166054 non-null float64
fqhc                 166573 non-null int64
pcp_lookback         166573 non-null int64
family_assignment    166573 non-null int64
kid                  166573 non-null int64
is_ped               166573 non-null int64
same_gender          166573 non-null int64
same_language        166573 non-null int64
same_address         166573 non-null int64
dtypes: float64(3), int64(11)
memory usage: 17.8 MB


In [9]:
train['tier'].fillna(round(train['tier'].mean(),2),inplace=True)

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166573 entries, 0 to 166572
Data columns (total 14 columns):
testindex            166573 non-null int64
outcome              166573 non-null int64
distance             166573 non-null float64
visit_count          166573 non-null int64
claims_daysaway      29417 non-null float64
tier                 166573 non-null float64
fqhc                 166573 non-null int64
pcp_lookback         166573 non-null int64
family_assignment    166573 non-null int64
kid                  166573 non-null int64
is_ped               166573 non-null int64
same_gender          166573 non-null int64
same_language        166573 non-null int64
same_address         166573 non-null int64
dtypes: float64(3), int64(11)
memory usage: 17.8 MB


In [11]:
X = train.drop(['testindex','outcome','claims_daysaway'], axis=1)
y = train['outcome']

In [12]:
print(X.shape)
print(y.shape)

(166573, 11)
(166573,)


In [13]:
sc = StandardScaler()
X['scaled_distance']= sc.fit_transform(X['distance'].values.reshape(-1,1))
X['scaled_visit_count'] = sc.fit_transform(X['visit_count'].values.reshape(-1,1))
X['scaled_tier'] = sc.fit_transform(X['tier'].values.reshape(-1,1))




In [14]:
X = X.drop(['distance','visit_count','tier'],axis=1)

In [15]:
X.columns

Index(['fqhc', 'pcp_lookback', 'family_assignment', 'kid', 'is_ped',
       'same_gender', 'same_language', 'same_address', 'scaled_distance',
       'scaled_visit_count', 'scaled_tier'],
      dtype='object')

### train test split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [17]:
print(X_train.shape)
print(y_train.shape)

(116601, 11)
(116601,)


### RandomForestClassifier

In [93]:
class_weight={0:1,1:500}
rf = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight=class_weight)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 500},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [94]:
y_pred = rf.predict(X_test)

In [95]:
### confusion matrix and classification report

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

[[46899  1020]
 [ 1658   395]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     47919
           1       0.28      0.19      0.23      2053

   micro avg       0.95      0.95      0.95     49972
   macro avg       0.62      0.59      0.60     49972
weighted avg       0.94      0.95      0.94     49972



In [96]:
from sklearn.metrics import roc_curve

y_pred_prob = rf.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))


auc score : 0.6819380736328811


### Oversampling using SMOTE and RandomClassifier

In [97]:
smote = SMOTE()
X_train_sample, y_train_sample = smote.fit_sample(X_train, y_train)

In [98]:
class_weight={0:1,1:500}
rf1 = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight=class_weight)
rf1.fit(X_train_sample, y_train_sample)



RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 500},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [100]:
y_pred = rf1.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = rf.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[39877  8042]
 [ 1112   941]]
              precision    recall  f1-score   support

           0       0.97      0.83      0.90     47919
           1       0.10      0.46      0.17      2053

   micro avg       0.82      0.82      0.82     49972
   macro avg       0.54      0.65      0.53     49972
weighted avg       0.94      0.82      0.87     49972

auc score : 0.6819380736328811


In [102]:
undersample = RandomUnderSampler()
X_train_under, y_train_under = undersample.fit_sample(X_train, y_train)

In [106]:
rf2 = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight='balanced')
rf2.fit(X_train_under, y_train_under)



RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [120]:
y_pred = rf2.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = rf2.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[34485 13434]
 [  669  1384]]
              precision    recall  f1-score   support

           0       0.98      0.72      0.83     47919
           1       0.09      0.67      0.16      2053

   micro avg       0.72      0.72      0.72     49972
   macro avg       0.54      0.70      0.50     49972
weighted avg       0.94      0.72      0.80     49972

auc score : 0.758768747273201


## Xgboost

In [116]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [117]:
y_pred = xgb.predict(X_test)

In [119]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

y_pred_prob = xgb.predict_proba(X_test)[:,1]

# plotting roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
print("auc score : {}".format(roc_auc_score(y_test, y_pred_prob)))

[[47847    72]
 [ 1841   212]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     47919
           1       0.75      0.10      0.18      2053

   micro avg       0.96      0.96      0.96     49972
   macro avg       0.85      0.55      0.58     49972
weighted avg       0.95      0.96      0.95     49972

auc score : 0.817294069478566


## TEST dataset

In [53]:
test = pd.read_csv("scoringData.csv")
test['tier'].unique()

array([ 1.,  3.,  2.,  4., nan])

In [54]:
# populating null values
test['tier']= test['tier'].fillna(round(test['tier'].mean(),2))

In [55]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71388 entries, 0 to 71387
Data columns (total 13 columns):
testindex            71388 non-null int64
distance             71388 non-null float64
visit_count          71388 non-null int64
claims_daysaway      12783 non-null float64
tier                 71388 non-null float64
fqhc                 71388 non-null int64
pcp_lookback         71388 non-null int64
family_assignment    71388 non-null int64
kid                  71388 non-null int64
is_ped               71388 non-null int64
same_gender          71388 non-null int64
same_language        71388 non-null int64
same_address         71388 non-null int64
dtypes: float64(3), int64(10)
memory usage: 7.1 MB


In [56]:
X = test.drop(['testindex','claims_daysaway'], axis=1)

In [57]:
print(X.shape)

(71388, 11)


#### scaling dataset 

In [58]:
X['scaled_distance']= sc.fit_transform(X['distance'].values.reshape(-1,1))
X['scaled_visit_count'] = sc.fit_transform(X['visit_count'].values.reshape(-1,1))
X['scaled_tier'] = sc.fit_transform(X['tier'].values.reshape(-1,1))



In [59]:
# dropping non scaled columns now
X = X.drop(['distance','visit_count','tier'],axis=1)

In [47]:
X.head()

Unnamed: 0,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address,scaled_distance,scaled_visit_count,scaled_tier
0,1,0,0,1,1,1,0,0,-0.454209,-0.339068,-0.813053
1,0,0,1,1,1,0,0,0,-0.110327,-0.339068,1.244346
2,0,0,0,0,0,0,0,0,0.653442,-0.339068,0.215646
3,1,0,1,0,0,1,0,0,-0.480768,-0.339068,-0.813053
4,0,0,1,1,1,0,0,0,-0.405588,-0.339068,1.244346


In [60]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71388 entries, 0 to 71387
Data columns (total 11 columns):
fqhc                  71388 non-null int64
pcp_lookback          71388 non-null int64
family_assignment     71388 non-null int64
kid                   71388 non-null int64
is_ped                71388 non-null int64
same_gender           71388 non-null int64
same_language         71388 non-null int64
same_address          71388 non-null int64
scaled_distance       71388 non-null float64
scaled_visit_count    71388 non-null float64
scaled_tier           71388 non-null float64
dtypes: float64(3), int64(8)
memory usage: 6.0 MB


In [65]:
y_test_pred = rf1.predict(X)

In [66]:
pd.value_counts(pd.Series(y_test_pred))

0    57045
1    14343
dtype: int64

In [121]:
y_xgb_pred = xgb.predict(X)

In [124]:
X.head()

Unnamed: 0,fqhc,pcp_lookback,family_assignment,kid,is_ped,same_gender,same_language,same_address,scaled_distance,scaled_visit_count,scaled_tier
0,1,0,0,1,1,1,0,0,-0.454209,-0.339068,-0.814388
1,0,0,1,1,1,0,0,0,-0.110327,-0.339068,1.246391
2,0,0,0,0,0,0,0,0,0.653442,-0.339068,0.216002
3,1,0,1,0,0,1,0,0,-0.480768,-0.339068,-0.814388
4,0,0,1,1,1,0,0,0,-0.405588,-0.339068,1.246391


In [122]:
pd.value_counts(pd.Series(y_xgb_pred))

0    70970
1      418
dtype: int64

In [123]:
pd.DataFrame({'testindex': test['testindex'], 'Predicted':y_xgb_pred}).to_csv("output_xgb.csv", index=False)