In [495]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from datetime import datetime
from datetime import timedelta
from sklearn.grid_search import GridSearchCV
from return_low_no_suit_score import return_low_no_suit_score

# Imports and feature selection

In [2]:
# using docs to model
# labels come from claims data 
claims = pd.read_csv('/Users/drewrice/Desktop/capson_csv/Claims.csv')
docs = pd.read_csv('/Users/drewrice/Desktop/capson_csv/ProviderInfo.csv')
prems = pd.read_csv('/Users/drewrice/Desktop/capson_csv/ScheduleOfWrittenPremium.csv')
debs_and_creds_full = pd.read_csv('/Users/drewrice/Desktop/capson_csv/ScheduleOfDebitsAndCredits.csv')
policy = pd.read_csv('/Users/drewrice/Desktop/capson_csv/PolicyInfo.csv')

In [3]:
# drop non-essential columns
docs.drop(['Middle Name','Address 1', 'Address 2', 'Zip','Address Type','Practice Start Date'],axis=1,inplace=True)
docs.drop_duplicates(inplace=True)

In [4]:
# grab essential columns for building classifier
indem = claims[['Indemnity_Paid','ProviderId']]

In [5]:
# docs DF, conversion to string
# lower ProviderId
docs[['ProviderId','Last Name','First Name','Gender','City','State','County']] = \
docs[['ProviderId','Last Name','First Name','Gender','City','State','County']].astype(str)
docs['ProviderId'] = docs['ProviderId'].map(lambda x: x if type(x)!=str else x.lower())

In [6]:
# indem DF, conversions to string and float
# lower ProviderId
indem['Indemnity_Paid'] = indem['Indemnity_Paid'].str.replace(r'[$,]', '').astype('float')
indem['ProviderId'] = indem['ProviderId'].astype(str)
indem['ProviderId'] = indem['ProviderId'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# def build_classifier_label(item):
#     if item == 0:
#         return 'suit, no indem'
#     if item != 0:
#         return 'suit, indem'

def build_classifier_label(item):
    if item == 0:
        return 1
    if item != 0:
        return 2

In [8]:
indem['indem_label'] = indem['Indemnity_Paid'].apply(lambda item: build_classifier_label(item))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


# Join docs and indemnity tables, fill NaNs with 'no suit'

In [9]:
joined = pd.merge(docs, indem[['ProviderId', 'indem_label']], how='left', on='ProviderId')
joined.drop_duplicates(inplace=True)

In [10]:
# finish indemnity classifier
# find NaNs in indem_label column, replace with 'no suit'
def indem_label_no_suit(x):
    if type(x) == np.float_: 
        if np.isnan(x): 
#             return 'no suit'
            return 0
        else:
            return x

In [11]:
joined.indem_label = joined.indem_label.apply(lambda item: indem_label_no_suit(item))

# Claims data
#### building classifiers for BoardCredit and ClaimsBand, then merging with joined

In [12]:
def build_classifier(item):
    if item == 0:
        return 0
    if item != 0:
        return 1

In [13]:
# resave debs_and_creds with just the desired columns
# NOTE: typo 'RecordKepping'
debs_and_creds = debs_and_creds_full[['ProviderId','BoardCredit','ClaimsBand','Procedures','JCAHO','RecordKepping']]

# lower Provider Id
debs_and_creds['ProviderId'] = debs_and_creds['ProviderId'].map(lambda x: x if type(x)!=str else x.lower())
debs_and_creds.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,ProviderId,BoardCredit,ClaimsBand,Procedures,JCAHO,RecordKepping
8458,19d9c4e6-e820-41c4-b398-134e6c0ba410,-55,-220,0,0,0
8459,6398eeea-10af-448f-bd10-364be1d0c90c,0,-8951,0,-2984,0
8460,d54c0b59-08e5-4ed7-ba5a-413b2016b269,-1366,0,0,-1366,-2733
8461,3eacedf6-9173-4a64-89c7-ba8b6e150850,-88,0,0,0,0
8462,ea83f592-67dd-46d5-b15c-9d0738f81c93,0,-258,0,0,0


In [14]:
# group by Provider ID, resave debs_and_creds
debs_and_creds = debs_and_creds.groupby(debs_and_creds['ProviderId']).sum()
# reset index required
debs_and_creds.reset_index(inplace=True)

In [15]:
# use build_classifier to create binary column from BoardCredit & ClaimsBand
debs_and_creds['BoardCredit'] = debs_and_creds['BoardCredit'].apply(lambda item: build_classifier(item))
debs_and_creds['ClaimsBand'] = debs_and_creds['ClaimsBand'].apply(lambda item: build_classifier(item))
debs_and_creds['Procedures'] = debs_and_creds['Procedures'].apply(lambda item: build_classifier(item))
debs_and_creds['JCAHO'] = debs_and_creds['JCAHO'].apply(lambda item: build_classifier(item))
debs_and_creds['RecordKepping'] = debs_and_creds['RecordKepping'].apply(lambda item: build_classifier(item))

In [16]:
joined = pd.merge(joined, debs_and_creds, how='left', on='ProviderId')

# EDA

In [17]:
# print num of doctors in each bucket, using the 
print 'Doctors wih no suits: ', joined[joined['indem_label'] == 0].sort(['Last Name']).shape[0]
print 'Doctors with suits, no indemnity paid: ', joined[joined['indem_label'] == 1].sort(['Last Name']).shape[0]
print 'Doctors with suits, indemnity paid: ', joined[joined['indem_label'] == 2].sort(['Last Name']).shape[0]

Doctors wih no suits:  6147
Doctors with suits, no indemnity paid:  281
Doctors with suits, indemnity paid:  41


  from ipykernel import kernelapp as app
  app.launch_new_instance()


# Feature engineering

### ~ build age column ~

In [18]:
# convert DoB to datetime object
joined['Date of Birth'] = pd.to_datetime(joined['Date of Birth'])

In [19]:
# time_delta function
def time_delta(dt):
    if dt.year > 2000:
        dt = dt - timedelta(days=36525)
    return dt

In [20]:
# apply time_delta function to DoB column, deal with datetime rounding to the 2000s
joined['Date of Birth'] = joined['Date of Birth'].apply(lambda dt: time_delta(dt))

In [21]:
# fill missing values with 1/1/2060
joined['Date of Birth'].fillna(datetime(2060, 1, 1, 0, 0),inplace=True)

In [22]:
# build age column
now = datetime.now()
joined['age'] = now - joined['Date of Birth']

In [23]:
# function to convert years into days
def to_years(dt):
    if dt:
        dt = (dt.days) / 365.25
    return dt

In [24]:
# apply to_years function to age column
joined['age'] = (joined['age'].apply(lambda dt: to_years(dt))).round()

In [25]:
# refill Date of Birth with NaNs
joined['Date of Birth'].replace(datetime(2060, 1, 1, 0, 0), np.nan, inplace=True)

In [26]:
def refill_age(x):
    if x < 0:
        x = np.nan
    return x

In [27]:
# refill age with nan's for docs with missing birthdate
joined['age'] = (joined['age'].apply(lambda dt: refill_age(dt))).round()

In [28]:
joined.corr()

Unnamed: 0,indem_label,BoardCredit,ClaimsBand,Procedures,JCAHO,RecordKepping,age
indem_label,1.0,0.13173,0.114054,0.157412,0.191448,0.04883,0.113787
BoardCredit,0.13173,1.0,0.525431,0.034711,0.434245,0.17105,0.10967
ClaimsBand,0.114054,0.525431,1.0,0.055314,0.445487,0.224323,0.350781
Procedures,0.157412,0.034711,0.055314,1.0,-0.001525,-0.032338,0.048702
JCAHO,0.191448,0.434245,0.445487,-0.001525,1.0,0.269885,0.062544
RecordKepping,0.04883,0.17105,0.224323,-0.032338,0.269885,1.0,0.003475
age,0.113787,0.10967,0.350781,0.048702,0.062544,0.003475,1.0


# Basic model

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

In [364]:
# drop rows with any NaN values
# not a long-term solution, but a quick and dirty model
no_suit = joined[joined.indem_label == 0]
suit_no_indem = joined[joined.indem_label == 1]
suit_indem = joined[joined.indem_label == 2]

no_suit.dropna(how='any', inplace=True)
suit_no_indem.dropna(how='any', inplace=True)
suit_indem.dropna(how='any', inplace=True)

# print totals after NaN drop
print no_suit.shape[0], suit_indem.shape[0], suit_no_indem.shape[0]

2828 39 272


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [365]:
# !
# change values to 1 for binary classification
# suit_indem['indem_label'] = 1

In [406]:
# random sample from the three categories
no_suit = no_suit.sample(600)
suit_no_indem = suit_no_indem.sample(250)
suit_indem = suit_indem.sample(39)

In [407]:
# combine random sample dataframes
to_model = pd.concat([no_suit, suit_indem, suit_no_indem], axis=0)

In [408]:
# build label and feature dataframes
y = to_model.pop('indem_label')
to_model = to_model[['BoardCredit','ClaimsBand','Procedures','JCAHO','age']]
to_model.shape

(889, 5)

In [409]:
# train / test split with random state
X_train, X_test, y_train, y_test = train_test_split(to_model, y, test_size=0.15, random_state=42)

In [410]:
model = RandomForestClassifier(n_estimators=80,oob_score=True)

In [411]:
# fit, validate with OOB score
model.fit(X_train,y_train)
model.oob_score_

0.64370860927152318

In [412]:
preds = model.predict(X_test)
model.score(X_test, y_test)

0.67910447761194026

In [413]:
confusion_matrix(y_test, preds)

array([[76, 14,  2],
       [22, 15,  1],
       [ 4,  0,  0]])

In [496]:
return_low_no_suit_score(model,X_test)

[[0, 0.4711, 0.5289, 0.0],
 [2, 0.6122, 0.3878, 0.0],
 [4, 0.4994, 0.5006, 0.0],
 [5, 0.3525, 0.6314, 0.0161],
 [6, 0.75, 0.0375, 0.2125],
 [7, 0.3225, 0.342, 0.3354],
 [8, 0.2444, 0.5103, 0.2453],
 [10, 0.199, 0.801, 0.0],
 [12, 0.7398, 0.2602, 0.0],
 [13, 0.2867, 0.6852, 0.0281],
 [14, 0.2354, 0.7371, 0.0275],
 [15, 0.5342, 0.3012, 0.1646],
 [17, 0.69, 0.2397, 0.0703],
 [18, 0.4994, 0.5006, 0.0],
 [29, 0.69, 0.2397, 0.0703],
 [30, 0.2625, 0.025, 0.7125],
 [33, 0.7304, 0.1775, 0.0921],
 [34, 0.7398, 0.2602, 0.0],
 [36, 0.5917, 0.4083, 0.0],
 [37, 0.6213, 0.3788, 0.0],
 [39, 0.4268, 0.4181, 0.1551],
 [41, 0.1187, 0.8812, 0.0],
 [43, 0.714, 0.286, 0.0],
 [47, 0.4983, 0.3401, 0.1617],
 [48, 0.7304, 0.1775, 0.0921],
 [50, 0.7778, 0.2222, 0.0],
 [51, 0.4711, 0.5289, 0.0],
 [54, 0.45, 0.425, 0.125],
 [56, 0.6615, 0.3385, 0.0],
 [57, 0.5342, 0.3012, 0.1646],
 [58, 0.433, 0.567, 0.0],
 [64, 0.0375, 0.4773, 0.4852],
 [65, 0.6602, 0.3398, 0.0],
 [66, 0.4625, 0.5375, 0.0],
 [68, 0.0, 0.9177, 0.0

#### Gridsearching! bypass to model

In [330]:
n_est_range = [32,64,80,85,90,95,100,120]
m_range = [2,3,4,5,6,8,16,32]
lrgs = GridSearchCV(estimator=model, param_grid=dict(n_estimators=n_est_range, max_depth=m_range), n_jobs=1)
lrgs.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [32, 64, 80, 85, 90, 95, 100, 120], 'max_depth': [2, 3, 4, 5, 6, 8, 16, 32]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [331]:
print lrgs.best_score_
print lrgs.best_estimator_

0.703311258278
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)
