In [1]:
import pandas as pd
import json
from datetime import datetime,timedelta
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression


# Load the raw challenge records. JSON text is UTF-8, so the encoding is stated
# explicitly rather than left to the platform's locale default.
with open('data_challenge.json', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# Promote the parsed JSON records to a DataFrame and show the raw schema.
data = pd.DataFrame(data)
data.info()

# Peel off rows with a missing value, one column at a time. Each step works on
# what the previous step left behind, so `missing_rating_by` and `missing_phone`
# never contain rows that were already set aside by an earlier step.
# Every subset is an explicit copy: later cells add/overwrite columns on these
# frames, and copies keep those writes from being flagged as chained assignment.
of_missing = data['avg_rating_of_driver'].isnull()
missing_rating_of = data[of_missing].copy()
df = data[~of_missing].copy()

by_missing = df['avg_rating_by_driver'].isnull()
missing_rating_by = df[by_missing].copy()
df = df[~by_missing].copy()

phone_missing = df['phone'].isnull()
missing_phone = df[phone_missing].copy()
df = df[~phone_missing].copy()

# Schema of the fully populated rows that go on to modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null object
phone                     49604 non-null object
signup_date               50000 non-null object
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
ultimate_black_user       50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 4.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 41445 entries, 0 to 49998
Data columns (total 12 columns):
avg_dist                  41445 non-null float64
avg_rating_by_driver      41445 non-null float64
avg_rating_of_driver      41445 no

In [3]:
# A rider counts as active when the last trip falls strictly after `start`,
# i.e. within the 30 days preceding the most recent trip date in `df`.
# The dates are ISO 'YYYY-MM-DD' strings, so the string max is also the latest date.
frame = timedelta(days=30)
start = datetime.strptime(df.last_trip_date.max(),'%Y-%m-%d') - frame

# Parse the whole column at once instead of calling strptime row by row.
# `act` stays a plain list of 0/1 ints, in the same row order as `df`.
last_trip = pd.to_datetime(df['last_trip_date'], format='%Y-%m-%d')
act = (last_trip > start).astype(int).tolist()

# Binary-encode the phone type: Android -> 1, anything else -> 0.
# Accepting an already-encoded 1 as well keeps the cell idempotent; comparing
# against 'Android' alone would turn every value into 0 on a second run.
df['phone'] = df['phone'].isin(['Android', 1]).astype(int)

# True -> 1, everything else -> 0 (also stable when re-run on 0/1 values).
df['ultimate_black_user'] = df['ultimate_black_user'].eq(True).astype(int)

In [4]:
# One-hot encode the city; drop_first leaves one city as the implicit baseline.
city_dummy = pd.get_dummies(df['city'], drop_first=True)

# Modelling table: the remaining columns of `df` (dates and the raw city column
# removed), then the city indicators, then the target. `Active` is appended
# last, which is what the later `iloc[:, :-1]` / `columns[:-1]` slices rely on.
model_data = pd.concat(
    [df.drop(columns=['last_trip_date', 'signup_date', 'city']), city_dummy],
    axis=1,
).assign(Active=act)

In [13]:
def _active_flags(trip_dates, cutoff):
    """Return a list of 0/1 flags, 1 where the trip date is strictly after ``cutoff``.

    ``trip_dates`` is an iterable of 'YYYY-MM-DD' strings; ``cutoff`` is a
    datetime. The flags come back in the same order as the input.
    """
    parsed = pd.to_datetime(pd.Series(list(trip_dates), dtype=object),
                            format='%Y-%m-%d')
    return (parsed > cutoff).astype(int).tolist()


def _percent_inactive(subset):
    """Return the percentage of rows in ``subset`` whose ``Active`` flag is 0.

    Returns NaN for an empty frame instead of dividing by zero.
    """
    if len(subset) == 0:
        return float('nan')
    return (1-(subset.Active.sum() / len(subset)))*100


# Label each missing-value subset with the same activity rule used for the
# modelling data (`start` is the cutoff computed alongside `act`).
# `assign` returns a new frame, so nothing is written into a slice of another
# frame and the global chained-assignment warning no longer has to be silenced.
of_act = _active_flags(missing_rating_of.last_trip_date, start)
missing_rating_of = missing_rating_of.assign(Active=of_act)

# NOTE: `missing_rating_by` was taken after the rows missing avg_rating_of_driver
# had been removed, so it holds riders missing *only* avg_rating_by_driver.
by_act = _active_flags(missing_rating_by.last_trip_date, start)
missing_rating_by = missing_rating_by.assign(Active=by_act)

# Riders missing both ratings: the subset of `missing_rating_of` that also lacks
# avg_rating_by_driver.
missing_ratings = missing_rating_of[missing_rating_of['avg_rating_by_driver'].isnull()]
miss_act = _active_flags(missing_ratings.last_trip_date, start)
missing_ratings = missing_ratings.assign(Active=miss_act)

print('missing_rating_of_driver Percent inactive:%',
      _percent_inactive(missing_rating_of))
print('missing_rating_by_driver Percent inactive:%',
      _percent_inactive(missing_rating_by))
print('missing_ratings_both Percent inactive:%',
      _percent_inactive(missing_ratings))

missing_rating_of_driver Percent inactive:% 80.69441024378233
missing_rating_by_driver Percent inactive:% 89.55223880597015
missing_ratings_both Percent inactive:% 70.1492537313433


In [8]:
def split_process(a, b, test_size=0.25, random_state=42):
    """Split features and labels into train/test sets and standardise the features.

    Parameters
    ----------
    a : features, in any form accepted by ``train_test_split``.
    b : labels, aligned row-for-row with ``a``.
    test_size : fraction (or count) of rows held out for testing; passed
        straight to ``train_test_split``. Defaults to the original 0.25.
    random_state : seed for the shuffle, so the split is reproducible.
        Defaults to the original 42.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        The feature sets are the outputs of ``StandardScaler``; the label sets
        are returned as produced by ``train_test_split``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        a, b, test_size=test_size, random_state=random_state)
    # Fit the scaler on the training rows only and reuse its statistics for the
    # test rows, so nothing about the test set leaks into the scaling.
    scaler_X = StandardScaler()
    X_train = scaler_X.fit_transform(X_train)
    X_test = scaler_X.transform(X_test)
    return X_train, X_test, Y_train, Y_test

# Every column except the last (`Active`) is a feature; `Active` is the target.
features = model_data.iloc[:, :-1]
target = model_data['Active']
xtr, xte, ytr, yte = split_process(features, target)

In [9]:
# Grid-search a gradient-boosted classifier with 5-fold CV on the training split.
# max_features='sqrt' makes each split consider a random subset of features, so
# the estimator is seeded; without a seed the scores and the winning parameters
# can change from run to run.
graboost = GradientBoostingClassifier(max_features='sqrt', random_state=42)
parametergra = {'n_estimators':[155,165,160],'learning_rate':[.3,.2],
              'max_depth':[3,4]}
grid1 = GridSearchCV(estimator=graboost,param_grid=parametergra,
                     scoring='accuracy',cv=5)
grid1.fit(xtr,ytr)

# Keep the winning parameter set and its mean cross-validated accuracy.
best_para_gra = grid1.best_params_
best_acc_gra = grid1.best_score_

print(best_para_gra,best_acc_gra)


{'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 165} 0.788051346395


In [10]:
# Refit with the parameters chosen by the grid search (`best_para_gra`) rather
# than hand-copied values, so this cell cannot drift from the search result.
# Seeded because max_features='sqrt' makes the fit stochastic.
gbm = GradientBoostingClassifier(max_features='sqrt', random_state=42,
                                 **best_para_gra)
gbm.fit(xtr,ytr)

# Evaluate on the held-out test split.
ypredgra = gbm.predict(xte)
print(classification_report(yte,ypredgra))
boost_result =confusion_matrix(yte,ypredgra)
print(boost_result)
# Accuracy = correctly classified (diagonal of the confusion matrix) / all rows.
boo_acc = boost_result.trace()/boost_result.sum()
print("Gradient Boosted Tress Accuracy =",boo_acc)

# One-row table of feature importances, labelled with the feature column names
# (every column of model_data except the trailing target).
feature_coef = pd.DataFrame([gbm.feature_importances_],
                            columns=list(model_data.columns[:-1]),
                            index=['GBM'])
feature_coef

             precision    recall  f1-score   support

          0       0.81      0.85      0.83      6186
          1       0.76      0.70      0.73      4176

avg / total       0.79      0.79      0.79     10362

[[5258  928]
 [1259 2917]]
Gradient Boosted Tress Accuracy = 0.788940359004


Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,phone,surge_pct,trips_in_first_30_days,ultimate_black_user,weekday_pct,King's Landing,Winterfell
GBM,0.160731,0.102242,0.080368,0.09682,0.029125,0.122108,0.141955,0.026056,0.172688,0.043875,0.024035


In [11]:
# Grid-search logistic regression with 5-fold CV on the training split.
# The 'sag' and 'liblinear' solvers shuffle the data internally, so the
# estimator is seeded to make the search reproducible.
logreg = LogisticRegression(random_state=42)
parametervec = {'C':[4.3,5,4],'solver':['sag','liblinear'],'tol':[.2,2.5,.3]}
grid2 = GridSearchCV(estimator=logreg,param_grid=parametervec,
                     scoring='accuracy',cv=5)
grid2.fit(xtr,ytr)

# Keep the winning parameter set and its mean cross-validated accuracy.
best_para_vec = grid2.best_params_
best_acc_vec = grid2.best_score_

print(best_para_vec,best_acc_vec)

{'C': 4, 'solver': 'sag', 'tol': 0.3} 0.711578676447


In [12]:
# Refit with the parameters chosen by the grid search (`best_para_vec`) rather
# than hand-copied values; seeded because the candidate solvers shuffle the data.
reg = LogisticRegression(random_state=42, **best_para_vec)
reg.fit(xtr,ytr)

# Evaluate on the held-out test split.
ypredreg = reg.predict(xte)
print(classification_report(yte,ypredreg))
reg_result =confusion_matrix(yte,ypredreg)
print(reg_result)
# Accuracy = correctly classified (diagonal of the confusion matrix) / all rows.
reg_acc = reg_result.trace()/reg_result.sum()
print("Logistic Regression Accuracy =",reg_acc)

# One-row table of the fitted coefficients, labelled with the feature names.
feature_coef1 = pd.DataFrame(reg.coef_, index=['Logistic Reg'],
                             columns=list(model_data.columns[:-1]))
# Drop any 'Logistic Reg' row left by a previous run before appending, so
# re-running this cell does not stack duplicate rows in the comparison table.
feature_coef = pd.concat(
    [feature_coef.drop(index='Logistic Reg', errors='ignore'), feature_coef1])
feature_coef

             precision    recall  f1-score   support

          0       0.74      0.82      0.77      6186
          1       0.68      0.56      0.61      4176

avg / total       0.71      0.71      0.71     10362

[[5054 1132]
 [1822 2354]]
Logistic Regression Accuracy = 0.714919899633


Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,phone,surge_pct,trips_in_first_30_days,ultimate_black_user,weekday_pct,King's Landing,Winterfell
GBM,0.160731,0.102242,0.080368,0.09682,0.029125,0.122108,0.141955,0.026056,0.172688,0.043875,0.024035
Logistic Reg,-0.199073,-0.063966,0.000187,0.005912,-0.495931,0.093839,0.462612,0.501972,-0.006334,0.77438,0.349932
