In [3]:
import pandas as pd
import json
from datetime import datetime,timedelta
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np


with open('data_challenge.json') as f:
    data = json.load(f)

In [4]:
# read in data and define which users are Active in the entire dataset using a list
data = pd.DataFrame(data)

frame = timedelta(days=30)
start = datetime.strptime(data.last_trip_date.max(),'%Y-%m-%d') - frame
active = []
for i in data.last_trip_date:
    inst = datetime.strptime(i,'%Y-%m-%d')
    if inst > start:
        active.append(1)
    else:
        active.append(0)
        
print('%',((np.sum(active) / len(active))*100),'-',np.sum(active),'total users active <------------')
 
#seperate cases with missing values in individual dataframes
print("BEOFRE <-------------")
data.info()

df,missing_rating_of= (data.drop(data[data.avg_rating_of_driver.isnull()].index),
                       data[data.avg_rating_of_driver.isnull()])

df,missing_rating_by= (df.drop(df[df.avg_rating_by_driver.isnull()].index),
                       df[df.avg_rating_by_driver.isnull()])

df,missing_phone= (df.drop(df[df.phone.isnull()].index),
                       df[df.phone.isnull()])
print("AFTER <-------------")
df.info()

% 36.62 - 18310 total users active <------------
BEOFRE <-------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null object
phone                     49604 non-null object
signup_date               50000 non-null object
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
ultimate_black_user       50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 4.2+ MB
AFTER <-------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 41445 entries, 0 to 49998
Data columns (total 12 columns):
avg_dist                  41445 non-null

***Observing the data above, we can see that most of the missing data is contained in the rating features. Given this, I will split these cases off to inspect them further to determine the impact of missing ratings on user retention. Additionally, with these cases removed I will construct a predictive model to predict whether or not a user will be active into the future and determine the importance of each feature based on the coefficient that is calculated in the model.***

In [5]:
#generate labels list to determine which users were active in the past 30 days for the model data
frame = timedelta(days=30)
start = datetime.strptime(df.last_trip_date.max(),'%Y-%m-%d') - frame
act = []
for i in df.last_trip_date:
    inst = datetime.strptime(i,'%Y-%m-%d')
    if inst > start:
        act.append(1)
    else:
        act.append(0)

#dichotomize Phone Type and membership features
df['phone'] = df['phone'].apply(lambda x: 1 if x=='Android' else 0)
df['ultimate_black_user'] = df['ultimate_black_user'].apply(
        lambda x: 1 if x==True else 0)

In [6]:
#Create dummy variables for city feature and create fonal model data to be using for prediction
city_dummy = pd.get_dummies(df['city'],drop_first=True)

#Remove columns that won't be used in the model, add Active label list, and generate final model Data.
model_data = df.drop(['last_trip_date','signup_date','city'],axis=1)
model_data = pd.concat([model_data,city_dummy],axis=1)
model_data["Active"] = act
print("Target class split: %",(model_data.Active.sum() / len(model_data))*100,'active')

Target class split: % 40.07238508867174 active


In [7]:
pd.options.mode.chained_assignment=None 
#generate labels list to determine which users were active in the past 30 days for the missing values
#dataframes and then print the proportion of users that are inactive for each dataframe

of_act = []
for i in missing_rating_of.last_trip_date:
    inst = datetime.strptime(i,'%Y-%m-%d')
    if inst > start:
        of_act.append(1)
    else:
        of_act.append(0)
missing_rating_of['Active'] = of_act


by_act = []
for i in missing_rating_by.last_trip_date:
    inst = datetime.strptime(i,'%Y-%m-%d')
    if inst > start:
        by_act.append(1)
    else:
        by_act.append(0)
missing_rating_by['Active'] = by_act

#Cases that are missing both rating by driver and rating of driver
missing_ratings = missing_rating_of[missing_rating_of['avg_rating_by_driver'].isnull() ==True]

miss_act = []
for i in missing_ratings.last_trip_date:
    inst = datetime.strptime(i,'%Y-%m-%d')
    if inst > start:
        miss_act.append(1)
    else:
        miss_act.append(0)
missing_ratings['Active'] = miss_act

print('missing_rating_of_driver Percent inactive:%',
      (1-(missing_rating_of.Active.sum() / len(missing_rating_of)))*100)
print('missing_rating_by_driver Percent inactive:%',
      (1-(missing_rating_by.Active.sum() / len(missing_rating_by)))*100)
print('missing_ratings_of_both Percent inactive:%',
      (1-(missing_ratings.Active.sum() / len(missing_ratings)))*100)

missing_rating_of_driver Percent inactive:% 80.69441024378233
missing_rating_by_driver Percent inactive:% 89.55223880597015
missing_ratings_of_both Percent inactive:% 70.1492537313433


***We see here that cases that are missing ratings are very likely to be inactive with the rates of inactivity of this user being about double that of those in the model data used above. This indicates that cases where ratings are not given, especially by the driver, we can expect that the user is at significant risk of becoming inactive. This may be due to a poor experience during the ride so we could target theses users for additional surveys and incentives.***

In [8]:
def split_process(a,b):
#split data into training and test set and scales data
    X_train,X_test,Y_train,Y_test = train_test_split(a, b, test_size = 0.25,random_state=42)
    scaler_X = StandardScaler()
    X_train = scaler_X.fit_transform(X_train)
    X_test = scaler_X.transform(X_test)    
    return X_train,X_test,Y_train,Y_test

xtr,xte,ytr,yte = split_process(model_data.iloc[:,:-1],model_data['Active'])

In [46]:
#Tune Gradient Boosted Machine using grid search
graboost = GradientBoostingClassifier(max_features='sqrt')
parametergra = {'n_estimators':[155,165,160],'learning_rate':[.3,.2],
              'max_depth':[3,4]}
grid1 = GridSearchCV(estimator=graboost,param_grid=parametergra,
                     scoring='accuracy',cv=5)
grid1.fit(xtr,ytr)
best_para_gra = grid1.best_params_
best_acc_gra = grid1.best_score_

print('Best parameters:',best_para_gra,'Highest accuracy:',best_acc_gra)


Best parameters: {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 160} Highest accuracy: 0.787214876299


***A Gradient Boosted Machine was decided on after trying both a Random Forest (RF) and Support Vector Machine (SVM) for its quick training time and high accuracy. A SVM took much too long to train and did not yield accuracy as high and a RF simply did not perform as well. This model incorporates many of the strengths of a RF and handles sparse data well like a SVM.***

In [10]:
#employ tuned GBM model
gbm = GradientBoostingClassifier(max_features='sqrt',
                                 n_estimators=160,learning_rate=.2,max_depth= 4)
gbm.fit(xtr,ytr)

ypredgra = gbm.predict(xte)
print(classification_report(yte,ypredgra))
boost_result =confusion_matrix(yte,ypredgra)
print(boost_result)
boo_acc = (boost_result[1][1]+boost_result[0][0])/boost_result.sum()
print("Gradient Boosted Tress Accuracy =",boo_acc)

feature_coef = pd.DataFrame(gbm.feature_importances_).transpose()
feature_coef.columns = list(model_data.columns[:-1])
feature_coef.index = ['GBM'] 
feature_coef

             precision    recall  f1-score   support

          0       0.81      0.85      0.83      6186
          1       0.76      0.70      0.73      4176

avg / total       0.79      0.79      0.79     10362

[[5240  946]
 [1258 2918]]
Gradient Boosted Tress Accuracy = 0.787299749083


Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,phone,surge_pct,trips_in_first_30_days,ultimate_black_user,weekday_pct,King's Landing,Winterfell
GBM,0.17728,0.09167,0.081108,0.097075,0.028614,0.116448,0.119356,0.026273,0.189843,0.04213,0.030203


***We can see above that the model performed well on the training set with ~%79 accuracy, precision, recall, and F1 score indicating it is well rounded in its consistency of predictions of both positive and negative cases. There is always the concern of over fitting a model and that concern is present here, however given the test result I am confident that give new data the model would still perform well.***

In [50]:
#Tune Logisitc Regression using grid search
logreg = LogisticRegression()
parametervec = {'C':[4.3,4.5,4.6],'solver':['sag','liblinear'],'tol':[.75,.9,.8],
                'class_weight':['balanced','']}
grid2 = GridSearchCV(estimator=logreg,param_grid=parametervec,
                     scoring='accuracy',cv=5)
grid2.fit(xtr,ytr)
best_para_vec = grid2.best_params_
best_acc_vec = grid2.best_score_

print('Best parameters:',best_para_vec,'Highest accuracy:',best_acc_vec)

Best parameters: {'C': 4.5, 'class_weight': '', 'solver': 'sag', 'tol': 0.8} Highest accuracy: 0.71144998874


***I employed a Logistic Regression model to act as a baseline accuracy and provide additional insight into the contribution of each feature to the probability of one target class or the other. ***

In [12]:
#employ tuned Logistic regression model
reg = LogisticRegression(C=4.3,tol=.9,solver='sag')
reg.fit(xtr,ytr)

ypredreg = reg.predict(xte)
print(classification_report(yte,ypredreg))
reg_result =confusion_matrix(yte,ypredreg)
print(reg_result)
reg_acc = (reg_result[1][1]+reg_result[0][0])/reg_result.sum()
print("Logistic Regression Accuracy =",reg_acc)
feature_coef1 = pd.DataFrame(reg.coef_,index=['Logistic Reg'])
feature_coef1.columns = list(model_data.columns[:-1])
feature_coef= pd.concat([feature_coef,feature_coef1])
feature_coef

             precision    recall  f1-score   support

          0       0.73      0.82      0.77      6186
          1       0.67      0.56      0.61      4176

avg / total       0.71      0.71      0.71     10362

[[5057 1129]
 [1841 2335]]
Logistic Regression Accuracy = 0.713375796178


Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,phone,surge_pct,trips_in_first_30_days,ultimate_black_user,weekday_pct,King's Landing,Winterfell
GBM,0.182478,0.08759,0.075669,0.105198,0.029028,0.108989,0.122585,0.029465,0.183744,0.047525,0.027729
Logistic Reg,-0.191431,-0.056726,-0.043851,-0.01506,-0.491365,0.12861,0.431373,0.514373,0.03232,0.776729,0.32624


Above we have the feature importance as determined by the Gradient Boosted Machine (GBM) and the coefficients produced by the Regression model that indicate the contribution of a given feature. The coefficients are helpful in determining the nature of the relationship between the features and the Active statuses.

According the GBM results, Percent of Weekday Trips, Average Distance, and Trips in the First 30 Days appear to be the most important features according to the GBM model. The Logistic Regression suggests Average Distance is negatively related to Active status, indicating that as average distance goes down the user is more likely to become inactive.

Both Trips in First 30 Days and Percent of Weekday Trips are positively related with Active status which suggests that users with a higher Percent of Weekday Trips and more Trips in the First 30 Days are more likely to be Active, and vice versa.

Given this insight this insight, Ultimate should look to target new users with promotions to incentivize users to use their service during the weekdays. Additionally, Ultimate should experiment with a cost structure that incentivizes users to take longer trips to promote consistent use and engagement.