In [1]:
# Imports
import json
import pandas as pd
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.metrics import accuracy_score

In [2]:
# Read our json file
f = open('ultimate_data_challenge.json')
data_dict = json.load(f)
f.close()

# Convert to dataframe
df = pd.DataFrame(data_dict)
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9


In [3]:
# Check for missing
print('users', len(df))
for col in df.columns:
    print(col, df[col].isnull().sum())

# Check phone distribution
print('\niPhone', (df['phone'] == 'iPhone').sum())
print('Android', (df['phone'] == 'Android').sum())

# avg_rating_of_driver has many missing values. Could drop col or impute with median.
# avg_rating_by_driver has just a few missing values. Should probably impute with median.
# phone missing a few values. Should probably impute with mode (iPhone).

users 50000
city 0
trips_in_first_30_days 0
signup_date 0
avg_rating_of_driver 8122
avg_surge 0
last_trip_date 0
phone 396
surge_pct 0
ultimate_black_user 0
weekday_pct 0
avg_dist 0
avg_rating_by_driver 201

iPhone 34582
Android 15022


In [4]:
# Impute data
df['avg_rating_of_driver'].fillna(df['avg_rating_of_driver'].median(), inplace=True)
df['avg_rating_by_driver'].fillna(df['avg_rating_by_driver'].median(), inplace=True)
df['phone'].fillna(df['phone'].mode()[0], inplace=True)

In [5]:
# Check to make sure we have no missing values
df.isnull().sum().sum()

0

In [6]:
# Convert relevant cols to datetime
df['signup_date'] = pd.to_datetime(df['signup_date'])
df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
df.dtypes

city                              object
trips_in_first_30_days             int64
signup_date               datetime64[ns]
avg_rating_of_driver             float64
avg_surge                        float64
last_trip_date            datetime64[ns]
phone                             object
surge_pct                        float64
ultimate_black_user                 bool
weekday_pct                      float64
avg_dist                         float64
avg_rating_by_driver             float64
dtype: object

In [7]:
# Check for fraction of observed users that were retained
last_time = df.sort_values(by='last_trip_date', ascending=False).iloc[0]['last_trip_date']
n_retained = (df['last_trip_date'] > (last_time - timedelta(30))).sum()
percent_retained = n_retained / len(df) * 100.0
print(f'Percent retained: {round(percent_retained, 1)}%')

Percent retained: 36.6%


In [8]:
# Check our signup date range
print('Signup start:', df.sort_values(by='signup_date', ascending=True).iloc[0]['signup_date'])
print('Signup end:', df.sort_values(by='signup_date', ascending=False).iloc[0]['signup_date'])
print('Data collection end:', df.sort_values(by='last_trip_date', ascending=False).iloc[0]['last_trip_date'])

# It looks like we don't actually have the full 6th month of data for most of our users.
# I'm going to go ahead and use the last month of data we do have as a proxy for the actual sixth month.

Signup start: 2014-01-01 00:00:00
Signup end: 2014-01-31 00:00:00
Data collection end: 2014-07-01 00:00:00


In [9]:
# Set up a target variable
y = df['last_trip_date'] > (last_time - timedelta(30))
y.head()

0     True
1    False
2    False
3     True
4    False
Name: last_trip_date, dtype: bool

In [10]:
# One hot encode city and phone then drop unnecessary date columns
X = pd.get_dummies(df, columns=['phone'], drop_first=True)
X = pd.get_dummies(X, columns=['city'], drop_first=False)
X.drop(['signup_date', 'last_trip_date'], axis=1, inplace=True)
X.head()

Unnamed: 0,trips_in_first_30_days,avg_rating_of_driver,avg_surge,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,phone_iPhone,city_Astapor,city_King's Landing,city_Winterfell
0,4,4.7,1.1,15.4,True,46.2,3.67,5.0,1,0,1,0
1,0,5.0,1.0,0.0,False,50.0,8.26,5.0,0,1,0,0
2,3,4.3,1.0,0.0,False,100.0,0.77,5.0,1,1,0,0
3,9,4.6,1.14,20.0,True,80.0,2.36,4.9,1,0,1,0
4,14,4.4,1.19,11.8,False,82.4,3.13,4.9,0,0,0,1


In [11]:
# Gradient boosting model
X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'learning_rate': uniform(loc=0.01, scale=0.1),
    'n_estimators': [i for i in range(100, 250)],
    'subsample': uniform(loc=0.1, scale=0.9),
    'max_depth': [i for i in range(1, 11)],
}

gbc = GradientBoostingClassifier()
search = RandomizedSearchCV(gbc, params, n_iter=100, n_jobs=-1)
results = search.fit(X_train, y_train)
print('Best params:', results.best_params_)
print('Best score:', results.best_score_)
final_model = results.best_estimator_

Best params: {'learning_rate': 0.10704286361695513, 'max_depth': 5, 'n_estimators': 234, 'subsample': 0.8233359472829527}
Best score: 0.7952


In [13]:
# Do final validation on test set
preds = final_model.predict(X_test)
acc = accuracy_score(y_test, preds)
print('Test accuracy:', acc)

Test accuracy: 0.79808


In [None]:
# This model could be used to predict which users are likely to churn and, with slight modification, when.
# This information could be used to offer special deals or some other incentive to users which are likely to churn so that they continue to use the service.