In [1]:
# Imports
import pandas as pd
from datetime import timedelta
from sklearn.model_selection import train_test_split

In [2]:
# Load user engagement csv
engage_df = pd.read_csv('takehome_user_engagement.csv')
engage_df.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [3]:
# Convert timestamp to datetime and make it the index
engage_df['time_stamp'] = pd.to_datetime(engage_df['time_stamp'])
#engage_df.set_index('time_stamp', inplace=True)

In [4]:
# feels a bit hackey since I don't use pandas much but oh well

# Create a dict that will become our new df
labels = {
    'user_id': [],
    'is_adopted_user': [],
}

n_users = engage_df['user_id'].max() # find out how many users we need to iterate through
week = timedelta(days=7) # for making comparisons later

for user_id in range(1, n_users+1): # iterate through all of our users
    labels['user_id'].append(user_id) # make sure their id is in the new df dict
    user_time_stamps = list(engage_df[engage_df['user_id'] == user_id]['time_stamp']) # Get all timestamps of when this user logged in as a list so we can index easily
    if len(user_time_stamps) < 3: # we need at least 3 data points to be an adopted user and this makes the following check easier
        labels['is_adopted_user'].append(False)
    else:
        is_adopted = False # Track if we've seen a reason to think this is an adopted user
        for idx in range(2, len(user_time_stamps)):
            td = user_time_stamps[idx] - user_time_stamps[idx-2] # Compare every pair of timestamps 2 indices apart and get their time delta
            if td < week: 
                is_adopted = True # If the time delta is less than a week we know we have 3 logins within a week
                break
        labels['is_adopted_user'].append(is_adopted)

label_df = pd.DataFrame(labels)
label_df.set_index('user_id', inplace=True)

In [5]:
# Lookin' good
label_df.head()

Unnamed: 0_level_0,is_adopted_user
user_id,Unnamed: 1_level_1
1,False
2,True
3,False
4,False
5,False


In [54]:
# Checking class balance
label_df['is_adopted_user'].sum()/len(label_df)

0.1335

In [6]:
# Load users csv
users_df = pd.read_csv('takehome_users.csv', index_col='object_id')
users_df.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [7]:
# Drop useless columns
users_df.drop(columns=['creation_time', 'name', 'last_session_creation_time', 'org_id', 'invited_by_user_id'], inplace=True)
users_df.head()

Unnamed: 0_level_0,email,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AugustCClausen@yahoo.com,GUEST_INVITE,1,0
2,MatthewPoole@gustr.com,ORG_INVITE,0,0
3,MitchellBottrill@gustr.com,ORG_INVITE,0,0
4,NicklasSClausen@yahoo.com,GUEST_INVITE,0,0
5,GraceRaw@yahoo.com,GUEST_INVITE,0,0


In [8]:
# Investigating the email column to see if we can use domain as a category
users_df['email'] = users_df['email'].astype('string')
users_df['email'] = users_df['email'].str.split('@').str[-1]
len(users_df['email'].unique())
# Woah, that's a lot of domains!

1184

In [9]:
# Checking domain counts
val_counts = users_df['email'].value_counts()
val_counts[:10]
# We have 6 common ones and the rest are all extremely rare

gmail.com         3562
yahoo.com         2447
jourrapide.com    1259
cuvox.de          1202
gustr.com         1179
hotmail.com       1165
mumdc.com            2
dqwln.com            2
oqpze.com            2
rerwl.com            2
Name: email, dtype: int64

In [10]:
# Group all rare domains into their own category
mapping = users_df['email'].map(val_counts)
users_df['email'].mask(mapping < 10, other='Other', inplace=True)
users_df['email'].value_counts()

gmail.com         3562
yahoo.com         2447
jourrapide.com    1259
cuvox.de          1202
Other             1186
gustr.com         1179
hotmail.com       1165
Name: email, dtype: int64

In [11]:
# Make our categoricals into one hot vectors
users_df = pd.get_dummies(users_df, columns=['creation_source'], drop_first=False)
users_df = pd.get_dummies(users_df, columns=['email'], drop_first=False)
users_df.head()

Unnamed: 0_level_0,opted_in_to_mailing_list,enabled_for_marketing_drip,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,email_Other,email_cuvox.de,email_gmail.com,email_gustr.com,email_hotmail.com,email_jourrapide.com,email_yahoo.com
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,1,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,1
5,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [12]:
# Sanity check before split
print(len(users_df))
print(len(label_df))

12000
12000


In [13]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(users_df, label_df['is_adopted_user'])

In [107]:
# Logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.metrics import accuracy_score

params = []
params.append({
    'solver': ['liblinear'],
    'class_weight': ['balanced'],
    'max_iter': [500],
    'penalty': ['l1', 'l2'],
    'C': uniform(loc=0.1, scale=2.9),
})
params.append({
    'solver': ['lbfgs'],
    'class_weight': ['balanced'],
    'max_iter': [500],
    'penalty': ['l2', 'none'],
    'C': uniform(loc=0.1, scale=2.9),
})
params.append({
    'solver': ['saga'],
    'class_weight': ['balanced'],
    'max_iter': [500],
    'penalty': ['l1', 'l2', 'none'],
    'C': uniform(loc=0.1, scale=2.9),
})
params.append({
    'solver': ['sag'],
    'class_weight': ['balanced'],
    'max_iter': [500],
    'tol': [1e-3],
    'penalty': ['l2', 'none'],
    'C': uniform(loc=0.1, scale=2.9),
})

lr = LogisticRegression()
search = RandomizedSearchCV(lr, params, n_iter=50, n_jobs=-1)
results = search.fit(X_train, y_train)
print('Best params:', results.best_params_)
print('Best score:', results.best_score_)
final_model = results.best_estimator_

# Check accuracy
preds = final_model.predict(X_train)
acc = accuracy_score(y_train, preds)
print('Accuracy:', acc)



Best params: {'C': 1.4603101021632316, 'class_weight': 'balanced', 'max_iter': 500, 'penalty': 'none', 'solver': 'sag', 'tol': 0.001}
Best score: 0.6289999999999999
Accuracy: 0.5541111111111111




In [108]:
# Show a list of the coefficients for each variable.
{k: v for k, v in zip(list(users_df.columns), list(final_model.coef_[0]))}

{'opted_in_to_mailing_list': 0.3510796114397215,
 'enabled_for_marketing_drip': -1.186497470364097,
 'creation_source_GUEST_INVITE': -0.048433366700160144,
 'creation_source_ORG_INVITE': -0.7654701178537057,
 'creation_source_PERSONAL_PROJECTS': -0.13947368205155602,
 'creation_source_SIGNUP': 0.107051106218398,
 'creation_source_SIGNUP_GOOGLE_AUTH': 0.881333815955818,
 'email_Other': 0.2943698732429618,
 'email_cuvox.de': -2.0279391428389597,
 'email_gmail.com': 0.530629061857492,
 'email_gustr.com': 0.528556540054714,
 'email_hotmail.com': 0.45986474353442003,
 'email_jourrapide.com': 0.5613389968648802,
 'email_yahoo.com': -0.31181231640364837}

In [109]:
# People who signup manually or through google auth are more likely to become adopted users than people using invites.
# People who opt into the mailing list are more likely to become adopted users.
# People who opt into the marketing drip are less likely to become adopted users.
# There could be a number of hypotheses for why these things might be the case and each would require further investigation.
# For example, maybe those who come on an invitation only do use the service as long as the person who invited them insists.
# In this case, doing a better job of showing these users the value of the service before they fulfill their obligations may result in them sticking around.