In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split

In [2]:
users = pd.read_csv('takehome_users.csv', encoding='latin-1')
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [3]:
engagement = pd.read_csv('takehome_user_engagement.csv')
engagement.head(10)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


In [4]:
index = engagement['user_id'].unique()[(engagement['user_id'].value_counts().sort_index() 
                                           >= 3)]

In [5]:
engagement = engagement.loc[engagement['user_id'].isin(index)]
engagement.reset_index(drop=True, inplace=True)

In [6]:
days = [datetime.strptime(engagement.time_stamp.iloc[i], "%Y-%m-%d %H:%M:%S")
        for i in range(len(engagement))]

In [7]:
engaged_users = []
for i in range(len(engagement.user_id)-2):
    if ((engagement.user_id.iloc[i] == engagement.user_id.iloc[i+2]) &
        ((days[i+1] - days[i]).days > 0) &
        ((days[i+2] - days[i]).days > 0) &
        ((days[i+2] - days[i]).days <= 7)):
            engaged_users.append(engagement.user_id.iloc[i])
    

In [8]:
len(np.unique(engaged_users))

1656

In [9]:
len(np.unique(engagement.user_id))

2248

In [10]:
1656/2248

0.7366548042704626

In [11]:
# 73.67% of users are adopted users

In [12]:
engagement['engaged_users'] = engagement['user_id']

In [13]:
unique = np.unique(engaged_users)

In [14]:
engagement['engaged_users'] = engagement['engaged_users'].map(lambda x: 1 
                                                              if (x in unique) else 0)

In [15]:
engagement.loc[:, 'time_stamp'] = engagement.time_stamp.map(
    lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

In [16]:
engagement.loc[:, 'time_stamp'] = engagement.time_stamp.map(
    lambda x: datetime.toordinal(x))

In [50]:
merged = pd.merge(engagement, users, left_on = 'user_id', right_on = 'object_id')

In [51]:
merged.isna().sum()

time_stamp                        0
user_id                           0
visited                           0
engaged_users                     0
object_id                         0
creation_time                     0
name                              0
email                             0
creation_source                   0
last_session_creation_time        0
opted_in_to_mailing_list          0
enabled_for_marketing_drip        0
org_id                            0
invited_by_user_id            87804
dtype: int64

In [53]:
merged.drop(['object_id', 'name', 'email', 'user_id', 'invited_by_user_id', 'visited'], 
            1, inplace=True)

In [54]:
merged.loc[:, 'creation_time'] = merged.creation_time.map(
    lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

In [55]:
merged.loc[:, 'creation_time'] = merged.creation_time.map(
    lambda x: datetime.toordinal(x))

In [56]:
merged['creation_source'] = merged.creation_source.astype('category').cat.codes

In [58]:
X = merged.loc[:, merged.columns != 'engaged_users']
y = merged.loc[:, 'engaged_users']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [60]:
clf = RandomForestClassifier(random_state = 0, n_estimators = 100)

In [61]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [62]:
clf.score(X_test, y_test)

0.9995223975642276

In [63]:
pd.DataFrame(list(zip(X.columns, clf.feature_importances_)), 
             columns = ['features', 'importance']).sort_values(by='importance', 
                                                               ascending=False)

Unnamed: 0,features,importance
3,last_session_creation_time,0.370388
1,creation_time,0.332774
6,org_id,0.146285
0,time_stamp,0.080998
2,creation_source,0.043646
4,opted_in_to_mailing_list,0.014758
5,enabled_for_marketing_drip,0.01115


In [30]:
'''
The most important feature is last_session_creation_time. This is followed by creation_time
and org_id. 

I used a random forest classifier to determine this because they are very powerful and I did
not have much data

'''

'\nThe most important feature is last_session_creation_time\n\n'