In [29]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import users engagement file
user_engagement = pd.read_csv('data/takehome_user_engagement.csv')
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [3]:
#import users file
df_users = pd.read_csv('data/takehome_users.csv',encoding = "ISO-8859-1")
df_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
#create datetime objects
user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'])

In [5]:
#Create function to group by frequency
def visits(group, frequency):
    return group.rolling(frequency, on='time_stamp')['user_id'].count()

In [6]:
#Apply visits to users
user_engagement['week_visits'] = user_engagement.groupby('user_id', as_index=False, 
                                                 group_keys=False).apply(visits, '7D')
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited,week_visits
0,2014-04-22 03:53:30,1,1,1.0
1,2013-11-15 03:45:04,2,1,1.0
2,2013-11-29 03:45:04,2,1,1.0
3,2013-12-09 03:45:04,2,1,1.0
4,2013-12-25 03:45:04,2,1,1.0


In [7]:
engagement_counts = []
adopted_user = []

for index, row in user_engagement.iterrows():
    if row.week_visits >= 3.0:
        if row.user_id not in engagement_counts:
            engagement_counts.append(row.user_id)

for i in df_users.object_id:
    if i in engagement_counts:
        adopted_user.append(1)
    else:
        adopted_user.append(0)
        
df_users['adopted_user'] = adopted_user
df_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [8]:
#some missing data
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
adopted_user                  12000 non-null int64
dtypes: float64(2), int64(5), object(4)
memory usage: 1.0+ MB


In [9]:
#fill in NaN with zero
df_users = df_users.fillna(0)
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    12000 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            12000 non-null float64
adopted_user                  12000 non-null int64
dtypes: float64(2), int64(5), object(4)
memory usage: 1.0+ MB


In [10]:
#convert and fix timestamps
df_users.last_session_creation_time = df_users.last_session_creation_time.map(
    lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))
df_users.last_session_creation_time = pd.to_datetime(df_users.last_session_creation_time)
df_users.creation_time = pd.to_datetime(df_users.creation_time)

In [11]:
last_log_days = df_users.last_session_creation_time - df_users.creation_time

days = []
for i in last_log_days:
    val = i.days
    if val < 0:
        days.append(0)
    else:
        days.append(val)
        
df_users['last_login'] = days
df_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,last_login
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-21 20:53:30,1,0,11,10803.0,0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-30 20:45:04,0,0,1,316.0,1,135
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 16:14:52,0,0,94,1525.0,0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 01:09:28,0,0,1,5151.0,0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 02:14:20,0,0,193,5240.0,0,4


In [12]:
print('Adopted_users:' + str((df_users.adopted_user == 1).sum()))

Adopted_users:1602


Dataset is imbalanced

In [13]:
#drop personal information
cleaned_df = df_users.drop(["name", 'email'], axis=1)

In [14]:
#drop duplicate index
cleaned_df = cleaned_df.drop(['object_id'], axis = 1)

In [15]:
#drop timestamp data
cleaned_df = cleaned_df.drop(["creation_time", "last_session_creation_time"], axis=1)

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [17]:
#encode categorical variables
categorical = cleaned_df.loc[:,["creation_source", "org_id"]]
encoder = OneHotEncoder(categories="auto", sparse=False, drop='first')
encoded_categories = pd.DataFrame(encoder.fit_transform(categorical))
encoded_categories.columns = encoder.get_feature_names()
df2_model = cleaned_df.drop(["creation_source", "org_id"], axis=1)
df2_model = df2_model.join(encoded_categories, how="left")

In [18]:
df2_model.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,invited_by_user_id,adopted_user,last_login,x0_ORG_INVITE,x0_PERSONAL_PROJECTS,x0_SIGNUP,x0_SIGNUP_GOOGLE_AUTH,x1_1,...,x1_407,x1_408,x1_409,x1_410,x1_411,x1_412,x1_413,x1_414,x1_415,x1_416
0,1,0,10803.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,316.0,1,135,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,1525.0,0,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,5151.0,0,0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,5240.0,0,4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#split data into training and testing data
X = df2_model.drop("adopted_user", axis=1)
y = df2_model[["adopted_user"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
#train random forest classifier 
classifier = RandomForestClassifier(n_estimators = 10, class_weight="balanced", criterion = 'entropy', 
                                    random_state = 42)
classifier.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [21]:
#confusion matrix
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)

print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print("Classification Report")
print(metrics.classification_report(y_test, y_pred))

Confusion Matrix:
[[2060   20]
 [  40  280]]
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2080
           1       0.93      0.88      0.90       320

    accuracy                           0.97      2400
   macro avg       0.96      0.93      0.94      2400
weighted avg       0.97      0.97      0.97      2400



slighly imbalanced  classifier but not terrible considering the imbalanced dataset.

In [22]:
#get accuracy
accuracy_score(y_test, y_pred)

0.975

In [23]:
#get feature importances 
feature_importances = pd.DataFrame()
feature_importances["feature"] = X_train.columns
feature_importances["importance"] = classifier.feature_importances_
feature_importances.sort_values(by='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
3,last_login,0.819379
2,invited_by_user_id,0.022011
5,x0_PERSONAL_PROJECTS,0.00601
0,opted_in_to_mailing_list,0.005236
1,enabled_for_marketing_drip,0.005002
7,x0_SIGNUP_GOOGLE_AUTH,0.004641
4,x0_ORG_INVITE,0.004149
6,x0_SIGNUP,0.00252
134,x1_127,0.002395
178,x1_171,0.001909


In [24]:
#try model again taking out last_login feature
X = df2_model.drop(["adopted_user", 'last_login'], axis=1)
y = df2_model[["adopted_user"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [25]:
#train random forest classifier 
classifier = RandomForestClassifier(n_estimators = 20, class_weight="balanced", criterion = 'entropy', 
                                    random_state = 42)
classifier.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=20, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [26]:
#confusion matrix
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)

print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

print("Classification Report")
print(metrics.classification_report(y_test, y_pred))

Confusion Matrix:
[[1831  249]
 [ 275   45]]
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      2080
           1       0.15      0.14      0.15       320

    accuracy                           0.78      2400
   macro avg       0.51      0.51      0.51      2400
weighted avg       0.77      0.78      0.78      2400



In [27]:
#get accuracy
accuracy_score(y_test, y_pred)

0.7816666666666666

Last login is clearly the best feature for the classfier, removing it creates a serious imbalance. 

In [28]:
#get feature importances 
feature_importances = pd.DataFrame()
feature_importances["feature"] = X_train.columns
feature_importances["importance"] = classifier.feature_importances_
feature_importances.sort_values(by='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
2,invited_by_user_id,0.170027
0,opted_in_to_mailing_list,0.05177
1,enabled_for_marketing_drip,0.036015
3,x0_ORG_INVITE,0.022532
4,x0_PERSONAL_PROJECTS,0.011636
5,x0_SIGNUP,0.010049
6,x0_SIGNUP_GOOGLE_AUTH,0.008631
10,x1_4,0.005341
7,x1_1,0.005278
11,x1_5,0.005112


Key points
-  The most important feature by far is the last login, this makes sense since this is part of the criterion of an adopted user, and users are more likely to become adopted if they use the service more.
-  The next feature important feature is being invited by another user, indicating that referall marketing is an important channel. Org invite is a conceptually simliar feature, but is not nearly as signficant.
-  While the model overall has impressive accuracy. There is an imbalance in classification which yields a better performance in detecting users who don't adopt. This is likely due to the imbalanced dataset itself, which tilts heavily in favor of nonadopted users. More samples would likely create a more balanced performance. However, the current model would still be useful in production.