In [1]:
import pandas as pd
# Read the data into dataframe
user_df = pd.read_csv('takehome_users.csv', encoding = "ISO-8859-1", parse_dates=['creation_time', 'last_session_creation_time'])
sum_df = pd.read_csv('takehome_user_engagement.csv', encoding = "ISO-8859-1", parse_dates=['time_stamp'])

# Get the basic info of the data
print(user_df.head(), sum_df.head())
print(user_df.info(), sum_df.info())
print(user_df.describe(), sum_df.describe())

   object_id       creation_time               name  \
0          1 2014-04-22 03:53:30     Clausen August   
1          2 2013-11-15 03:45:04      Poole Matthew   
2          3 2013-03-19 23:14:52  Bottrill Mitchell   
3          4 2013-05-21 08:09:28    Clausen Nicklas   
4          5 2013-01-17 10:14:20          Raw Grace   

                        email creation_source last_session_creation_time  \
0    AugustCClausen@yahoo.com    GUEST_INVITE                 1398138810   
1      MatthewPoole@gustr.com      ORG_INVITE                 1396237504   
2  MitchellBottrill@gustr.com      ORG_INVITE                 1363734892   
3   NicklasSClausen@yahoo.com    GUEST_INVITE                 1369210168   
4          GraceRaw@yahoo.com    GUEST_INVITE                 1358849660   

   opted_in_to_mailing_list  enabled_for_marketing_drip  org_id  \
0                         1                           0      11   
1                         0                           0       1   
2          

In [2]:
# Parse the unix time into readable datetime
user_df.last_session_creation_time = pd.to_datetime(user_df.last_session_creation_time, unit='s')
user_df.last_session_creation_time


0       2014-04-22 03:53:30
1       2014-03-31 03:45:04
2       2013-03-19 23:14:52
3       2013-05-22 08:09:28
4       2013-01-22 10:14:20
5       2013-12-19 03:37:06
6       2012-12-20 13:24:32
7                       NaT
8                       NaT
9       2014-06-03 22:08:03
10      2013-12-27 03:55:54
11                      NaT
12      2014-03-30 16:19:38
13      2012-10-12 16:14:33
14                      NaT
15                      NaT
16      2014-04-12 14:39:38
17                      NaT
18      2013-05-30 14:56:36
19      2014-05-29 11:46:38
20      2013-01-22 12:27:42
21      2014-02-10 06:00:46
22      2012-08-18 08:30:27
23      2013-09-09 22:20:03
24      2014-02-26 00:11:13
25                      NaT
26      2014-01-15 17:35:11
27      2013-02-14 20:00:25
28      2013-09-17 02:08:41
29      2013-06-04 00:44:25
                ...        
11970                   NaT
11971                   NaT
11972                   NaT
11973   2012-12-28 16:42:08
11974   2014-05-22 1

In [3]:
import datetime
df2 = sum_df.copy()
df2 = df2.set_index('user_id')
# Creation a new column equals each loggin timestamp + 7days
df2['7+'] = df2['time_stamp'] + datetime.timedelta(days=7)
df2['ad_user'] = 0

# loop over each user id and each timestamp, if for any user, if exist 3 logins within 7 days, mark as adoped user(ad_user = 1)
for i in df2.index.unique():
    
    if (len(df2.loc[i].shape)>1) & (len(df2.loc[i]) >= 3):
        for j in range(len(df2.loc[i])-2):
            
            if df2.loc[i].iloc[j+2]['time_stamp'] < df2.loc[i].iloc[j]['7+']:
                df2.at[i,'ad_user'] = 1
                
                break

df2
        


Unnamed: 0_level_0,time_stamp,visited,7+,ad_user
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2014-04-22 03:53:30,1,2014-04-29 03:53:30,0
2,2013-11-15 03:45:04,1,2013-11-22 03:45:04,1
2,2013-11-29 03:45:04,1,2013-12-06 03:45:04,1
2,2013-12-09 03:45:04,1,2013-12-16 03:45:04,1
2,2013-12-25 03:45:04,1,2014-01-01 03:45:04,1
2,2013-12-31 03:45:04,1,2014-01-07 03:45:04,1
2,2014-01-08 03:45:04,1,2014-01-15 03:45:04,1
2,2014-02-03 03:45:04,1,2014-02-10 03:45:04,1
2,2014-02-08 03:45:04,1,2014-02-15 03:45:04,1
2,2014-02-09 03:45:04,1,2014-02-16 03:45:04,1


In [5]:
import numpy as np
df3 = df2.groupby(df2.index).sum()
# Set the Target column
df3['Target'] = np.where(df3.ad_user>0, 1, 0)


# Drop the columns that obviously has no effect on the target
df1 = user_df.drop(['creation_time', 'name', 'email','last_session_creation_time'], 1)
df1 = df1.set_index('object_id').join(df3.Target)
# fill the missing values with 0
df1 = df1.fillna(0)
# Convert the 'creation_source' feature into 0-1's
df1.creation_source = pd.get_dummies(df1.creation_source)
y = df1.Target
X = df1


In [6]:
y.value_counts()

0.0    10398
1.0     1602
Name: Target, dtype: int64

In [8]:
# Unsing the Naive Bayes algorithm to build the predictive model
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Split the data set into training set and test set, the size of the test set is 1/5 of the original data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# use SMOTE to over-sampling the train data
sm = SMOTE(random_state=10, ratio = 1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)




In [9]:
# Set the parameters by cross-validation
tuned_parameters = {'alpha':[0.0001,0.001,0.01,0.1,1,10]}
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(MultinomialNB(class_prior=[0.75,0.25]), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train_res, y_train_res)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'alpha': 0.0001}

Grid scores on development set:

0.693 (+/-0.068) for {'alpha': 0.0001}
0.669 (+/-0.059) for {'alpha': 0.001}
0.649 (+/-0.057) for {'alpha': 0.01}
0.629 (+/-0.048) for {'alpha': 0.1}
0.608 (+/-0.044) for {'alpha': 1}
0.586 (+/-0.033) for {'alpha': 10}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

        0.0       0.95      0.52      0.67      2579
        1.0       0.22      0.84      0.35       421

avg / total       0.85      0.56      0.63      3000


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'alpha': 0.0001}

Grid scores on development set:

0.674 (+/-0.055) for {'alpha': 0.0001}
0.656 (+/-0.050) for {'alpha': 0.001}
0.640 (+/-0.051) for {'alpha': 0.01}
0.623 (+/-0.044) for {'alpha': 0.1}
0