In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from sklearn.model_selection import train_test_split

In [208]:
data = pd.read_csv('takehome_user_engagement.csv')
data.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [109]:
x=data.groupby(['user_id']).agg({'visited':'sum'}).reset_index()
x.head()

Unnamed: 0,user_id,visited
0,1,1
1,2,14
2,3,1
3,4,1
4,5,1


In [15]:
data['user_id'].nunique()

8823

In [209]:
data['Date']=data['time_stamp'].apply(lambda x: datetime.datetime.date(pd.to_datetime(x)))

In [210]:
data['Time']=data['time_stamp'].apply(lambda x: datetime.datetime.time(pd.to_datetime(x)))

In [211]:
data['time_stamp'] = pd.to_datetime(data['time_stamp']) - pd.to_timedelta(7, unit='d')
session_stats = pd.Series([])
days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
for day in days:
    k = data.groupby(['user_id', pd.Grouper(key='time_stamp', freq='W-' + day)])['visited'].count().reset_index().sort_values('user_id')
    k = k.groupby('user_id')['visited'].max()
    if len(session_stats) == 0:
        session_stats = k
    else:
        session_stats = pd.DataFrame([session_stats, k]).max()
session_stats.head()

  


user_id
1    1
2    3
3    1
4    1
5    1
dtype: int64

In [212]:
data['adopted_user'] = data['user_id'].apply(lambda x: 1 if x in session_stats and session_stats[x] >= 3 else 0)

In [213]:
data.head()

Unnamed: 0,time_stamp,user_id,visited,Date,Time,adopted_user
0,2014-04-15 03:53:30,1,1,2014-04-22,03:53:30,0
1,2013-11-08 03:45:04,2,1,2013-11-15,03:45:04,1
2,2013-11-22 03:45:04,2,1,2013-11-29,03:45:04,1
3,2013-12-02 03:45:04,2,1,2013-12-09,03:45:04,1
4,2013-12-18 03:45:04,2,1,2013-12-25,03:45:04,1


In [214]:
cols = ['object_id', 'creation_source', 'creation_time', 
        'last_session_creation_time', 'opted_in_to_mailing_list',
        'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id']
users = pd.read_csv('takehome_users.csv', usecols=cols)
users.head()

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [215]:
users['creation_time'] = pd.to_datetime(users['creation_time'], format='%Y-%m-%d %H:%M:%S')
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], unit='s')

# calculate acount age
users['account_age'] = users['last_session_creation_time'] - users['creation_time']
users['account_age'] = users['account_age'].dt.days

In [216]:
users.head()

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,account_age
0,1,2014-04-22 03:53:30,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,0.0
1,2,2013-11-15 03:45:04,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,136.0
2,3,2013-03-19 23:14:52,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,0.0
3,4,2013-05-21 08:09:28,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,1.0
4,5,2013-01-17 10:14:20,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,5.0


In [217]:
users['creation_source'].value_counts()

ORG_INVITE            4254
GUEST_INVITE          2163
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
Name: creation_source, dtype: int64

In [218]:
creation_source = pd.get_dummies(users['creation_source'], drop_first=True)


In [219]:
creation_source.head()

Unnamed: 0,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
0,0,0,0,0
1,1,0,0,0
2,1,0,0,0
3,0,0,0,0
4,0,0,0,0


In [220]:
model_data = pd.concat([users, creation_source], axis=1).drop('creation_source', axis=1)


In [221]:
model_data.head()

Unnamed: 0,object_id,creation_time,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,account_age,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
0,1,2014-04-22 03:53:30,2014-04-22 03:53:30,1,0,11,10803.0,0.0,0,0,0,0
1,2,2013-11-15 03:45:04,2014-03-31 03:45:04,0,0,1,316.0,136.0,1,0,0,0
2,3,2013-03-19 23:14:52,2013-03-19 23:14:52,0,0,94,1525.0,0.0,1,0,0,0
3,4,2013-05-21 08:09:28,2013-05-22 08:09:28,0,0,1,5151.0,1.0,0,0,0,0
4,5,2013-01-17 10:14:20,2013-01-22 10:14:20,0,0,193,5240.0,5.0,0,0,0,0


In [260]:
model_data=pd.merge(model_data,x,how='left',left_on='object_id',right_on='user_id')


In [261]:
model_data.rename(columns={'visited':'visit_count'},inplace=True)
model_data.head()

Unnamed: 0,object_id,creation_time,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,account_age,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH,user_id_x,visit_count,user_id_y,visit_count.1
0,1,2014-04-22 03:53:30,2014-04-22 03:53:30,1,0,11,10803.0,0.0,0,0,0,0,1.0,1.0,1.0,1.0
1,2,2013-11-15 03:45:04,2014-03-31 03:45:04,0,0,1,316.0,136.0,1,0,0,0,2.0,14.0,2.0,14.0
2,3,2013-03-19 23:14:52,2013-03-19 23:14:52,0,0,94,1525.0,0.0,1,0,0,0,3.0,1.0,3.0,1.0
3,4,2013-05-21 08:09:28,2013-05-22 08:09:28,0,0,1,5151.0,1.0,0,0,0,0,4.0,1.0,4.0,1.0
4,5,2013-01-17 10:14:20,2013-01-22 10:14:20,0,0,193,5240.0,5.0,0,0,0,0,5.0,1.0,5.0,1.0


In [262]:
model_data=pd.merge(model_data,data,how='left',left_on='object_id',right_on='user_id')


In [278]:
model_data.columns

Index(['object_id', 'creation_time', 'last_session_creation_time',
       'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id',
       'invited_by_user_id', 'account_age', 'ORG_INVITE', 'PERSONAL_PROJECTS',
       'SIGNUP', 'SIGNUP_GOOGLE_AUTH', 'user_id_x', 'visit_count', 'user_id_y',
       'visit_count', 'time_stamp', 'user_id', 'visited', 'Date', 'Time',
       'adopted_user'],
      dtype='object')

In [251]:
data['time_stamp'] = pd.to_datetime(data['time_stamp']) - pd.to_timedelta(7, unit='d')

In [270]:
model_data.dropna(inplace=True)

In [283]:
cols=['object_id', 'creation_time', 'last_session_creation_time',
       'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id',
       'invited_by_user_id', 'account_age', 'ORG_INVITE', 'PERSONAL_PROJECTS',
       'SIGNUP', 'SIGNUP_GOOGLE_AUTH',  'visit_count', 
       'visit_count', 'time_stamp', 'visited', 'Date', 'Time',
       'adopted_user']
X_model_data=model_data[cols]

In [293]:
X= X_model_data.drop(['adopted_user'], axis=1)
y=X_model_data['adopted_user'].astype(int)

In [300]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()

lr.fit(X_train,y_train)

TypeError: invalid type promotion

In [292]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116887 entries, 0 to 211091
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   object_id                   116887 non-null  int64         
 1   creation_time               116887 non-null  datetime64[ns]
 2   last_session_creation_time  116887 non-null  datetime64[ns]
 3   opted_in_to_mailing_list    116887 non-null  int64         
 4   enabled_for_marketing_drip  116887 non-null  int64         
 5   org_id                      116887 non-null  int64         
 6   invited_by_user_id          116887 non-null  float64       
 7   account_age                 116887 non-null  float64       
 8   ORG_INVITE                  116887 non-null  uint8         
 9   PERSONAL_PROJECTS           116887 non-null  uint8         
 10  SIGNUP                      116887 non-null  uint8         
 11  SIGNUP_GOOGLE_AUTH          116887 non-