### Problem

Defining  an  "adopted  user"   as  a  user  who   has  logged  into  the  product  on  three  separate
days  in  at  least  one  seven­day  period ,  identify  which  factors  predict  future  user
adoption .


In [3]:
import numpy as np
import pandas as pd
import datetime  
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline


In [4]:
user_engagement = pd.read_csv('takehome_user_engagement.csv')

user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [5]:
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [6]:
user_engagement.describe()

Unnamed: 0,user_id,visited
count,207917.0,207917.0
mean,5913.314197,1.0
std,3394.941674,0.0
min,1.0,1.0
25%,3087.0,1.0
50%,5682.0,1.0
75%,8944.0,1.0
max,12000.0,1.0


In [7]:
users = pd.read_csv('takehome_users.csv')
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [8]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [9]:
users.describe()

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
count,12000.0,8823.0,12000.0,12000.0,12000.0,6417.0
mean,6000.5,1379279000.0,0.2495,0.149333,141.884583,5962.957145
std,3464.24595,19531160.0,0.432742,0.356432,124.056723,3383.761968
min,1.0,1338452000.0,0.0,0.0,0.0,3.0
25%,3000.75,1363195000.0,0.0,0.0,29.0,3058.0
50%,6000.5,1382888000.0,0.0,0.0,108.0,5954.0
75%,9000.25,1398443000.0,0.0,0.0,238.25,8817.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0


## Data Pre Processing

In [10]:
# Convert the time_stamp column into a datetime object and then set that as the index, cannot group with range index
user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'])

In [11]:
user_engagement_new_index = user_engagement.set_index('time_stamp')

In [12]:
user_engagement_new_index.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207917 entries, 2014-04-22 03:53:30 to 2014-01-26 08:57:12
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  207917 non-null  int64
 1   visited  207917 non-null  int64
dtypes: int64(2)
memory usage: 4.8 MB


In [13]:
# Group the user engagement table by user_id and resample for each day 
# check that the count for each day is not greater than 1
#I dont want to have multiple counts for the same day

user_engagement_grouped = user_engagement_new_index.groupby('user_id')['visited'].resample('D').count()

In [14]:
user_engagement_grouped = pd.DataFrame(user_engagement_grouped)

In [15]:
print(user_engagement_grouped['visited'].max())

1


In [16]:
# I now have the user_engagement table grouped by day I need to apply a rolling 7 days to see if a user is active
user_engagement_grouped = user_engagement_grouped.rolling(window=7, min_periods=1).sum()

In [17]:
user_engagement_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,visited
user_id,time_stamp,Unnamed: 2_level_1
1,2014-04-22,1.0
2,2013-11-15,2.0
2,2013-11-16,2.0
2,2013-11-17,2.0
2,2013-11-18,2.0


In [18]:
# Group again so that I have just the user and the rolling 7 day count
# then filter so that a count of 3 or more is considered active

user_engagement_final = user_engagement_grouped.groupby('user_id')[['visited']].max()

user_engagement_final.head()

Unnamed: 0_level_0,visited
user_id,Unnamed: 1_level_1
1,1.0
2,3.0
3,2.0
4,3.0
5,4.0


## Feature Engineering

In [19]:
user_engagement_final['active'] = 0

user_engagement_final.loc[(user_engagement_final['visited'] >= 3),'active']=1

user_engagement_final.head()

Unnamed: 0_level_0,visited,active
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,0
2,3.0,1
3,2.0,0
4,3.0,1
5,4.0,1


## Joining Tables

In [20]:
# Merge the user_engagement and user information table
#first I need to change the object_id column name in the users table to user_id
users  = users.rename(columns={'object_id': 'user_id'})

In [21]:
merged_users = user_engagement_final.merge(users, how='inner', on = 'user_id')

In [22]:
merged_users.head()

Unnamed: 0,user_id,visited,active,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,1.0,0,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,3.0,1,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2.0,0,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,3.0,1,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,4.0,1,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [23]:
# Save Dataframe to CSV
merged_users.to_csv('merged.csv')