# Relax Data Science Take Home Challenge

### Problem Statement

Defining  an  "adopted  user"   as  a  user  who   has  logged  into  the  product  on  three  separate
days  in  at  least  one  seven-day  period ,  identify  which  factors  predict  future  user
adoption.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
#Loading Data
user = pd.read_csv('takehome_users.csv')
user_eng = pd.read_csv('takehome_user_engagement.csv')
user_eng = user_eng.set_index(pd.DatetimeIndex(user_eng['time_stamp']))

In [3]:
user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [6]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [10]:
user_eng.head()

Unnamed: 0_level_0,time_stamp,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-04-22 03:53:30,2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2013-12-25 03:45:04,2,1


In [11]:
user_eng.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207917 entries, 2014-04-22 03:53:30 to 2014-01-26 08:57:12
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 6.3+ MB


In [3]:
#Grouping by user id, then resampling by 1 week and then summing up the number of visits

user_group = user_eng.groupby('user_id').resample('1W').sum()

#Removing NaNs
user_group = user_group[user_group['user_id'].isnull()==False].drop('user_id',axis=1)
user_group.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,visited
user_id,time_stamp,Unnamed: 2_level_1
1,2014-04-27,1
2,2013-11-17,1
2,2013-11-24,0
2,2013-12-01,1
2,2013-12-08,0
2,2013-12-15,1
2,2013-12-22,0
2,2013-12-29,1
2,2014-01-05,1
2,2014-01-12,1


In [4]:
#Users having less than 3 visits in a week
adopted_users = user_group['visited'] >= 3

#Grouping by users and suming up the number of weeks that had 3 or more visits
adopted_users = adopted_users.groupby(level = 0).apply(np.sum)

#Removes users with no weeks that had 3 or more visits
adopted_users = adopted_users[adopted_users != 0]
adopted_users = pd.DataFrame(adopted_users, index = adopted_users.index)
adopted_users.columns = ['Active Weeks']
adopted_users.head()

Unnamed: 0_level_0,Active Weeks
user_id,Unnamed: 1_level_1
2,1
10,52
20,1
33,1
42,68


In [5]:
#Merges adopted users data with users data
adopt_user_m = adopted_users.merge(user, left_index = True, right_on = 'object_id', how='inner')
adopt_user_m.head()

Unnamed: 0,Active Weeks,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
1,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
9,52,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0
19,1,20,2014-03-06 11:46:38,Helms Mikayla,lqyvjilf@uhzdq.com,SIGNUP,1401364000.0,0,0,58,
32,1,33,2014-03-11 06:29:09,Araujo Jos�,JoseMartinsAraujo@cuvox.de,GUEST_INVITE,1401518000.0,0,0,401,79.0
41,68,42,2012-11-11 19:05:07,Pinto Giovanna,GiovannaCunhaPinto@cuvox.de,SIGNUP,1401045000.0,1,0,235,


In [6]:
#Adding Adopted column to origional data
user['Adopted'] = 0
user.iloc[list(adopt_user_m.index),10] = 1
users_1 = user.set_index('object_id').sort_index()

#Prep data for modeling by removing unecessary features
users_1 = users_1.drop([ 'name', 'email', 'creation_time', 'invited_by_user_id'], axis=1)
users_1.head()

Unnamed: 0_level_0,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,Adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,GUEST_INVITE,1398139000.0,1,0,11,0
2,ORG_INVITE,1396238000.0,0,0,1,1
3,ORG_INVITE,1363735000.0,0,0,94,0
4,GUEST_INVITE,1369210000.0,0,0,1,0
5,GUEST_INVITE,1358850000.0,0,0,193,0


In [7]:
users_1.isnull().sum()

creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
Adopted                          0
dtype: int64

In [8]:
#users_1['invited_by_user_id'] = users_1['invited_by_user_id'].fillna(0)
avg = users_1['last_session_creation_time'].mean()
users_1['last_session_creation_time'] = users_1['last_session_creation_time'].fillna(avg)

In [9]:
users_1.isnull().sum()

creation_source               0
last_session_creation_time    0
opted_in_to_mailing_list      0
enabled_for_marketing_drip    0
org_id                        0
invited_by_user_id            0
Adopted                       0
dtype: int64

In [9]:
org = users_1.groupby('org_id').sum()
org_id_cat = []
for i in users_1['org_id']:
    cat = org.loc[i,'Adopted']
    org_id_cat.append(cat)
users_1['org_id'] = org_id_cat

In [10]:
users_1 = pd.get_dummies(data = users_1, columns = ['creation_source', 'org_id'])
users_1.head()

Unnamed: 0_level_0,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,Adopted,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,org_id_0,...,org_id_6,org_id_7,org_id_8,org_id_9,org_id_10,org_id_11,org_id_12,org_id_13,org_id_14,org_id_16
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1398139000.0,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1396238000.0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1363735000.0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1369210000.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,1358850000.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [12]:
X = users_1.drop('Adopted', axis = 1)
y = users_1['Adopted']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [13]:
rf_clf= RandomForestClassifier()
rf_clf.fit(X_train, y_train)
rf_clf.score(X_test, y_test)

0.9026666666666666

In [14]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)
gb_clf.score(X_test, y_test)

0.927

In [15]:
feat_imp = pd.DataFrame(gb_clf.feature_importances_).transpose()
feat_imp.columns = list(X.columns)
feat_imp.index = ['Importance'] 
feat_imp.transpose().sort_values(by='Importance',ascending=False)

Unnamed: 0,Importance
last_session_creation_time,0.967954
org_id_0,0.006758
org_id_5,0.006498
org_id_6,0.005809
org_id_1,0.004665
enabled_for_marketing_drip,0.001305
org_id_2,0.001016
creation_source_GUEST_INVITE,0.000977
org_id_7,0.00076
org_id_3,0.000654


Our model was able to achieve ~93% accuracy. The above list shows the importance of each feature. The most important features to make this prediction are __last_session_creation_time, the organization the user is from and whether they are on the marketing email drip__. 