In [480]:
# import libraries
import pandas as pd
import numpy as np
import os

# Feature Engineering
from sklearn.model_selection import train_test_split

# soundtrack - 
# Inspiration (Theme) : Relax, inc.
##############
# Snakes & Ladders - Single - Mzuki & Himalia
# Transitions - World Alight (feat. Rhea) - Himalia
# Kingdom (feat. Sakima) [Barefoot Remix] = Himalia
##############

# Where to save the figures (I adapted this from Aurelien Geron's code will fill in rest later)
PROJECT_ROOT_DIR = "."
PROJECT_ID = "relax"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", PROJECT_ID)
#MODEL_PATH = os.path.join(PROJECT_ROOT_DIR, 'models', PROJECT_ID)
#$os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=72):
    """ 
    resolution quality
    300 high 
    150 medium
    72 low
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [481]:
# read in table, set encoding to latin-1 
# see https://stackoverflow.com/questions/5552555/unicodedecodeerror-invalid-continuation-byte 

users = pd.read_csv('./takehome_users.csv', encoding='latin-1')
user_engagement = pd.read_csv('./takehome_user_engagement.csv')

table 1 "takehome_users" - 12,000 users who signed up for the product in the last two years:
    
    ● name: the user's name
    ● object_id: the user's id
    ● email: email address
    ● creation_source: how their account was created. This takes on one of 5 values:
        ○ PERSONAL_PROJECTS: invited to join another user's personal workspace
        ○ GUEST_INVITE: invited to an organization as a guest (limited permissions)
        ○ ORG_INVITE: invited to an organization (as a full member)
        ○ SIGNUP: signed up via the website
        ○ SIGNUP_GOOGLE_AUTH: signed up using Google Authentication (using a Google email account for their login id)
    ● creation_time: when they created their account
    ● last_session_creation_time: unix timestamp of last login
    ● opted_in_to_mailing_list: whether they have opted into receiving marketing emails
    ● enabled_for_marketing_drip: whether they are on the regular marketing email drip
    ● org_id: the organization (group of users) they belong to
    ● invited_by_user_id: which user invited them to join (if applicable).
    ● is_adopted: target feature created by calling function is_adopted() on object_id (done here)
    
table 2 A usage summary table ("takehome_user_engagement") that has a row for each day that a user logged into the product.

Task Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

### Cleanup/Data Normalization

In [482]:
# User engagement first

user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'], errors='raise', infer_datetime_format=True)
user_engagement.dtypes

time_stamp    datetime64[ns]
user_id                int64
visited                int64
dtype: object

In [483]:
user_engagement.isna().sum()

time_stamp    0
user_id       0
visited       0
dtype: int64

In [484]:
user_engagement.shape

(207917, 3)

In [485]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [486]:
# number of user ids
user_engagement['user_id'].nunique()

8823

In [487]:
# Users next

users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [488]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [489]:

users['creation_source'][users['last_session_creation_time'].isna()].value_counts()

PERSONAL_PROJECTS    1347
ORG_INVITE           1066
GUEST_INVITE          575
SIGNUP                189
Name: creation_source, dtype: int64

In [490]:
users['creation_source'].value_counts()

ORG_INVITE            4254
GUEST_INVITE          2163
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
Name: creation_source, dtype: int64

In [491]:
# Cool, this was by accident but I'm resusing this for looking at percentages of missing values in a feature
users.isna().sum() / len(users)

object_id                     0.00000
creation_time                 0.00000
name                          0.00000
email                         0.00000
creation_source               0.00000
last_session_creation_time    0.26475
opted_in_to_mailing_list      0.00000
enabled_for_marketing_drip    0.00000
org_id                        0.00000
invited_by_user_id            0.46525
dtype: float64

Cleaning 

In [492]:
users['creation_time'] = pd.to_datetime(users['creation_time'], errors='raise', infer_datetime_format=True)
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], errors='raise', unit='s')
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(value=0).astype(int)

In [493]:
users

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240
...,...,...,...,...,...,...,...,...,...,...
11995,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,2013-09-06 06:14:15,0,0,89,8263
11996,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-15 18:28:37,0,0,200,0
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,2014-04-27 12:45:16,1,1,83,8074
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,2012-06-02 11:55:59,0,0,6,0


In [494]:
# drop na for now
# users_na = users.dropna()

# Clean up columns
#users_na['creation_time'] = pd.to_datetime(users_na['creation_time'], errors='raise', infer_datetime_format=True)
#users_na['last_session_creation_time'] = pd.to_datetime(users_na['last_session_creation_time'], errors='raise', unit='s')
#users_na['invited_by_user_id'] = users_na['invited_by_user_id'].astype(int)

In [495]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   12000 non-null  int64         
 1   creation_time               12000 non-null  datetime64[ns]
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  int64         
 8   org_id                      12000 non-null  int64         
 9   invited_by_user_id          12000 non-null  int64         
dtypes: datetime64[ns](2), int64(5), object(3)
memory usage: 937.6+ KB


## Target Feature 
Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

In [496]:
def adopted_user(user_id):
    
    """
    accepts user_id as an argument of type int. 
    returns true if in the span of the user's history
    if visits <= 3 for a period in the weekly frequency, it checks the span of that week in daily frequencies to check it each login occurs on an individual day in that week
    if all is true then user is given status of "adopted user" bool:True
    if visits <= 3, but there is more than one visit on a given day, move on to next date.
    If nothing found return bool False
    """
    # get user id history
    user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'], errors='raise', infer_datetime_format=True)
    history = user_engagement[user_engagement['user_id'] == user_id]
    #assert history['time_stamp'] != 'Inde
    # for debugging
    #history
    
    # set bool check
    is_good = True
    
    # downsample
    weekly = history.set_index('time_stamp').resample(rule='w').sum()
    daily = history.set_index('time_stamp').resample(rule='d').sum()
    
    # For each week over the number of total weeks
    for i in range(len(weekly)):
        
        # if there are three or more visits in a given week
        if weekly.iloc[i][1] >= 3:
        
        # Get the start day and end day of that week
            good_week_start = weekly.iloc[i].name
            good_week_end = good_week_start + pd.Timedelta('7 days')
        
        # Get the daily sample of the span of that week
            week = daily.loc[good_week_start:good_week_end]
        
        # For each day in that given week
            for j in range(len(week)):
            
            # if there is a day that has more than 1 visit (i.e. not distinct days)
            # CHECK THIS!! SOMETING FEELS WRONG!!
                if week.iloc[j][1] > 1:
                
                # Set off a flag
                    is_good = False
                # if that week met the criterion, return true
            if is_good == True:
                return True
        
            # Else reset the flag
            else:
                is_good = True
# at the end there was nothing
    return False

In [497]:
# Important!
users.set_index('object_id', inplace=True)

In [498]:
%%time
# Generate Target
users['is_adopted'] = [adopted_user(user) for user in users.index]

CPU times: user 6min 17s, sys: 6.49 s, total: 6min 24s
Wall time: 6min 46s


In [499]:
# Save my work
# pd.to_pickle(users, './users_target.pkl')

## Set Aside Test Data
Need to check class imbalance to determine how to split the data

In [500]:
dataset = users.copy(deep=True)

In [501]:
X, y = dataset.drop('is_adopted', axis=1), dataset['is_adopted']

In [502]:
y.value_counts()

False    10555
True      1445
Name: is_adopted, dtype: int64

In [503]:
# 80/20 split leaves 289 of the minor class for the test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.8, stratify=y)

In [504]:
y_test.value_counts()

False    2111
True      289
Name: is_adopted, dtype: int64

In [505]:
# Save Test Data And No Peeking!
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

pd.to_pickle(train, './train.pkl')
pd.to_pickle(test, './test.pkl')

# Get a small model going


## Feature Engineering

In [506]:
train.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4880,2014-02-05 14:09:24,White Freya,zgglolpq@itusf.com,ORG_INVITE,2014-02-07 14:09:24,0,0,20,10318,False
9399,2013-05-27 11:20:43,Martins Giovana,GiovanaAraujoMartins@jourrapide.com,ORG_INVITE,NaT,0,1,57,1055,False
83,2013-01-26 09:44:23,Araujo Gabriela,GabrielaFernandesAraujo@cuvox.de,ORG_INVITE,2013-01-27 09:44:23,1,0,172,9016,False
4051,2014-05-29 14:58:50,Holm Alexander,AlexanderOHolm@gustr.com,ORG_INVITE,2014-06-06 14:58:50,0,1,47,5340,False
656,2013-01-24 15:50:50,Lowry Elise,EliseRLowry@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-24 15:50:50,1,1,42,0,False


In [507]:
train['creation_source'][users['invited_by_user_id'].isna()].value_counts()

Series([], Name: creation_source, dtype: int64)

In [508]:
train['creation_source'][users['invited_by_user_id'].notna()].value_counts()

ORG_INVITE            3454
GUEST_INVITE          1707
PERSONAL_PROJECTS     1658
SIGNUP                1655
SIGNUP_GOOGLE_AUTH    1126
Name: creation_source, dtype: int64

In [509]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9600 entries, 4880 to 8426
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   creation_time               9600 non-null   datetime64[ns]
 1   name                        9600 non-null   object        
 2   email                       9600 non-null   object        
 3   creation_source             9600 non-null   object        
 4   last_session_creation_time  7056 non-null   datetime64[ns]
 5   opted_in_to_mailing_list    9600 non-null   int64         
 6   enabled_for_marketing_drip  9600 non-null   int64         
 7   org_id                      9600 non-null   int64         
 8   invited_by_user_id          9600 non-null   int64         
 9   is_adopted                  9600 non-null   bool          
dtypes: bool(1), datetime64[ns](2), int64(4), object(3)
memory usage: 759.4+ KB


In [510]:
train['total_session_time'] = train['last_session_creation_time'] - train['creation_time']

In [511]:
train['creation_source'].value_counts()

ORG_INVITE            3454
GUEST_INVITE          1707
PERSONAL_PROJECTS     1658
SIGNUP                1655
SIGNUP_GOOGLE_AUTH    1126
Name: creation_source, dtype: int64

In [512]:
train['name'].value_counts()

Araujo Gabriela     5
Carvalho Bruna      4
Cunha Melissa       4
Costa Lavinia       4
Oliveira Rafael     3
                   ..
Frei Marko          1
Connolly Nathan     1
Sanderson Archie    1
Walters Joe         1
Rucker Douglas      1
Name: name, Length: 9183, dtype: int64

In [513]:
train

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted,total_session_time
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4880,2014-02-05 14:09:24,White Freya,zgglolpq@itusf.com,ORG_INVITE,2014-02-07 14:09:24,0,0,20,10318,False,2 days
9399,2013-05-27 11:20:43,Martins Giovana,GiovanaAraujoMartins@jourrapide.com,ORG_INVITE,NaT,0,1,57,1055,False,NaT
83,2013-01-26 09:44:23,Araujo Gabriela,GabrielaFernandesAraujo@cuvox.de,ORG_INVITE,2013-01-27 09:44:23,1,0,172,9016,False,1 days
4051,2014-05-29 14:58:50,Holm Alexander,AlexanderOHolm@gustr.com,ORG_INVITE,2014-06-06 14:58:50,0,1,47,5340,False,8 days
656,2013-01-24 15:50:50,Lowry Elise,EliseRLowry@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-24 15:50:50,1,1,42,0,False,0 days
...,...,...,...,...,...,...,...,...,...,...,...
3190,2014-01-10 21:51:06,Edgerton Jack,JackEdgerton@gmail.com,SIGNUP_GOOGLE_AUTH,2014-01-10 21:51:06,0,0,167,0,False,0 days
1358,2013-12-24 05:24:22,Hoffmann Peter,smjhhror@uznoh.com,SIGNUP,2013-12-24 05:24:22,0,0,276,0,False,0 days
11084,2013-02-23 20:11:35,Nepean Ben,BenNepean@gmail.com,GUEST_INVITE,2013-04-09 20:11:35,1,0,5,8487,False,45 days
7853,2014-04-24 07:15:58,Barrallier Stella,StellaBarrallier@gustr.com,GUEST_INVITE,2014-04-25 07:15:58,0,0,178,2751,False,1 days


## Modeling

In [514]:
X_train, X_val, y_train, y_val = train_test_split(train.drop('is_adopted', axis=1), train['is_adopted'], train_size=0.8, random_state=0, stratify=train['is_adopted'])

In [515]:
y_val.value_counts()

False    1689
True      231
Name: is_adopted, dtype: int64

In [516]:
X_train['invited_by_user_id'].fillna(value=0).astype(int)

object_id
36        6805
1728      1728
8089      7375
3160         0
9667     11750
         ...  
10571     2251
11626        0
11212        0
3458         0
1974      8494
Name: invited_by_user_id, Length: 7680, dtype: int64

In [301]:
target

1        False
2         True
3        False
4        False
5        False
         ...  
11996    False
11997    False
11998    False
11999    False
12000    False
Length: 8823, dtype: bool

In [302]:
adopted_user(users_na.index[0])

False

In [303]:
%%time
users_na['is_adopted'] = users_na['object_id'].apply(lambda x: adopted_user(x))


KeyError: 'object_id'

In [261]:
users_na['object_id'].iloc[0]

1

time_stamp    0
user_id       0
visited       0
dtype: int64