In [877]:
# import libraries
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

# Feature Engineering
from sklearn.model_selection import train_test_split

# soundtrack - 
# Inspiration (Theme) : Relax, inc.
##############
# Snakes & Ladders - Single - Mzuki & Himalia
# Situations - Escape This - Himalia
# Transitions - World Alight (feat. Rhea) - Himalia
# Kingdom (feat. Sakima) [Barefoot Remix] = Himalia
# Situations - Do You Wanna (feat. Laurelle) - Himalia
##############

# Where to save the figures (I adapted this from Aurelien Geron's code will fill in rest later)
PROJECT_ROOT_DIR = "."
PROJECT_ID = "relax"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", PROJECT_ID)
#MODEL_PATH = os.path.join(PROJECT_ROOT_DIR, 'models', PROJECT_ID)
#$os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=72):
    """ 
    resolution quality
    300 high 
    150 medium
    72 low
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [878]:
# read in table, set encoding to latin-1 
# see https://stackoverflow.com/questions/5552555/unicodedecodeerror-invalid-continuation-byte 

users = pd.read_csv('./takehome_users.csv', encoding='latin-1')
user_engagement = pd.read_csv('./takehome_user_engagement.csv')

table 1 "takehome_users" - 12,000 users who signed up for the product in the last two years:
    
    ● name: the user's name
    ● object_id: the user's id
    ● email: email address
    ● creation_source: how their account was created. This takes on one of 5 values:
        ○ PERSONAL_PROJECTS: invited to join another user's personal workspace
        ○ GUEST_INVITE: invited to an organization as a guest (limited permissions)
        ○ ORG_INVITE: invited to an organization (as a full member)
        ○ SIGNUP: signed up via the website
        ○ SIGNUP_GOOGLE_AUTH: signed up using Google Authentication (using a Google email account for their login id)
    ● creation_time: when they created their account
    ● last_session_creation_time: unix timestamp of last login
    ● opted_in_to_mailing_list: whether they have opted into receiving marketing emails
    ● enabled_for_marketing_drip: whether they are on the regular marketing email drip
    ● org_id: the organization (group of users) they belong to
    ● invited_by_user_id: which user invited them to join (if applicable).
    ● is_adopted: target feature created by calling function is_adopted() on object_id (done here)
    
table 2 A usage summary table ("takehome_user_engagement") that has a row for each day that a user logged into the product.

Task Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

### Cleanup/Data Normalization

In [879]:
# User engagement first

user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'], errors='raise', infer_datetime_format=True)
user_engagement.dtypes

time_stamp    datetime64[ns]
user_id                int64
visited                int64
dtype: object

In [880]:
user_engagement.isna().sum()

time_stamp    0
user_id       0
visited       0
dtype: int64

In [914]:
# save oldest date for login activity
today = user_engagement['time_stamp'].max()

In [915]:
user_engagement.shape

(207917, 3)

In [916]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [917]:
# number of user ids
user_engagement['user_id'].nunique()

8823

In [918]:
# Users next

users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   creation_time               12000 non-null  datetime64[ns]
 1   name                        12000 non-null  object        
 2   email                       12000 non-null  object        
 3   creation_source             12000 non-null  object        
 4   last_session_creation_time  8823 non-null   datetime64[ns]
 5   opted_in_to_mailing_list    12000 non-null  int64         
 6   enabled_for_marketing_drip  12000 non-null  int64         
 7   org_id                      12000 non-null  int64         
 8   invited_by_user_id          12000 non-null  int64         
 9   is_adopted                  12000 non-null  bool          
dtypes: bool(1), datetime64[ns](2), int64(4), object(3)
memory usage: 949.2+ KB


In [919]:
users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803,False
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316,True
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525,False
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151,False
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240,False


In [920]:

users['creation_source'][users['last_session_creation_time'].isna()].value_counts()

PERSONAL_PROJECTS    1347
ORG_INVITE           1066
GUEST_INVITE          575
SIGNUP                189
Name: creation_source, dtype: int64

In [921]:
users['creation_source'].value_counts()

ORG_INVITE            4254
GUEST_INVITE          2163
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
Name: creation_source, dtype: int64

In [922]:
# Cool, this was by accident but I'm resusing this for looking at percentages of missing values in a feature
users.isna().sum() / len(users)

creation_time                 0.00000
name                          0.00000
email                         0.00000
creation_source               0.00000
last_session_creation_time    0.26475
opted_in_to_mailing_list      0.00000
enabled_for_marketing_drip    0.00000
org_id                        0.00000
invited_by_user_id            0.00000
is_adopted                    0.00000
dtype: float64

Cleaning 

In [889]:
users['creation_time'] = pd.to_datetime(users['creation_time'], errors='raise', infer_datetime_format=True)
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], errors='raise', unit='s')
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(value=0).astype(int)

In [890]:
users

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240
...,...,...,...,...,...,...,...,...,...,...
11995,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,2013-09-06 06:14:15,0,0,89,8263
11996,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-15 18:28:37,0,0,200,0
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,2014-04-27 12:45:16,1,1,83,8074
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,2012-06-02 11:55:59,0,0,6,0


In [891]:
# drop na for now
# users_na = users.dropna()

# Clean up columns
#users_na['creation_time'] = pd.to_datetime(users_na['creation_time'], errors='raise', infer_datetime_format=True)
#users_na['last_session_creation_time'] = pd.to_datetime(users_na['last_session_creation_time'], errors='raise', unit='s')
#users_na['invited_by_user_id'] = users_na['invited_by_user_id'].astype(int)

In [892]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   12000 non-null  int64         
 1   creation_time               12000 non-null  datetime64[ns]
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  int64         
 8   org_id                      12000 non-null  int64         
 9   invited_by_user_id          12000 non-null  int64         
dtypes: datetime64[ns](2), int64(5), object(3)
memory usage: 937.6+ KB


## Target Feature 
Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

In [893]:
def adopted_user(user_id):
    
    """
    accepts user_id as an argument of type int. 
    returns true if in the span of the user's history
    if visits <= 3 for a period in the weekly frequency, it checks the span of that week in daily frequencies to check it each login occurs on an individual day in that week
    if all is true then user is given status of "adopted user" bool:True
    if visits <= 3, but there is more than one visit on a given day, move on to next date.
    If nothing found return bool False
    """
    # get user id history
    user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'], errors='raise', infer_datetime_format=True)
    history = user_engagement[user_engagement['user_id'] == user_id]
    #assert history['time_stamp'] != 'Inde
    # for debugging
    #history
    
    # set bool check
    is_good = True
    
    # downsample
    weekly = history.set_index('time_stamp').resample(rule='w').sum()
    daily = history.set_index('time_stamp').resample(rule='d').sum()
    
    # For each week over the number of total weeks
    for i in range(len(weekly)):
        
        # if there are three or more visits in a given week
        if weekly.iloc[i][1] >= 3:
        
        # Get the start day and end day of that week
            good_week_start = weekly.iloc[i].name
            good_week_end = good_week_start + pd.Timedelta('7 days')
        
        # Get the daily sample of the span of that week
            week = daily.loc[good_week_start:good_week_end]
        
        # For each day in that given week
            for j in range(len(week)):
            
            # if there is a day that has more than 1 visit (i.e. not distinct days)
            # CHECK THIS!! SOMETING FEELS WRONG!!
                if week.iloc[j][1] > 1:
                
                # Set off a flag
                    is_good = False
                # if that week met the criterion, return true
            if is_good == True:
                return True
        
            # Else reset the flag
            else:
                is_good = True
# at the end there was nothing
    return False

In [894]:
# Important!
users.set_index('object_id', inplace=True)

In [895]:
%%time
# Generate Target
users['is_adopted'] = [adopted_user(user) for user in users.index]

CPU times: user 6min 30s, sys: 7.06 s, total: 6min 37s
Wall time: 7min 26s


In [901]:
# Save my work
#pd.to_pickle(users, './users_target.pkl')

## Set Aside Test Data
Need to check class imbalance to determine how to split the data

In [902]:
dataset = users.copy(deep=True)
X, y = dataset.drop('is_adopted', axis=1), dataset['is_adopted']

In [903]:
# 80/20 split leaves 289 of the minor class for the test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.8, stratify=y)
y_test.value_counts()

False    2111
True      289
Name: is_adopted, dtype: int64

In [905]:
# Save Test Data And No Peeking!
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

#pd.to_pickle(train, './train.pkl')
#pd.to_pickle(test, './test.pkl')

## Feature Engineering

In [906]:
# Break down strings

def decompose_email(address):
    user, domain = address.split('@')
    domain_name, ext = domain.split('.')
    return user, domain_name, ext

# Check is user_id has any red flags
def flag_user(user_id):
    flagged_id = train.loc[user_id]
    full_name = flagged_id['name'].split()
    username = flagged_id['username']
    
    for name in full_name:
        #print(name)
        if re.match(name, username):
            return 0
    return 1

def flag_domain(user_id):
    flagged_id =  train.loc[user_id]
    domain = flagged_id['domain']
    if domain =='flag':
        return 1
    return 0

In [860]:
flagged_id = train.loc[train.index[0]]
full_name = flagged_id['name'].split()
username = flagged_id['username']

In [945]:
# Feature Engineering

# Decomponose Emails Addresses

username, domain, extension = [], [], []
for i in range(len(train['email'])):
    address = decompose_email(train['email'].iloc[i])
    username.append(address[0])
    domain.append(address[1])
    extension.append(address[2])

train['username'] = username
train['domain'] = domain
train['extention'] = extension

# check for sketchy domains
valid_domains = train['domain'].value_counts()[:6].index
train['domain'] = train['domain'].apply(lambda x: x if x in valid_domains else 'flag')

# Suspicious emails
flags = []
for user_id in train.index:
    flags.append(flag_user(user_id))
    
train['flags_username'] = flags
train['flags_domain'] = train['domain'].apply(lambda x: 1 if x =='flag' else 0)
train['last_seen_active'] = today - train['last_session_creation_time']
train['account_age'] = today - train['creation_time']
#train[train['last_seen_active'].isna()]

#train['account_age_days'] = train['account_age'].dt.days.fillna(0.0).astype(int)

#days_old_flag = 550.0
#ratio = train['flags'][train['account_age_days'] >=days_old_flag].value_counts()[1] / train['flags'][train['account_age_days'] >=days_old_flag].value_counts()[0]

#print('flag to non-flag ratio {:.2f}'.format(ratio))

In [946]:
inactive_users = train[train['last_seen_active'].isnull()] 
active_users = train[train['last_seen_active'].notnull()] 

In [948]:
print('percentage of flags for inactive users')
print('flags_username', inactive_users['flags_username'].value_counts()[1] / inactive_users['flags_username'].value_counts()[0])
print('flags_domain', inactive_users['flags_domain'].value_counts()[1] / inactive_users['flags_domain'].value_counts()[0])


percentage of flags for inactive users
flags_username 0.17614424410540916
flags_domain 0.13470115967885815


In [949]:
print('percentage of flags for inactive users')
print('flags_username', active_users['flags_username'].value_counts()[1] / active_users['flags_username'].value_counts()[0])
print('flags_domain', active_users['flags_domain'].value_counts()[1] / active_users['flags_domain'].value_counts()[0])


percentage of flags for inactive users
flags_username 0.13806451612903226
flags_domain 0.10181136789506559


There is a ~5% increase in the ratio of flags for inative users to active ones for username, and a 3% increase from domains

In [964]:
inactive = train.loc[inactive_users.index]
active = train.loc[active_users.index]
inactive['last_seen_active'] = inactive['account_age'] 
inactive['flags_inactivity'] = 1
active['flags_inactivity'] = 0

train_new = pd.concat([active, inactive])

In [965]:
train_new

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted,username,domain,extention,flags_username,last_seen_active,account_age,flags_doman,flags_domain,flags_inactivity
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4880,2014-02-05 14:09:24,White Freya,zgglolpq@itusf.com,ORG_INVITE,2014-02-07 14:09:24,0,0,20,10318,False,zgglolpq,flag,com,1,119 days 00:49:26,121 days 00:49:26,1,1,0
83,2013-01-26 09:44:23,Araujo Gabriela,GabrielaFernandesAraujo@cuvox.de,ORG_INVITE,2013-01-27 09:44:23,1,0,172,9016,False,GabrielaFernandesAraujo,cuvox,de,0,495 days 05:14:27,496 days 05:14:27,0,0,0
4051,2014-05-29 14:58:50,Holm Alexander,AlexanderOHolm@gustr.com,ORG_INVITE,2014-06-06 14:58:50,0,1,47,5340,False,AlexanderOHolm,gustr,com,0,0 days 00:00:00,8 days 00:00:00,0,0,0
656,2013-01-24 15:50:50,Lowry Elise,EliseRLowry@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-24 15:50:50,1,1,42,0,False,EliseRLowry,gmail,com,0,497 days 23:08:00,497 days 23:08:00,0,0,0
2935,2014-04-06 05:15:42,Pfaff Frank,FrankPfaff@cuvox.de,SIGNUP,2014-04-09 05:15:42,1,1,394,0,False,FrankPfaff,cuvox,de,0,58 days 09:43:08,61 days 09:43:08,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6720,2014-02-09 00:27:11,Meier Ralph,RalphMeier@hotmail.com,ORG_INVITE,NaT,0,0,18,5723,False,RalphMeier,hotmail,com,0,117 days 14:31:39,117 days 14:31:39,0,0,1
7407,2012-08-12 13:39:11,Spears Delmar,muqvejim@jnkly.com,PERSONAL_PROJECTS,NaT,0,0,10,0,False,muqvejim,flag,com,1,663 days 01:19:39,663 days 01:19:39,1,1,1
11485,2014-04-27 08:14:30,Lind Amanda,rlgtdknm@ungql.com,PERSONAL_PROJECTS,NaT,0,0,348,0,False,rlgtdknm,flag,com,1,40 days 06:44:20,40 days 06:44:20,1,1,1
9010,2012-06-08 18:04:38,Gomes Otávio,OtavioCardosoGomes@yahoo.com,PERSONAL_PROJECTS,NaT,0,1,111,0,False,OtavioCardosoGomes,yahoo,com,1,727 days 20:54:12,727 days 20:54:12,0,0,1


In [951]:
train.index[0]

4880

In [952]:
flag_user(train.index[1])

0

In [953]:
train.iloc[0]

creation_time                 2014-02-05 14:09:24
name                                  White Freya
email                          zgglolpq@itusf.com
creation_source                        ORG_INVITE
last_session_creation_time    2014-02-07 14:09:24
opted_in_to_mailing_list                        0
enabled_for_marketing_drip                      0
org_id                                         20
invited_by_user_id                          10318
is_adopted                                  False
username                                 zgglolpq
domain                                       flag
extention                                     com
flags_username                                  1
last_seen_active                119 days 00:49:26
account_age                     121 days 00:49:26
flags_doman                                     1
flags_domain                                    1
Name: 4880, dtype: object

In [954]:
train.apply()

TypeError: apply() missing 1 required positional argument: 'func'

In [874]:
flagged_domain = train[train['domain'] == 'flag']
normal_domain = train[train['domain'] != 'flag']

In [762]:
np.log(flagged_domain['account_age_days'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


object_id
4880     0.693147
3747         -inf
1286         -inf
2422     0.000000
6926     0.000000
           ...   
8979         -inf
7407         -inf
11485        -inf
10099        -inf
1358         -inf
Name: account_age_days, Length: 954, dtype: float64

In [872]:
flagged_domain

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted,addresses,username,domain,extention,flags,account_age,account_age_days
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
4880,2014-02-05 14:09:24,White Freya,zgglolpq@itusf.com,ORG_INVITE,2014-02-07 14:09:24,0,0,20,10318,False,"(zgglolpq, itusf, com)",zgglolpq,flag,com,True,2 days,2.0
3747,2014-03-23 06:22:10,Parry Louis,pcoqljlb@avfnw.com,ORG_INVITE,2014-03-23 06:22:10,0,0,11,7040,False,"(pcoqljlb, avfnw, com)",pcoqljlb,flag,com,True,0 days,0.0
1286,2013-10-19 09:20:23,Svendsen Peter,wlrbdngr@rhmlt.com,SIGNUP,2013-10-19 09:20:23,1,0,220,0,False,"(wlrbdngr, rhmlt, com)",wlrbdngr,flag,com,True,0 days,0.0
2422,2014-01-15 22:30:53,Dahl Mette,mhbkxgpe@ykbci.com,SIGNUP,2014-01-16 22:30:53,0,0,18,0,False,"(mhbkxgpe, ykbci, com)",mhbkxgpe,flag,com,True,1 days,1.0
6926,2012-10-26 12:29:09,Goncalves Julia,jlcbxnvs@ceneg.com,PERSONAL_PROJECTS,2012-10-27 12:29:09,1,0,3,0,False,"(jlcbxnvs, ceneg, com)",jlcbxnvs,flag,com,True,1 days,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8979,2013-11-27 00:15:44,Østergaard Benjamin,latrfzir@jvdde.com,PERSONAL_PROJECTS,NaT,0,0,14,0,False,"(latrfzir, jvdde, com)",latrfzir,flag,com,True,NaT,0.0
7407,2012-08-12 13:39:11,Spears Delmar,muqvejim@jnkly.com,PERSONAL_PROJECTS,NaT,0,0,10,0,False,"(muqvejim, jnkly, com)",muqvejim,flag,com,True,NaT,0.0
11485,2014-04-27 08:14:30,Lind Amanda,rlgtdknm@ungql.com,PERSONAL_PROJECTS,NaT,0,0,348,0,False,"(rlgtdknm, ungql, com)",rlgtdknm,flag,com,True,NaT,0.0
10099,2013-05-28 08:30:48,Nielsen Naja,tnkjosyo@xycmg.com,ORG_INVITE,NaT,1,1,0,1420,False,"(tnkjosyo, xycmg, com)",tnkjosyo,flag,com,True,NaT,0.0


In [736]:
import seaborn as sns
sns.boxplot(data = train[train['domain'] == 'flag']['account_age_days'])

KeyError: 0

In [732]:
first_name, last_name = train['name'].iloc[1].split()
re.match(first_name, train['username'].iloc[1])

In [645]:
re.match(last_name, train['username'].iloc[1])

'GiovanaAraujoMartins'

In [638]:
train['username'].iloc[1]

'GiovanaAraujoMartins'

In [632]:
train['username'].value_counts()

GabriellyBarbosaAlves    2
EdwardMcEvilly           2
LisaBaader               2
ThomasBrandt             2
BrunaCunhaCarvalho       2
                        ..
SpencerMackrell          1
yhmpqisq                 1
BrendaCardosoBarros      1
HarrisonLamb             1
DouglasJRucker           1
Name: username, Length: 9548, dtype: int64

gmail         2852
yahoo         1940
cuvox          980
jourrapide     976
sus            954
gustr          953
hotmail        945
Name: domain, dtype: int64

# Get a small model going


## Feature Engineering

In [506]:
train.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4880,2014-02-05 14:09:24,White Freya,zgglolpq@itusf.com,ORG_INVITE,2014-02-07 14:09:24,0,0,20,10318,False
9399,2013-05-27 11:20:43,Martins Giovana,GiovanaAraujoMartins@jourrapide.com,ORG_INVITE,NaT,0,1,57,1055,False
83,2013-01-26 09:44:23,Araujo Gabriela,GabrielaFernandesAraujo@cuvox.de,ORG_INVITE,2013-01-27 09:44:23,1,0,172,9016,False
4051,2014-05-29 14:58:50,Holm Alexander,AlexanderOHolm@gustr.com,ORG_INVITE,2014-06-06 14:58:50,0,1,47,5340,False
656,2013-01-24 15:50:50,Lowry Elise,EliseRLowry@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-24 15:50:50,1,1,42,0,False


In [507]:
train['creation_source'][users['invited_by_user_id'].isna()].value_counts()

Series([], Name: creation_source, dtype: int64)

In [508]:
train['creation_source'][users['invited_by_user_id'].notna()].value_counts()

ORG_INVITE            3454
GUEST_INVITE          1707
PERSONAL_PROJECTS     1658
SIGNUP                1655
SIGNUP_GOOGLE_AUTH    1126
Name: creation_source, dtype: int64

In [509]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9600 entries, 4880 to 8426
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   creation_time               9600 non-null   datetime64[ns]
 1   name                        9600 non-null   object        
 2   email                       9600 non-null   object        
 3   creation_source             9600 non-null   object        
 4   last_session_creation_time  7056 non-null   datetime64[ns]
 5   opted_in_to_mailing_list    9600 non-null   int64         
 6   enabled_for_marketing_drip  9600 non-null   int64         
 7   org_id                      9600 non-null   int64         
 8   invited_by_user_id          9600 non-null   int64         
 9   is_adopted                  9600 non-null   bool          
dtypes: bool(1), datetime64[ns](2), int64(4), object(3)
memory usage: 759.4+ KB


In [510]:
train['total_session_time'] = train['last_session_creation_time'] - train['creation_time']

In [511]:
train['creation_source'].value_counts()

ORG_INVITE            3454
GUEST_INVITE          1707
PERSONAL_PROJECTS     1658
SIGNUP                1655
SIGNUP_GOOGLE_AUTH    1126
Name: creation_source, dtype: int64

In [512]:
train['name'].value_counts()

Araujo Gabriela     5
Carvalho Bruna      4
Cunha Melissa       4
Costa Lavinia       4
Oliveira Rafael     3
                   ..
Frei Marko          1
Connolly Nathan     1
Sanderson Archie    1
Walters Joe         1
Rucker Douglas      1
Name: name, Length: 9183, dtype: int64

In [513]:
train

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted,total_session_time
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4880,2014-02-05 14:09:24,White Freya,zgglolpq@itusf.com,ORG_INVITE,2014-02-07 14:09:24,0,0,20,10318,False,2 days
9399,2013-05-27 11:20:43,Martins Giovana,GiovanaAraujoMartins@jourrapide.com,ORG_INVITE,NaT,0,1,57,1055,False,NaT
83,2013-01-26 09:44:23,Araujo Gabriela,GabrielaFernandesAraujo@cuvox.de,ORG_INVITE,2013-01-27 09:44:23,1,0,172,9016,False,1 days
4051,2014-05-29 14:58:50,Holm Alexander,AlexanderOHolm@gustr.com,ORG_INVITE,2014-06-06 14:58:50,0,1,47,5340,False,8 days
656,2013-01-24 15:50:50,Lowry Elise,EliseRLowry@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-24 15:50:50,1,1,42,0,False,0 days
...,...,...,...,...,...,...,...,...,...,...,...
3190,2014-01-10 21:51:06,Edgerton Jack,JackEdgerton@gmail.com,SIGNUP_GOOGLE_AUTH,2014-01-10 21:51:06,0,0,167,0,False,0 days
1358,2013-12-24 05:24:22,Hoffmann Peter,smjhhror@uznoh.com,SIGNUP,2013-12-24 05:24:22,0,0,276,0,False,0 days
11084,2013-02-23 20:11:35,Nepean Ben,BenNepean@gmail.com,GUEST_INVITE,2013-04-09 20:11:35,1,0,5,8487,False,45 days
7853,2014-04-24 07:15:58,Barrallier Stella,StellaBarrallier@gustr.com,GUEST_INVITE,2014-04-25 07:15:58,0,0,178,2751,False,1 days


## Modeling

In [514]:
X_train, X_val, y_train, y_val = train_test_split(train.drop('is_adopted', axis=1), train['is_adopted'], train_size=0.8, random_state=0, stratify=train['is_adopted'])

In [515]:
y_val.value_counts()

False    1689
True      231
Name: is_adopted, dtype: int64

In [516]:
X_train['invited_by_user_id'].fillna(value=0).astype(int)

object_id
36        6805
1728      1728
8089      7375
3160         0
9667     11750
         ...  
10571     2251
11626        0
11212        0
3458         0
1974      8494
Name: invited_by_user_id, Length: 7680, dtype: int64

In [301]:
target

1        False
2         True
3        False
4        False
5        False
         ...  
11996    False
11997    False
11998    False
11999    False
12000    False
Length: 8823, dtype: bool

In [302]:
adopted_user(users_na.index[0])

False

In [303]:
%%time
users_na['is_adopted'] = users_na['object_id'].apply(lambda x: adopted_user(x))


KeyError: 'object_id'

In [261]:
users_na['object_id'].iloc[0]

1

time_stamp    0
user_id       0
visited       0
dtype: int64