# Relax Inc. Take Home Practice

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Introduction

# Predicting User Adoption in an Online Platform

The goal of this project is to identify factors that predict future user adoption in an online platform. We have two main datasets:

1. **User Data**: This dataset contains information about 12,000 users who signed up for the product. It includes user details such as name, email, account creation source, creation time, last session creation time, and more.

2. **User Engagement Data**: This dataset contains a summary of user activity, including login dates.

To define an "adopted user," we consider a user who has logged into the product on three separate days within at least one seven-day period. The task is to identify which factors predict whether a user will become an adopted user.

In this analysis, we will perform data cleaning, exploratory data analysis, and build predictive models to understand the key indicators of user adoption. The insights gained will help the platform improve long-term user retention.

Let's start by loading the data and conducting initial data exploration.


In [2]:
# Load 'takehome_users.csv' into a DataFrame
users = pd.read_csv('takehome_users.csv', encoding='latin-1')

# Load 'takehome_user_engagement.csv' into a DataFrame
user_engagement = pd.read_csv('takehome_user_engagement.csv')

In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
#Rename object id as user id for easier processing
users.rename(columns={'object_id': 'user_id'}, inplace=True)

In [5]:
#Count nulls in each column of users
null_counts = users.isnull().sum()
print(null_counts)

user_id                          0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
dtype: int64


In [6]:
unique_users = users['user_id'].nunique()
unique_users

12000

In [7]:
# Drop the email& name columns. User Id will be the unique identifier between users
users.drop(columns=['email', 'name'], inplace=True)

# Set 'user_id' as the index
users.set_index('user_id', inplace=True)

In [8]:
#Convert creation_time column to datetime type
users['creation_time'] = pd.to_datetime(users['creation_time'])

users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], unit='s')

## Dealing with Null Values

In the dataset, there were null values in the "invited_by_user_id" column. We decided to create a new binary column named "invited_by_user" to indicate whether a user was referred by another user. Users with a non-null "invited_by_user_id" were assigned the value 1, indicating that they were referred by another user, while users with null values in "invited_by_user_id" were assigned the value 0, indicating that they were not referred by anyone. This approach allowed us to retain the information about whether a user was referred without needing to identify the specific user who referred them.

By doing so, we handled the null values in a way that aligns with the binary nature of the information we were interested in, which is whether or not a user was referred. This approach simplifies the dataset while preserving the relevant information.


In [9]:
# Create a new column "invited_by_user" with 1 for non-null values and 0 for null values
users['invited_by_user'] = users['invited_by_user_id'].notna().astype(int)

# Now, drop the original column
users.drop('invited_by_user_id', axis=1, inplace=True)

In [10]:
users.head()

Unnamed: 0_level_0,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2014-04-22 03:53:30,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,1
2,2013-11-15 03:45:04,ORG_INVITE,2014-03-31 03:45:04,0,0,1,1
3,2013-03-19 23:14:52,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1
4,2013-05-21 08:09:28,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,1
5,2013-01-17 10:14:20,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,1


In [11]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [12]:
unique_users = user_engagement['user_id'].nunique()
unique_users

8823

In our analysis, we've addressed null values in the dataset. For the "invited_by_user_id" column, we created a new binary column, "invited_by_user," to indicate whether users were invited by another user. Null values in this column were treated as uninvited users.

Regarding the "last_session_creation_time" column, it's important to note that the 8823 non-null values correspond to users who have logged into the product at least once, while the remaining 3177 null values indicate users who have not logged in since their account creation. This column will be dropped before modeling, as it was used solely for identifying adopted users.

Our focus now is to identify adopted users based on user engagement data.


In [13]:
# Create the target variable column and set all values to 0
users['adopted_user'] = 0

users.head()

Unnamed: 0_level_0,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user,adopted_user
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2014-04-22 03:53:30,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,1,0
2,2013-11-15 03:45:04,ORG_INVITE,2014-03-31 03:45:04,0,0,1,1,0
3,2013-03-19 23:14:52,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1,0
4,2013-05-21 08:09:28,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,1,0
5,2013-01-17 10:14:20,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,1,0


In [14]:
# Find the latest date in user_engagement
latest_date = user_engagement['time_stamp'].max()

# Calculate the duration using the latest date as the reference
users['usage_duration'] = (latest_date - users['creation_time']).dt.days

TypeError: unsupported operand type(s) for -: 'DatetimeArray' and 'str'