# Relax Data Science Challenge




We are given the data of the users and their engagement data. We must find the most important factors contributing to user adoption.
Let us first read in the user-engagement data and filter those users who have accessed the product atleast thrice on three separate days in a seven day period.

#### Import necessary libraries

As this is an investigative study, we will import and use libraries as and when necessary.

In [1]:
import pandas as pd
import numpy as np 

#### Read in the user-engagement data and explore it.

In [2]:
user_engagement = pd.read_csv('../relax_challenge/takehome_user_engagement.csv')
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


#### Deal with dates
We convert the 'timestamp' column into the appropriate datatype and also group the engagement data by date(as multiple times within the same day counts as once towards adoption)

In [3]:
user_engagement['time_stamp'] = user_engagement['time_stamp'].astype('datetime64')
user_engagement['date'] = user_engagement['time_stamp'].dt.date
user_engagement = user_engagement.groupby(['date','user_id']).sum().reset_index()

#### Demarcate Adopted users

In [4]:
condition = user_engagement.sort_values(['user_id', 'date']) - user_engagement.sort_values(['user_id', 'date']).shift(-2)
adopted_users = user_engagement.sort_values(['user_id', 'date'])[(condition['date'].dt.days >= -7) & (condition['user_id'] == 0)]['user_id'].unique()

#### Read in the users data and convert dates to appropriate datatypes

In [5]:
user_data = pd.read_csv('../relax_challenge/takehome_users.csv', encoding = 'latin-1')
user_data['creation_time'] = user_data['creation_time'].astype('datetime64')

In addition to the features given in the dataset, we add an additional feature which indicates if the user was invited by an adopted user.
We then drop other details such as personal information so that they will not interfere in our predictive modelling. We also onehot encode the 'creation_source' variable.

In [6]:
user_data['invited_by_adopted_user'] = user_data['invited_by_user_id'].map(lambda x: int(x in adopted_users))
user_data = pd.concat([user_data, pd.get_dummies(user_data['creation_source'])], axis = 1)
user_data['adopted_user'] = user_data['object_id'].map(lambda x: int(x in adopted_users))
user_data.drop(['name', 'email', 'creation_source', 'last_session_creation_time', 'object_id', 'invited_by_user_id', 'org_id', 'creation_time'], axis = 1, inplace = True)

#### Logistic Regression

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
X = user_data.values[:,:-1]
y = user_data.values[:,-1]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle = True)
Cs = [0.001, 0.1, 1, 10, 100]
for C in Cs:
    clf = LogisticRegression(C = C)
    clf.fit(X_train, y_train)
    Y_pred_valid = clf.predict(X_valid)
    print(classification_report(y_valid, Y_pred_valid))

clf = LogisticRegression()
clf.fit(X_train, y_train)
Y_pred_valid = clf.predict(X_valid)
print(classification_report(y_valid, Y_pred_valid))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92      2581
           1       0.00      0.00      0.00       419

    accuracy                           0.86      3000
   macro avg       0.43      0.50      0.46      3000
weighted avg       0.74      0.86      0.80      3000

              precision    recall  f1-score   support

           0       0.86      1.00      0.92      2581
           1       0.00      0.00      0.00       419

    accuracy                           0.86      3000
   macro avg       0.43      0.50      0.46      3000
weighted avg       0.74      0.86      0.80      3000

              precision    recall  f1-score   support

           0       0.86      1.00      0.92      2581
           1       0.00      0.00      0.00       419

    accuracy                           0.86      3000
   macro avg       0.43      0.50      0.46      3000
weighted avg       0.74      0.86      0.80      3000

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### SMOTE analysis

In [8]:
import imblearn.over_sampling as imb
oversample = imb.SMOTE()
X, y = oversample.fit_resample(X, y)

Using TensorFlow backend.


In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle = True)
Cs = [0.001, 0.1, 1, 10, 100]
for C in Cs:
    clf = LogisticRegression(C = C)
    clf.fit(X_train, y_train)
    Y_pred_valid = clf.predict(X_valid)
    print(classification_report(y_valid, Y_pred_valid))

clf = LogisticRegression()
clf.fit(X_train, y_train)
Y_pred_valid = clf.predict(X_valid)
print(classification_report(y_valid, Y_pred_valid))

              precision    recall  f1-score   support

           0       0.57      0.48      0.52      2602
           1       0.55      0.64      0.59      2570

    accuracy                           0.56      5172
   macro avg       0.56      0.56      0.55      5172
weighted avg       0.56      0.56      0.55      5172

              precision    recall  f1-score   support

           0       0.57      0.48      0.52      2602
           1       0.55      0.64      0.59      2570

    accuracy                           0.56      5172
   macro avg       0.56      0.56      0.55      5172
weighted avg       0.56      0.56      0.55      5172

              precision    recall  f1-score   support

           0       0.57      0.48      0.52      2602
           1       0.55      0.64      0.59      2570

    accuracy                           0.56      5172
   macro avg       0.56      0.56      0.55      5172
weighted avg       0.56      0.56      0.55      5172

              preci

#### Decision Tree Classifier

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
X = user_data.values[:,:-1]
y = user_data.values[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size = 0.1)
clf = DecisionTreeClassifier()
param_grid = {"criterion": ["gini", "entropy"], 
                       "max_depth": [1,2,.3,4,5,6,None], 
                       "max_features": ["sqrt", "log2", None], 
                       "random_state" : [5], 
                       "class_weight" : ["balanced", {0: 1, 1:10}, {0:1, 1:15}], "presort": [True]}
gs = GridSearchCV(estimator = clf, 
                   param_grid = param_grid, 
                   n_jobs = 2, 
                   cv = None)
gs.fit(X_train, y_train)
best_clf = gs.best_estimator_
Y_test = best_clf.predict(X_test)
print(classification_report(y_test, Y_test))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84      1037
           1       0.17      0.24      0.20       163

    accuracy                           0.74      1200
   macro avg       0.52      0.53      0.52      1200
weighted avg       0.78      0.74      0.76      1200





In [11]:
import graphviz.backend as be
from dtreeviz.trees import *
viz = dtreeviz(best_clf, 
               X, 
               y,
               target_name = 'Adopted User',
               feature_names = user_data.columns[:-1],
               class_names=["Adopted User", "Not adopted user"],
              fancy=False )

viz.view()

In [12]:
?viz.view