## Onboarding Impact Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_path = "/Users/Owner/Documents/Work_transfer/CLSep/Onboarding Analysis/Onboarding Data/"

In [3]:
onboardcolumns = ['User', 'Onboard', 'Account.Created', 'Onboard.Time', 'Last.Action']
comcolumns = ['User','Type', 'Comment.Time']
colleaguescolumns = ['User', 'User2', 'Friend.Time']
grpcolumns = ['User', 'Group', 'Join.Time']



In [4]:
#Importing the four files that will be joined eventually

ob = pd.read_csv(data_path+'Onboarding Messages and Users.csv', names = onboardcolumns)
com = pd.read_csv(data_path+'Discussions Blogs.csv', names = comcolumns)
col = pd.read_csv(data_path+'Colleague Counts.csv', names = colleaguescolumns)
grp = pd.read_csv(data_path+'Groups.csv', names = grpcolumns)

In [5]:
#How many people have created an account since the onboarding process?

usercom = com.groupby('User', as_index = False).count()
usercol = col.groupby('User', as_index = False).count()
usergrp = grp.groupby('User', as_index = False).count()

In [6]:
usercom = usercom.drop('Comment.Time',axis = 1)
usercom.columns = ['User', 'Comments']
usercol = usercol.drop('User2', axis = 1)
usercol.columns = ['User', 'Colleagues']
usergrp = usergrp.drop('Join.Time', axis  = 1)
usergrp.columns = ['User', 'Groups']

In [7]:
users = pd.merge(ob, usercom, how = 'outer')
users = pd.merge(users, usercol, how = 'outer')
users = pd.merge(users, usergrp, how = 'outer')

In [8]:
users.head()
onboarded = users[users['Onboard'] == 'true']

In [9]:
ob.count()

User               5416
Onboard            5416
Account.Created    5416
Onboard.Time       5416
Last.Action        5416
dtype: int64

In [10]:
onboarded = onboarded.fillna(0)
onboarded.tail()

Unnamed: 0,User,Onboard,Account.Created,Onboard.Time,Last.Action,Comments,Colleagues,Groups
5391,19879834,True,2016-03-18,2016-10-28,2016-03-18,0,0,3
5395,23814834,True,2016-10-28,2016-10-28,2016-10-28,0,0,1
5400,834686,True,2013-07-29,2016-10-28,2016-09-16,1,64,34
5403,6188649,True,2014-09-30,2016-10-28,2016-06-14,0,3,5
5405,4665274,True,2014-05-09,2016-10-28,2016-10-28,0,16,4


In [11]:
onboardedoncreation = onboarded[onboarded['Account.Created'] >= '2016-10-18']

In [12]:
onboardedoncreation.describe()

Unnamed: 0,User,Comments,Colleagues,Groups
count,297.0,297.0,297.0,297.0
mean,23682491.006734,0.013468,0.373737,1.646465
std,65591.156446,0.115462,0.968272,1.141773
min,23534414.0,0.0,0.0,0.0
25%,23630767.0,0.0,0.0,1.0
50%,23673019.0,0.0,0.0,1.0
75%,23728192.0,0.0,0.0,2.0
max,23814834.0,1.0,6.0,7.0


In [13]:
notonboarded = users[users['Onboard'] == 'passed']
notonboarded = notonboarded.fillna(0)
notonboardedoncreation = notonboarded[notonboarded['Account.Created'] >= '2016-10-18']

In [14]:
notonboardedoncreation.describe()

Unnamed: 0,User,Comments,Colleagues,Groups
count,134.0,134.0,134.0,134.0
mean,23681587.395522,0.029851,0.402985,1.634328
std,70480.002051,0.272562,1.343989,1.259795
min,23533414.0,0.0,0.0,0.0
25%,23631322.25,0.0,0.0,1.0
50%,23670245.5,0.0,0.0,1.0
75%,23740584.5,0.0,0.0,2.0
max,23815270.0,3.0,9.0,10.0


From averages, it looks like there hasn't been an impact on actual usage of onboarding process for people who create their account. Possible reasons for this is that maybe people who skip the onboarding process don't feel it necessary to be walked through the process.

Also outliers. This is just a really quick look, and doesn't mean anything. Additionally, sample sizes are different

In [15]:
notonboardedoncreation = notonboardedoncreation.reset_index()

In [16]:
onboardedoncreation = onboardedoncreation.reset_index()
notonboardedoncreat = notonboardedoncreation.reset_index()

In [17]:
#Visualizing how people who have joined since onboarding was implemented behave
#There really isn't anything groundbreaking we can show here to be honest
data = [onboardedoncreation['Groups'], notonboardedoncreation['Groups'],
       onboardedoncreation['Comments'], notonboardedoncreation['Comments'],
       onboardedoncreation['Colleagues'], notonboardedoncreation['Colleagues']]



xaxes = ['Groups','Groups', 'Comments', 'Comments', 'Colleagues', 'Colleagues']
yaxes = ['Frequency','Frequency', 'Frequency','Frequency','Frequency','Frequency']
titles = ['Groups Joined Users Onboarded','Groups Joined Users Not Onboarded',
         'Comments  Users Onboarded','Comments  Users Not Onboarded',
         'Colleagues Added Users Onboarded','Colleagues Added Users Not Onboarded'] 

f,a = plt.subplots(3,2)
a = a.ravel()
for idx,ax in enumerate(a):
    ax.hist(data[idx])
    ax.set_title(titles[idx])
    ax.set_xlabel(xaxes[idx])
    ax.set_ylabel(yaxes[idx])
plt.tight_layout()
plt.show()

In [18]:
users = users.fillna(0)

In [19]:
def zubat(x):
        if type(x) == str:
            return 2
        else:
            return x
        


In [20]:
users['Onboard'][users['Onboard'] == 'passed'] = 0
users['Onboard'][users['Onboard'] == 'true'] = 1
users['Onboard'] = users['Onboard'].apply(zubat)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [21]:
#Only 5416 people have been exposed to the onboarding process. Kind of throws off our numbers a bit

In [22]:
users = users[users['Account.Created'] != 0]
users.describe()

Unnamed: 0,User,Onboard,Comments,Colleagues,Groups
count,5416.0,5416.0,5416.0,5416.0,5416.0
mean,12459925.707164,1.292836,1.715473,11.140879,6.72212
std,8794428.035131,0.846432,9.816926,37.098107,10.183658
min,6.0,0.0,0.0,0.0,0.0
25%,4320210.0,0.0,0.0,0.0,1.0
50%,12460378.0,2.0,0.0,2.0,3.0
75%,21422112.75,2.0,0.0,10.0,8.0
max,23815270.0,2.0,324.0,1176.0,140.0


In [23]:
def separator(x):
    if x >= '2016-10-18':
        return 1
    else:
        return 0
    


In [24]:
users['CreateBeforeOnboarding'] = users['Account.Created'].apply(separator)

In [25]:
users.describe()

Unnamed: 0,User,Onboard,Comments,Colleagues,Groups,CreateBeforeOnboarding
count,5416.0,5416.0,5416.0,5416.0,5416.0,5416.0
mean,12459925.707164,1.292836,1.715473,11.140879,6.72212,0.144202
std,8794428.035131,0.846432,9.816926,37.098107,10.183658,0.351327
min,6.0,0.0,0.0,0.0,0.0,0.0
25%,4320210.0,0.0,0.0,0.0,1.0,0.0
50%,12460378.0,2.0,0.0,2.0,3.0,0.0
75%,21422112.75,2.0,0.0,10.0,8.0,0.0
max,23815270.0,2.0,324.0,1176.0,140.0,1.0


In [26]:
users0 = users[users['Onboard'] == 0]
users1 = users[users['Onboard'] == 1]
users2 = users[users['Onboard'] == 2]

In [27]:
from sklearn import linear_model

In [28]:
u0_x = users.drop(['User', 'Onboard', 'Onboard.Time', 'Last.Action', 'Account.Created'], axis = 1)
u0_y = users['Onboard']

clf0 = linear_model.LinearRegression()
clf0.fit(u0_x, u0_y)
print (clf0.coef_)

[-0.00042583 -0.0001708  -0.00212614 -0.03489531]


In [29]:
regression_frame = users

In [30]:
regression_frame['Onboard*Create'] = regression_frame['Onboard']*regression_frame['CreateBeforeOnboarding']

In [31]:
def interactioner(x):
    return x*regression_frame['CreateBeforeOnboarding']

regression_frame['Comments*Create'] = regression_frame['Comments']*regression_frame['CreateBeforeOnboarding']
regression_frame['Colleagues*Create'] = regression_frame['Colleagues']*regression_frame['CreateBeforeOnboarding']
regression_frame['Groups*Create'] = regression_frame['Groups']*regression_frame['CreateBeforeOnboarding']

In [32]:
regression_frame.head()

Unnamed: 0,User,Onboard,Account.Created,Onboard.Time,Last.Action,Comments,Colleagues,Groups,CreateBeforeOnboarding,Onboard*Create,Comments*Create,Colleagues*Create,Groups*Create
0,2839,1,2009-05-20,2016-10-18,2016-09-19,4,51,17,0,0,0,0,0
1,12509067,2,2015-06-15,2016-10-18,2016-10-27,0,99,9,0,0,0,0,0
2,16860392,2,2015-12-02,2016-10-18,2015-12-06,0,1,0,0,0,0,0,0
3,4275280,0,2014-03-07,2016-10-19,2016-10-28,25,34,19,0,0,0,0,0
4,4275270,1,2014-03-07,2016-10-19,2016-10-26,4,35,10,0,0,0,0,0


In [33]:
create_rf = regression_frame[regression_frame['CreateBeforeOnboarding'] == 1]

In [40]:
create_rf_y_grp = create_rf['Groups']
create_rf_x_grp = create_rf['Onboard']


clf_grp = linear_model.LinearRegression()
clf_grp.fit(create_rf_x_grp.reshape(len(create_rf_x_grp), 1), create_rf_y_grp)
print (clf_grp.coef_)

[-0.13878762]


In [53]:
create_rf.head()

Unnamed: 0,User,Onboard,Account.Created,Onboard.Time,Last.Action,Comments,Colleagues,Groups,CreateBeforeOnboarding,Onboard*Create,Comments*Create,Colleagues*Create,Groups*Create
14,23595485,1,2016-10-19,2016-10-19,2016-10-19,0,0,0,1,1,0,0,0
19,23595498,1,2016-10-19,2016-10-19,2016-10-19,0,0,1,1,1,0,0,1
38,23595528,1,2016-10-19,2016-10-19,2016-10-19,0,0,0,1,1,0,0,0
45,23595339,2,2016-10-18,2016-10-19,2016-10-18,0,0,0,1,2,0,0,0
46,23595547,2,2016-10-19,2016-10-19,2016-10-19,0,0,0,1,2,0,0,0


In [42]:
import statsmodels.formula.api as sm

In [45]:
result = sm.OLS(create_rf_y_grp, create_rf_x_grp).fit()
result.summary()

0,1,2,3
Dep. Variable:,Groups,R-squared:,0.467
Model:,OLS,Adj. R-squared:,0.467
Method:,Least Squares,F-statistic:,684.3
Date:,"Wed, 02 Nov 2016",Prob (F-statistic):,8.8e-109
Time:,10:02:42,Log-Likelihood:,-1351.8
No. Observations:,781,AIC:,2706.0
Df Residuals:,780,BIC:,2710.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Onboard,0.8680,0.033,26.159,0.000,0.803 0.933

0,1,2,3
Omnibus:,312.255,Durbin-Watson:,1.749
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1631.058
Skew:,1.754,Prob(JB):,0.0
Kurtosis:,9.15,Cond. No.,1.0


In [52]:
results_1 = sm.ols('Groups ~ Onboard', data = create_rf).fit()
results_2 = sm.ols('Comments ~ Onboard', data = create_rf).fit()
results_3 = sm.ols('Colleagues ~ Onboard', data = create_rf).fit()

print (results_1.summary())
print (results_2.summary())
print (results_3.summary())

                            OLS Regression Results                            
Dep. Variable:                 Groups   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     7.201
Date:                Wed, 02 Nov 2016   Prob (F-statistic):            0.00744
Time:                        10:10:17   Log-Likelihood:                -1156.6
No. Observations:                 781   AIC:                             2317.
Df Residuals:                     779   BIC:                             2326.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.7137      0.076     22.478      0.0

In [58]:
#Need to control for time

create_rf['Account.Created'] = pd.to_datetime(create_rf['Account.Created'], format = '%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [66]:
create_rf['Time'] = pd.to_datetime('2016-10-28', format = '%Y-%m-%d') - create_rf['Account.Created']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [77]:
create_rf['Time'] = (create_rf['Time']/np.timedelta64(1,'D')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [78]:
#Seems like there just isn't enough variation in the observations to do anything

results_1 = sm.ols('Groups ~ Onboard + Time', data = create_rf).fit()
results_2 = sm.ols('Comments ~ Onboard + Time', data = create_rf).fit()
results_3 = sm.ols('Colleagues ~ Onboard + Time', data = create_rf).fit()

print (results_1.summary())
print (results_2.summary())
print (results_3.summary())

                            OLS Regression Results                            
Dep. Variable:                 Groups   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     3.745
Date:                Wed, 02 Nov 2016   Prob (F-statistic):             0.0241
Time:                        10:32:42   Log-Likelihood:                -1156.4
No. Observations:                 781   AIC:                             2319.
Df Residuals:                     778   BIC:                             2333.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.7455      0.096     18.158      0.0

Regression Analysis doesn't says that there is no real statistical difference that onboarding makes in the activity of groups colleagues and comments.

It's not really that surprising. A better metric would be to see if they feel more comfortable with the page, and maybe how often they log on. But we can't match Google Analytics data anyway