## Onboarding Impact Analysis

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Directory holding the exported CSV files. Defaults to the original local
# folder; set the ONBOARDING_DATA_DIR environment variable to run the notebook
# on another machine. os.path.join(..., "") guarantees a trailing separator,
# which the `data_path + filename` concatenations in the loading cell rely on.
data_path = os.path.join(
    os.environ.get(
        "ONBOARDING_DATA_DIR",
        "/Users/Owner/Documents/Work_transfer/CLSep/Onboarding Analysis/Onboarding Files update/",
    ),
    "",
)

In [None]:
# Column names for the four CSV exports; each list is passed to
# `pd.read_csv(..., names=...)` when the matching file is loaded.
onboardcolumns = [
    "User",
    "Onboard",
    "Account.Created",
    "Onboard.Time",
    "Last.Action",
]
comcolumns = ["User", "Type", "Comment.Time"]
colleaguescolumns = ["User", "User2", "Friend.Time"]
grpcolumns = ["User", "Group", "Join.Time"]



In [None]:
# Load the four exports that are joined into one per-user table further down.
# NOTE(review): `names=` is given without `header=`, so pandas treats the first
# line of each file as data -- confirm the exports carry no header row.
ob, com, col, grp = (
    pd.read_csv(data_path + filename, names=columns)
    for filename, columns in (
        ('Onboarding Messages and Users.csv', onboardcolumns),
        ('Discussions Blogs.csv', comcolumns),
        ('Colleague Counts.csv', colleaguescolumns),
        ('Groups.csv', grpcolumns),
    )
)

In [None]:
#How many people have created an account since the onboarding process?

# Per-user tallies: one row per User, with every other column replaced by that
# user's count of non-null values in the source table.
usercom, usercol, usergrp = (
    activity.groupby('User', as_index=False).count()
    for activity in (com, col, grp)
)

In [None]:
# Each counted table has two count columns besides 'User'; drop one and give
# the survivor a descriptive name (Type -> Comments, Friend.Time -> Colleagues,
# Group -> Groups).
usercom = usercom.drop('Comment.Time', axis=1).rename(columns={'Type': 'Comments'})
usercol = usercol.drop('User2', axis=1).rename(columns={'Friend.Time': 'Colleagues'})
usergrp = usergrp.drop('Join.Time', axis=1).rename(columns={'Group': 'Groups'})

In [None]:
# Outer-join the onboarding table with the three per-user count tables.
# No `on=` is given, so each merge keys on the shared column(s); given the
# column names set up earlier, that is 'User' only.
users = (
    ob
    .merge(usercom, how='outer')
    .merge(usercol, how='outer')
    .merge(usergrp, how='outer')
)

In [None]:
# Rows whose Onboard flag is the string 'true' (presumably users who went
# through onboarding -- confirm against the export's definition).
onboarded = users[users['Onboard'] == 'true']

# Preview the merged table. A notebook only renders the cell's last
# expression, so the preview goes last.
users.head()

In [None]:
# Non-null value count for every column of the onboarding export.
ob.count()

In [None]:
# Replace every missing value with 0, so users absent from a count table get
# zero activity. Note this fills missing values in ALL columns, not only the
# count columns.
onboarded = onboarded.fillna(0)
onboarded.tail()

In [None]:
# Keep accounts created on or after 2016-10-18. This is a string comparison on
# the raw date text, which orders correctly for YYYY-MM-DD values.
created_on_or_after_cutoff = onboarded['Account.Created'] >= '2016-10-18'
onboardedoncreation = onboarded[created_on_or_after_cutoff]

In [None]:
# Summary statistics for the Onboard == 'true' rows created on/after 2016-10-18.
onboardedoncreation.describe()

In [None]:
# Same cohort construction as for the onboarded users, but for rows whose
# Onboard flag is 'passed': fill missing values with 0, then keep accounts
# created on or after 2016-10-18 (string comparison on the date text).
notonboarded = users[users['Onboard'] == 'passed'].fillna(0)
passed_on_or_after_cutoff = notonboarded['Account.Created'] >= '2016-10-18'
notonboardedoncreation = notonboarded[passed_on_or_after_cutoff]

In [None]:
# Summary statistics for the Onboard == 'passed' rows created on/after 2016-10-18.
notonboardedoncreation.describe()

Judging from the averages, the onboarding process does not appear to have had an impact on actual usage for people who created their account since it was introduced. One possible reason is that people who skip the onboarding process may not feel they need to be walked through it.

There are also outliers. This is just a really quick look and, on its own, doesn't mean anything. Additionally, the sample sizes of the two groups are different.

In [None]:
# Renumber rows from 0. Without drop=True the previous index is kept as a new
# 'index' column.
# NOTE(review): re-running this cell adds a further index column each time.
notonboardedoncreation = notonboardedoncreation.reset_index()

In [None]:
# Renumber rows from 0. Without drop=True the previous index is kept as a new
# 'index' column.
# NOTE(review): re-running this cell adds a further index column each time.
onboardedoncreation = onboardedoncreation.reset_index()

In [None]:
#Visualizing how people who have joined since onboarding was implemented behave
#There really isn't anything groundbreaking we can show here to be honest

# One histogram per (metric, cohort) pair on a 3x2 grid: one row per metric,
# onboarded users on the left and not-onboarded users on the right.
metrics = ['Groups', 'Comments', 'Colleagues']
cohorts = [onboardedoncreation, notonboardedoncreation]

data = [cohort[metric] for metric in metrics for cohort in cohorts]
xaxes = [metric for metric in metrics for _ in cohorts]
yaxes = ['Frequency'] * len(data)
titles = [
    'Groups Joined Users Onboarded', 'Groups Joined Users Not Onboarded',
    'Comments  Users Onboarded', 'Comments  Users Not Onboarded',
    'Colleagues Added Users Onboarded', 'Colleagues Added Users Not Onboarded',
]

f, a = plt.subplots(3, 2)
a = a.ravel()  # flatten row by row so panels line up with the lists above
for ax, values, title, xlabel, ylabel in zip(a, data, titles, xaxes, yaxes):
    ax.hist(values)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
plt.tight_layout()
plt.show()

In [None]:
# Replace every missing value in the merged table with 0. This applies to all
# columns; a later cell filters on `users['Account.Created'] != 0`, which
# depends on missing creation dates having been filled with 0 here.
users = users.fillna(0)

In [None]:
def zubat(x):
    """Map any string value to the catch-all code 2; pass other values through.

    Applied to the ``Onboard`` column after 'passed' and 'true' have been
    replaced by 0 and 1, so any string still present gets the code 2.

    Args:
        x: A single cell value.

    Returns:
        2 if ``x`` is a string (including ``str`` subclasses), otherwise ``x``
        unchanged.
    """
    if isinstance(x, str):
        return 2
    return x
        


In [None]:
# Encode the Onboard flag numerically: 'passed' -> 0, 'true' -> 1, and any
# other remaining string -> 2 (via zubat). The assignments go through .loc on
# the frame itself: chained indexing (users['Onboard'][mask] = ...) may write
# to a temporary copy and leave `users` unchanged.
users.loc[users['Onboard'] == 'passed', 'Onboard'] = 0
users.loc[users['Onboard'] == 'true', 'Onboard'] = 1
users['Onboard'] = users['Onboard'].apply(zubat)

In [None]:
#Only 5416 people have been exposed to the onboarding process. Kind of throws off our numbers a bit

In [None]:
# Drop rows whose Account.Created is 0 (missing dates were filled with 0 by
# the earlier fillna). .copy() makes `users` an independent frame, so the
# columns added to it in later cells do not trigger SettingWithCopyWarning.
users = users[users['Account.Created'] != 0].copy()
users.describe()


In [None]:
def separator(x):
    """Flag whether a date value is on or after 2016-10-18.

    The check is a plain ``>=`` comparison against the string '2016-10-18'.

    Args:
        x: Value to compare (a YYYY-MM-DD date string in this notebook).

    Returns:
        1 when ``x >= '2016-10-18'``, otherwise 0.
    """
    return 1 if x >= '2016-10-18' else 0
    


In [None]:
# Indicator column: 1 when the account was created on or after 2016-10-18
# (see separator), else 0.
users['CreateAfterOnboarding'] = users['Account.Created'].apply(separator)

# Split by encoded Onboard value (0 = 'passed', 1 = 'true', 2 = any other string).
users0 = users[users['Onboard'] == 0]
users1 = users[users['Onboard'] == 1]
users2 = users[users['Onboard'] == 2]

from sklearn import linear_model

# Regress the Onboard code on whatever columns remain after dropping the
# identifier, the target itself, and the timestamp columns.
u0_x = users.drop(['User', 'Onboard', 'Onboard.Time', 'Last.Action', 'Account.Created'], axis = 1)
u0_y = users['Onboard']

clf0 = linear_model.LinearRegression()
clf0.fit(u0_x, u0_y)
print (clf0.coef_)

# NOTE: this is an alias, not a copy -- the interaction columns added below
# therefore also appear on `users`.
regression_frame = users


def interactioner(x):
    """Return ``x`` multiplied element-wise by the CreateAfterOnboarding indicator."""
    return x*regression_frame['CreateAfterOnboarding']


# Interaction terms against the creation-date indicator. The indicator column
# is 'CreateAfterOnboarding'; no 'CreateBeforeOnboarding' column exists on the
# frame, so looking that name up raises KeyError.
regression_frame['Onboard*Create'] = interactioner(regression_frame['Onboard'])
regression_frame['Comments*Create'] = interactioner(regression_frame['Comments'])
regression_frame['Colleagues*Create'] = interactioner(regression_frame['Colleagues'])
regression_frame['Groups*Create'] = interactioner(regression_frame['Groups'])

regression_frame.head()

In [None]:
# Users flagged as created on or after the cutoff. .copy() gives an
# independent frame, so the later cells that overwrite and add columns on
# `create_rf` do not operate on a slice of `users`.
create_rf = users[users['CreateAfterOnboarding'] == 1].copy()

create_rf_y_grp = create_rf['Groups']
create_rf_x_grp = create_rf['Onboard']


# scikit-learn expects a 2-D feature matrix. A pandas Series has no reshape(),
# so reshape the underlying array into a single column instead.
clf_grp = linear_model.LinearRegression()
clf_grp.fit(create_rf_x_grp.values.reshape(-1, 1), create_rf_y_grp)
print (clf_grp.coef_)

In [None]:
# Summary statistics for the users created on/after the cutoff.
create_rf.describe()

In [None]:
import statsmodels.api as sm_api
import statsmodels.formula.api as sm

# The array-based OLS class lives in statsmodels.api. The formula namespace
# (`sm`, still used by the following cells) only offers the lowercase formula
# interface such as sm.ols. No constant column is added here, so this fits
# Groups on Onboard without an intercept.
result = sm_api.OLS(create_rf_y_grp, create_rf_x_grp).fit()
result.summary()

In [None]:
# Fit one OLS model per activity metric with Onboard as a categorical
# regressor, then print each summary in the same order.
formulas = (
    'Groups ~ C(Onboard)',
    'Comments ~ C(Onboard)',
    'Colleagues ~ C(Onboard)',
)
results_1, results_2, results_3 = [
    sm.ols(formula, data=create_rf).fit() for formula in formulas
]

for fitted in (results_1, results_2, results_3):
    print(fitted.summary())

In [None]:
#Need to control for time

# Parse the YYYY-MM-DD creation-date strings into datetimes so that account
# age can be computed.
account_created = pd.to_datetime(create_rf['Account.Created'], format='%Y-%m-%d')
create_rf['Account.Created'] = account_created

In [None]:
# Elapsed time between account creation and the fixed reference date 2016-10-28
# (presumably the data-extraction date -- TODO confirm).
reference_date = pd.to_datetime('2016-10-28', format='%Y-%m-%d')
create_rf['Time'] = reference_date - create_rf['Account.Created']

In [None]:
# Express the elapsed time as an integer number of days.
one_day = np.timedelta64(1, 'D')
create_rf['Time'] = (create_rf['Time'] / one_day).astype(int)

In [None]:
#Seems like there just isn't enough variation in the observations to do anything

# Same three models as before, now also controlling for account age (Time, in days).
time_formulas = (
    'Groups ~ C(Onboard) + Time',
    'Comments ~ C(Onboard) + Time',
    'Colleagues ~ C(Onboard) + Time',
)
results_1, results_2, results_3 = [
    sm.ols(formula, data=create_rf).fit() for formula in time_formulas
]

for fitted in (results_1, results_2, results_3):
    print(fitted.summary())

The regression analysis says that there is no real statistical difference that onboarding makes in group, colleague, and comment activity.

It's not really that surprising. A better metric would be to see if they feel more comfortable with the page, and maybe how often they log on. But we can't match Google Analytics data anyway

In [None]:
#Seems like there just isn't enough variation in the observations to do anything

# NOTE(review): this cell re-fits exactly the same `+ Time` models as the
# earlier regression cell; the two are candidates for consolidation.
time_formulas = (
    'Groups ~ C(Onboard) + Time',
    'Comments ~ C(Onboard) + Time',
    'Colleagues ~ C(Onboard) + Time',
)
results_1, results_2, results_3 = [
    sm.ols(formula, data=create_rf).fit() for formula in time_formulas
]

for fitted in (results_1, results_2, results_3):
    print(fitted.summary())

In [None]:
# Summary statistics for create_rf, which by this point also holds the Time column.
create_rf.describe()

 There's one last thing that we can look at to gain more evidence of the Onboarding process being effective
 It's statistically a little more straight forward, and it's fun.

 Compare three groups of people: 
1. People who made their account one week before the onboarding process
2. People who made their account when onboarding was launched and went through with it
3. People who made their account when onboarding was launched and did not go through with it

By identifying the average first-week metrics for people who joined a week before onboarding, and then calculating the same averages for people who made their account after onboarding was introduced, we can identify (with some sort of error) whether or not we see a difference for people who have gone through onboarding.


In [None]:
# More cleaning first: to capture first-week effects for people who did and
# did not onboard, the per-user datasets have to be rebuilt for the "before"
# and "after" cohorts. Start with "before".
#
# Window: accounts created from 2016-10-12 up to, but not including, 2016-10-19
# (string comparison on the raw date text).
# NOTE(review): other cells treat `>= '2016-10-18'` as "after onboarding", so
# 2016-10-18 falls into both groups -- confirm the intended launch date.
created_from_start = ob['Account.Created'] >= '2016-10-12'
created_before_end = ob['Account.Created'] < '2016-10-19'
ob_b = ob[created_from_start & created_before_end]
ob_b.describe()

In [None]:

# Per-creation-date non-null counts of every column in `users`, with the date
# moved back from the index into a regular column and parsed to datetime.
counts_by_creation_date = users.groupby(users['Account.Created']).count()
datejoin = counts_by_creation_date.reset_index()
datejoin['Account.Created'] = pd.to_datetime(datejoin['Account.Created'], format = '%Y-%m-%d')




In [None]:
#This graph is incorrect, and gives a misleading conclusion, but I like the code in it so I'm keeping it

import matplotlib.dates as mdates

# Daily counts from 2015 onwards.
datejoin2015 = datejoin[datejoin['Account.Created'] >= '2015-01-01']

fig, ax = plt.subplots()
ax.plot(datejoin2015['Account.Created'], datejoin2015['User'])

# Major ticks per month (labelled YYYY-MM), minor ticks per day.
years = mdates.YearLocator()
months = mdates.MonthLocator()
days = mdates.DayLocator()
Fmt = mdates.DateFormatter('%Y-%m')

ax.xaxis.set_major_locator(months)
ax.xaxis.set_minor_locator(days)
ax.xaxis.set_major_formatter(Fmt)

plt.title('GCconnex Registrations per Day')
plt.xlabel('Date')
plt.ylabel('Registrations')

# The x axis holds datetimes, so the annotation gets datetime coordinates;
# bare date strings are not reliably converted to date-axis units.
plt.annotate('Onboarding and Career Connexions',
             xy=(pd.Timestamp('2016-10-19'), 100),
             xytext=(pd.Timestamp('2015-08-01'), 60),
             arrowprops = dict(facecolor='red', shrink = 0.05))

fig.autofmt_xdate()

#plt.show()


In [None]:
# Summary statistics for the `users` table as it stands at this point.
users.describe()

In [None]:
# Accounts created on 2016-10-12. The column is named 'Account.Created' (see
# onboardcolumns); looking up 'Account Created' raises KeyError.
ob_b = ob[ob['Account.Created'] == '2016-10-12']

# Per-user counts recomputed from the full comment, colleague and group tables.
# TODO: these counts are not yet restricted to the users in ob_b.
usercom = com.groupby('User', as_index = False).count()
usercol = col.groupby('User', as_index = False).count()
usergrp = grp.groupby('User', as_index = False).count()