# KA User Survey Analysis
_Ravi Dayabhai_ (**@ravi**)

In [189]:
# Import dependencies
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline

# Show all columns
pd.set_option('display.max_columns', None)

## Entrance Survey

### Data Exploration

First, getting a sense of the data.

In [190]:
# Load the raw entrance-survey export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
entrance_csv = "KA Entrance Survey Rolling (June 2017).csv"
rolling_entrance = pd.read_csv(entrance_csv, low_memory=False)

# Inspect the dtype pandas inferred for every column
rolling_entrance.dtypes

Response ID                                                                                       int64
Date                                                                                             object
Identity                                                                                         object
IP Address                                                                                       object
Page                                                                                             object
Referrer                                                                                        float64
User Agent                                                                                       object
bibliotron_new_design                                                                            object
window_width                                                                                    float64
window_height                                                   

In [191]:
# (number of rows, number of columns) of the freshly loaded survey export
rolling_entrance.shape

(92741, 26)

Making sure the primary key consists of unique values.

In [192]:
# Primary-key sanity check: every "Response ID" value must occur exactly once.
response_ids = rolling_entrance["Response ID"]
uniqueness_message = (
    "All values in column are unique!"
    if response_ids.is_unique
    else "There are duplicate or missing data in column."
)
print(uniqueness_message)

All values in column are unique!


In [193]:
# Use "Response ID" (checked for uniqueness in the cell above) as the row index.
# Reassigning the result, rather than mutating with inplace=True, keeps the
# step explicit about which frame it produces.
rolling_entrance = rolling_entrance.set_index("Response ID")

# Preview the first rows only instead of rendering the whole frame
rolling_entrance.head(10)

Unnamed: 0_level_0,Date,Identity,IP Address,Page,Referrer,User Agent,bibliotron_new_design,window_width,window_height,eligible_for_nps_survey,eligible_for_sat_intent_survey_1,eligible_for_sat_satisfaction_survey,eligible_for_unregistered_user_goal_survey,eligible_for_sat_intent_survey_2,eligible_for_registered_user_goal_survey,view_project_failure,bibliotron_homepage,article_satisfaction_survey,view_project_completed,submitted_project,submitted_project_eval,Q1: Would you tell a friend to use Khan Academy?,Q2: I am a ....,Q3: Last question! Are you using Khan Academy because a teacher/tutor is assigning you work?,Q3. Other explain
Response ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
74035042,2016-02-09 18:55:44 UTC,_gae_bingo_random:kxJYqic_Vkt8td7Q9Kw0d1-0xVmo...,146.166.7.33,https://www.khanacademy.org/math/arithmetic/mu...,,Mozilla/5.0 (compatible; MSIE 10.0; Windows NT...,,,,True,False,False,False,False,,,,,,,,No,,,
74035064,2016-02-09 18:55:14 UTC,_gae_bingo_random:A058otzFdNLfeEpK0IElWMO08YPu...,205.154.155.232,https://www.khanacademy.org/math/cc-fourth-gra...,,Mozilla/5.0 (X11; CrOS x86_64 7647.73.0) Apple...,,,,False,False,False,False,False,,,,,,,,Yes!,,,
74035065,2016-02-09 18:57:56 UTC,_gae_bingo_random:c-Y8xQZKoB2M4hRQPqFVQbKlo6JU...,208.108.119.41,https://www.khanacademy.org/mission/early-math,,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,,,,False,False,False,False,False,,,,,,,,Yes!,,,
74035066,2016-02-09 18:59:49 UTC,_gae_bingo_random:0B9kdV_0d_GIotkGjI4ISLSNqNIj...,169.204.230.154,https://www.khanacademy.org/mission/cc-seventh...,,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,,,,False,False,False,False,False,,,,,,,,Yes!,,,
74035384,2016-02-09 18:53:30 UTC,_gae_bingo_random:VLoXs-i3_uXAm_aSvgrai0tvOOjC...,12.145.227.242,https://www.khanacademy.org/computing/computer...,,Mozilla/5.0 (X11; CrOS x86_64 7647.73.0) Apple...,,,,True,False,False,False,False,,,,,,,,Yes!,,,
74035494,2016-02-09 18:54:17 UTC,_gae_bingo_random:_jlW0BQPIJlmCjbRwyebymj9X3gs...,98.114.22.147,https://www.khanacademy.org/mission/arithmetic,,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,,,,False,False,False,False,False,,,,,,,,Yes!,,,
74035549,2016-02-09 18:51:31 UTC,_gae_bingo_random:HWRXyMKRaBo5M6Vi3aT58Ayhu9Gr...,162.247.149.75,https://www.khanacademy.org/computing/hour-of-...,,Mozilla/5.0 (X11; CrOS x86_64 7647.73.0) Apple...,,,,False,False,False,False,False,,,,,,,,Yes!,,,
74035662,2016-02-09 18:57:25 UTC,_gae_bingo_random:83PnFwK83yqR3TvYgJS-eXxDN4LX...,204.38.47.132,https://www.khanacademy.org/mission/early-math...,,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,,,,False,False,False,False,False,,,,,,,,Yes!,,,
74035865,2016-02-09 18:54:34 UTC,_gae_bingo_random:_t9I3qAcRnj4yJckyibpNEm0fUXR...,104.245.111.66,https://www.khanacademy.org/mission/early-math...,,Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537....,,,,False,False,False,False,False,,,,,,,,Maybe / not sure,,,
74036314,2016-02-09 18:50:28 UTC,_gae_bingo_random:PC7nqu6kGoNS2wxUra7zNJ3WsVfL...,50.205.172.65,https://www.khanacademy.org/mission/cc-seventh...,,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,,,,True,False,False,False,False,,,,,,,,No,,,


Then, checking the nulls in each column.

In [194]:
# Number of missing values in each column
null_counts = rolling_entrance.isnull().sum()
null_counts

Date                                                                                                0
Identity                                                                                            0
IP Address                                                                                          0
Page                                                                                                0
Referrer                                                                                        92741
User Agent                                                                                          0
bibliotron_new_design                                                                           67284
window_width                                                                                    67206
window_height                                                                                   67206
eligible_for_nps_survey                                                           

In [195]:
# The three survey questions whose answers matter for this analysis
cols_of_interest = [
    "Q1: Would you tell a friend to use Khan Academy?",
    "Q2: I am a ....",
    "Q3: Last question! Are you using Khan Academy because a teacher/tutor is assigning you work?",
]

# Count the rows in which every one of those questions was left unanswered
all_questions_blank = rolling_entrance[cols_of_interest].isnull().all(axis=1)
all_questions_blank.sum()

0

From the above, it's clear that there are no rows where the responses to all of the survey questions are null. I'm not sure, however, why 140 rows have no response for "Q1." Given that this is the primary question we're interested in, any responses with null values for this question are dropped.

In [196]:
# Keep only the responses that actually answered Q1, the primary question
q1_column = "Q1: Would you tell a friend to use Khan Academy?"
answered_q1 = rolling_entrance[q1_column].notnull()
rolling_entrance = rolling_entrance[answered_q1]

Next, converting the "Date" column to a DateTime type.

In [197]:
# Ignore 'SettingWithCopyWarning'
# NOTE(review): this silences the warning for the whole session. The assignment
# below does not need it; it is kept only because later cells may rely on it.
# Confirm and remove if possible.
pd.options.mode.chained_assignment = None

# Parse the "Date" strings into timestamps. `assign` returns a new frame, so the
# column dtype really changes; an in-place `.loc[:, "Date"] = ...` write keeps the
# existing object dtype on recent pandas versions (>= 2.0).
rolling_entrance = rolling_entrance.assign(Date=pd.to_datetime(rolling_entrance["Date"]))
rolling_entrance.dtypes

Date                                                                                            datetime64[ns]
Identity                                                                                                object
IP Address                                                                                              object
Page                                                                                                    object
Referrer                                                                                               float64
User Agent                                                                                              object
bibliotron_new_design                                                                                   object
window_width                                                                                           float64
window_height                                                                                          float64
e

Looking at the range of dates.

In [198]:
# Summary of the response timestamps (count, distinct values, earliest/latest)
rolling_entrance["Date"].describe()

count                   92601
unique                  91510
top       2017-04-28 15:49:52
freq                        4
first     2016-02-09 18:50:28
last      2017-06-13 08:53:42
Name: Date, dtype: object

In [201]:
# Monthly response counts.
# "Response ID" is the row index at this point, so restore it as a regular column
# to make it countable. "Date" is a column, not an index level, so it cannot be
# reset. The guard keeps this cell safe to re-run.
# NOTE(review): later cells may expect either layout of rolling_entrance; confirm.
if rolling_entrance.index.name == "Response ID":
    rolling_entrance = rolling_entrance.reset_index()

# Index by timestamp so the rows can be bucketed by calendar period
date_df = rolling_entrance.set_index("Date")

# pd.Grouper supersedes pd.TimeGrouper (deprecated, removed in pandas 1.0);
# freq='M' groups rows into month-end buckets.
date_df[["Response ID"]].groupby(pd.Grouper(freq="M")).count()

Unnamed: 0_level_0,Response ID
Date,Unnamed: 1_level_1
2016-02-29,26333
2016-03-31,17957
2016-04-30,1
2016-05-31,0
2016-06-30,0
2016-07-31,0
2016-08-31,0
2016-09-30,0
2016-10-31,0
2016-11-30,0
