In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import regex as re
import wranglerer as wr
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN

# Answers to questions 1-4 and 7
### NB: answers are results of quick exploratory investigation.  Given the time-sensitive nature of the requests, more refined answers to the nuanced aspects of each question will be forthcoming.  The following represent a starting point for reporting:
---
---
---

# But first, some housekeeping:

In [2]:
# Acquire the access logs and build two views of the same records:
#   df_time - default integer index; keeps the raw 'date' and 'time' columns
#             alongside the combined 'timestamp' string
#   df      - indexed by the 'timestamp' string, with 'date'/'time' dropped
# Note: 'timestamp' is a plain "date time" string; it is not parsed to datetime here.
column_names = ['date', 'time', 'page', 'user', 'cohort','ip']
UNKNOWN_COHORT = 777  # placeholder id given to log rows that have no cohort

df_time = pd.read_csv('anonymized-curriculum-access.txt',delimiter=' ',header=None)
df_time.columns = column_names

# Derive every column in one non-mutating step.  The previous
# `frame.cohort.fillna(..., inplace=True)` form mutated an intermediate Series,
# which warns on pandas 2.x and silently does nothing under Copy-on-Write.
df_time = df_time.assign(
    timestamp=df_time.date + ' ' + df_time.time,
    user=df_time.user.astype('str'),
    # fill first so the float column can be cast to int, then to a string label
    cohort=df_time.cohort.fillna(UNKNOWN_COHORT).astype('int').astype('str'),
)

# set_index/drop both return new frames, so df_time is left untouched.
df = df_time.set_index('timestamp').drop(columns=['date','time'])

---
# 1. Which lesson attracts most traffic across cohorts?
### Sorted lessons by the raw number of cohorts which have accessed the lesson.  Time-series analysis is forthcoming:

In [3]:
# Per-page tally of how many distinct users, IP addresses and cohorts requested it.
distinct_cols = ['user', 'ip', 'cohort']
page_unq_df = df.groupby('page')[distinct_cols].agg('nunique')

In [4]:
# Ten pages reached by the largest number of distinct cohorts.
page_unq_df.sort_values(by='cohort', ascending=False).iloc[:10]

Unnamed: 0_level_0,user,ip,cohort
page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
/,962,3366,48
search/search_index.json,701,1289,42
toc,656,1439,41
html-css,724,1205,41
javascript-i,725,1302,41
java-i,634,1057,41
appendix,615,855,40
spring,612,1035,40
mysql,592,960,39
java-iii,576,959,39


# html-css, javascript-i, java-i lessons have been accessed by 41 cohorts each

---
# 2. Is there a cohort that referred to a lesson more than other cohorts?
### Ranked cohorts by the number of unique pages each has accessed.  Also ranked cohort/page pairs by the share of a cohort's page hits that went to a particular page.  See below!

In [5]:
# For each cohort, the share of that cohort's page hits that went to each page,
# i.e. P(page | cohort): value_counts(normalize=True) normalises within each
# cohort group, so the shares sum to 1 per cohort.
# NOTE: the column keeps the name 'proba_cohort_given_page' because a later cell
# filters on it, but it does NOT hold P(cohort | page).
page_given_cohort = (
    df.groupby('cohort')
    .page.value_counts(normalize=True)
    .rename('proba_cohort_given_page')
    .reset_index()
)

# Per cohort: number of distinct pages visited ('page') and number of distinct
# share values ('proba_cohort_given_page').
page_tot_by_cohort = page_given_cohort.groupby('cohort').nunique()

In [6]:
# Ten cohorts that touched the largest number of distinct pages.
page_tot_by_cohort.sort_values(by='page', ascending=False).iloc[:10]

Unnamed: 0_level_0,page,proba_cohort_given_page
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1
28,1404,229
777,1112,207
22,544,148
34,533,137
14,486,86
25,421,129
59,420,148
55,418,133
23,406,151
17,383,56


# cohort 22 referenced 544 unique pages, cohort 34 referenced 533 unique pages and cohort 14 referenced 486 unique pages (cohort 28, with 1404, and 777, the placeholder for hits with no cohort, with 1112, rank above them).  Also, cohorts 9, 4, and 139 appear to have concentrated their page hits on particular lessons more than other cohorts, see below...

In [7]:
# Cohort/page pairs where the page takes at least 10% (but not all) of the
# cohort's hits, ignoring the landing page '/'; largest share first.
share = page_given_cohort.proba_cohort_given_page
is_notable = (share >= 0.1) & (share < 1) & (page_given_cohort.page != '/')
page_given_cohort.loc[is_notable].sort_values(by='proba_cohort_given_page', ascending=False)

Unnamed: 0,cohort,page,proba_cohort_given_page
14673,9,content/html-css,0.4
10038,4,mkdocs/search_index.json,0.25
10039,4,prework/databases,0.25
10040,4,prework/versioning/github,0.25
14675,9,content/html-css/gitbook/images/favicon.ico,0.2
14676,9,content/html-css/introduction.html,0.2
12532,6,javascript-ii/es6,0.138889
2190,139,javascript-i/introduction/operators,0.107057
2191,139,javascript-i/introduction/primitive-types,0.105861
2192,139,javascript-i/functions,0.102273


# ...for example, 10.7% of all page hits made by COHORT 139 (Oberon) were for "javascript-i/introduction/operators"

---
# 3. Are there students who hardly access the curriculum?
### Sorted student total page hits and filtered by total number of page hits as well as total UNIQUE page hits.

In [8]:
# Per-user activity summary: total page hits ('count') and distinct pages hit ('nunique').
hits_by_user = df.groupby(['user'])['page']
page_views = hits_by_user.agg(['count', 'nunique'])

In [14]:
# Users with fewer than 300 total hits that nevertheless span more than 100 distinct pages.
few_hits = page_views['count'].lt(300)
many_pages = page_views['nunique'].gt(100)
page_views[few_hits & many_pages]

Unnamed: 0_level_0,count,nunique
user,Unnamed: 1_level_1,Unnamed: 2_level_1
162,207,134
168,209,107
179,259,110
313,270,194
36,243,106
90,269,167


In [17]:
# All users ordered by number of distinct pages hit, broadest first
# (the users who hardly access the curriculum sit at the bottom of this listing).
page_views.sort_values(by='nunique', ascending=False)

Unnamed: 0_level_0,count,nunique
user,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7404,976
11,17913,871
248,5075,625
64,16347,437
581,6434,354
...,...,...
952,1,1
592,1,1
212,1,1
593,1,1
