In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools

import env

In [2]:
url = f'mysql+pymysql://{env.username}:{env.password}@{env.host}/curriculum_logs'
df = pd.read_sql('SELECT * FROM logs JOIN cohorts on logs.user_id = cohorts.id', url)

In [3]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73739 entries, 0 to 73738
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        73739 non-null  object 
 1   time        73739 non-null  object 
 2   path        73739 non-null  object 
 3   user_id     73739 non-null  int64  
 4   cohort_id   72405 non-null  float64
 5   ip          73739 non-null  object 
 6   id          73739 non-null  int64  
 7   name        73739 non-null  object 
 8   slack       73739 non-null  object 
 9   start_date  73739 non-null  object 
 10  end_date    73739 non-null  object 
 11  created_at  73739 non-null  object 
 12  updated_at  73739 non-null  object 
 13  deleted_at  0 non-null      object 
 14  program_id  73739 non-null  int64  
dtypes: float64(1), int64(3), object(11)
memory usage: 8.4+ MB


### Prepare

In [5]:
#change the dtypes to the correct types

In [6]:
df.user_id = df.user_id.astype('object')

In [7]:
df.cohort_id = df.cohort_id.astype('object')

In [8]:
df.id = df.id.astype('object')

In [9]:
df.program_id = df.program_id.astype('object')

1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [10]:
df.path.value_counts(ascending = False)

/                           5056
index.html                  1520
javascript-i                1486
spring                      1401
java-iii                    1237
                            ... 
Clustering_KMeans_1            1
8-timeseries/1-overview        1
10.10_Exercises                1
appendix/cli/more-topics       1
2-sql/database-design          1
Name: path, Length: 1442, dtype: int64

A: It looks like lesson jacascript-i is the most referenced lesson with 1486 requests

2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?


In [11]:
# I want to concatenate the time and date column
df['date'] = df['date'] + ' ' + df['time']

In [12]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26 09:55:03,09:55:03,/,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,2018-01-26 09:56:02,09:56:02,java-ii,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2,2018-01-26 09:56:05,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
3,2018-01-26 09:56:06,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
4,2018-01-26 09:56:24,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [13]:
df = df.drop(columns='time')

In [14]:
df.date = pd.to_datetime(df.date, utc=True)
df = df.set_index('date')

In [15]:
df.head()

Unnamed: 0_level_0,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-26 09:55:03+00:00,/,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2018-01-26 09:56:02+00:00,java-ii,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2018-01-26 09:56:05+00:00,java-ii/object-oriented-programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2018-01-26 09:56:06+00:00,slides/object_oriented_programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2018-01-26 09:56:24+00:00,javascript-i/conditionals,2,22.0,97.105.19.61,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [16]:
# now I need to do a little more prepping

In [17]:
df = df.drop(columns= 'deleted_at')

In [18]:
df.isna().sum()

path             0
user_id          0
cohort_id     1334
ip               0
id               0
name             0
slack            0
start_date       0
end_date         0
created_at       0
updated_at       0
program_id       0
dtype: int64

In [19]:
df.dropna(axis = 0,inplace = True)

In [20]:
df.head()

Unnamed: 0_level_0,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,program_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-26 09:55:03+00:00,/,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
2018-01-26 09:56:02+00:00,java-ii,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
2018-01-26 09:56:05+00:00,java-ii/object-oriented-programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
2018-01-26 09:56:06+00:00,slides/object_oriented_programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
2018-01-26 09:56:24+00:00,javascript-i/conditionals,2,22.0,97.105.19.61,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1


In [21]:
df.isna().sum()

path          0
user_id       0
cohort_id     0
ip            0
id            0
name          0
slack         0
start_date    0
end_date      0
created_at    0
updated_at    0
program_id    0
dtype: int64

In [22]:
request_path_and_params = df.path.str.split('/', expand=True)
request_path_and_params = request_path_and_params.drop(columns=[2,3,4,5,6])

In [27]:
df.shape

(72405, 12)

In [28]:
request_path_and_params.columns = ['request_path', 'request_params']
#df = df.drop(columns='path').join(request_path_and_params)
request_path_and_params.shape

(72405, 2)

In [24]:
result = pd.concat([df, request_path_and_params])

In [25]:
result.shape

(144810, 14)

QUESTION: Why is the row count doubled? I concatenated them horizontally

In [26]:
result.head()

Unnamed: 0_level_0,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,program_id,request_path,request_params
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-01-26 09:55:03+00:00,/,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1,,
2018-01-26 09:56:02+00:00,java-ii,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1,,
2018-01-26 09:56:05+00:00,java-ii/object-oriented-programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1,,
2018-01-26 09:56:06+00:00,slides/object_oriented_programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1,,
2018-01-26 09:56:24+00:00,javascript-i/conditionals,2,22.0,97.105.19.61,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1,,


In [None]:
result.isna().sum()

In [None]:
# result = result.dropna()
# result.shape

In [None]:
df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
result.rename(olumns={0: 'lesson', 1: 'path'}))

for these ip addresses with more than one start date any ip location other than 
97.105.19.61 which I determined was the Code up location may be anomalies, so let's look at just those other addresses