In [4]:
import matplotlib.pyplot as plt
import os

import numpy as np
import pandas as pd
import seaborn as sns
from env import get_db_url

In [5]:
def get_log_data(path='anonymized-curriculum-access.txt'):
    '''
    Read the space-delimited curriculum access log into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'anonymized-curriculum-access.txt'
        Location of the log file. The default preserves the original
        behaviour of reading the file from the current working directory.

    Returns
    -------
    pandas.DataFrame
        Columns Timestamp, Resource, User_ID, Access_Type and IP_Address.
    '''
    # NOTE(review): only five names are supplied. When a row carries one more
    # field than there are names, pandas uses the leading field as the row
    # index. The saved notebook output shows exactly that: dates land in the
    # (unnamed) index and `Timestamp` holds only the time of day. Confirm that
    # this layout is intended before relying on `Timestamp` as a full datetime.
    column_names = ['Timestamp', 'Resource', 'User_ID', 'Access_Type', 'IP_Address']
    return pd.read_csv(path, sep=' ', header=None, names=column_names)
    

In [6]:
def prep_log_data(df):
    '''
    Aggregate an access-log frame into a daily count of requested pages.

    The dates are taken from a `date` column when the frame has one;
    otherwise the frame's index is used. The counted column is `path` when
    present, otherwise `Resource`. The input frame is left unmodified, so the
    function can be called repeatedly on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Access log with one row per request.

    Returns
    -------
    pandas.Series
        Number of non-null page requests per calendar day, indexed by a
        daily DatetimeIndex named `date`. Days without requests count as 0.

    Raises
    ------
    KeyError
        If the frame has neither a `path` nor a `Resource` column.
    '''
    # Pick whichever naming convention the incoming frame uses.
    path_col = 'path' if 'path' in df.columns else 'Resource'
    raw_dates = df['date'] if 'date' in df.columns else df.index

    # Build the datetime index separately instead of assigning back into `df`,
    # so the caller's frame is not altered.
    date_index = pd.DatetimeIndex(pd.to_datetime(raw_dates), name='date')
    hits = df[path_col].set_axis(date_index)

    return hits.resample('D').count()

In [7]:
# Load the raw access log and preview its first rows.
df = get_log_data()
df.head()

Unnamed: 0,Timestamp,Resource,User_ID,Access_Type,IP_Address
2018-01-26,09:55:03,/,1,8.0,97.105.19.61
2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [9]:
# (rows, columns) of the loaded log.
df.shape

(900223, 5)

In [10]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,User_ID,Access_Type
count,900223.0,847330.0
mean,458.825707,48.501049
std,249.296767,32.795482
min,1.0,1.0
25%,269.0,28.0
50%,475.0,33.0
75%,660.0,57.0
max,981.0,139.0


In [11]:
# Column dtypes and non-null counts, to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 900223 entries, 2018-01-26 to 2021-04-21
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Timestamp    900223 non-null  object 
 1   Resource     900222 non-null  object 
 2   User_ID      900223 non-null  int64  
 3   Access_Type  847330 non-null  float64
 4   IP_Address   900223 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 41.2+ MB


In [14]:
# Frequency of each Access_Type value.
# NOTE(review): the saved output for this cell is an array of distinct values,
# which is what `.unique()` returns rather than `.value_counts()` — the output
# looks stale; re-run the cell to refresh it.
df.Access_Type.value_counts()

array([  8.,  22.,  21.,   1.,  16.,  18.,  19.,  nan,  13.,  14.,  15.,
         7.,   4.,  12.,  17.,  23.,   2.,   9.,  11.,  24.,  25.,  26.,
         6.,  27.,  28.,  29.,  31.,  32.,  33.,  34.,  51.,  52.,  53.,
        55.,  56.,  57.,   5.,  58.,  59.,  61.,  62., 132., 134., 133.,
       135., 138., 137., 139.])

In [20]:
# Count the distinct resources that appear in the log.
df['Resource'].nunique()

2313

In [22]:
# Group the log by user. Displaying a GroupBy object only shows its repr;
# an aggregation has to be applied to see any data.
df_users = df.groupby('User_ID')
df_users

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x15cfe5cf0>

In [27]:
# Share of each user's requests that went to each resource,
# i.e. the relative frequency of Resource within every User_ID group.
resource_given_user = (
    df.groupby('User_ID')['Resource']
    .value_counts(normalize=True)
    .reset_index(name='resource_given_user')
)
resource_given_user


Unnamed: 0,User_ID,Resource,resource_given_user
0,1,/,0.132496
1,1,6-regression/1-overview,0.019854
2,1,classification/overview,0.015937
3,1,appendix,0.014587
4,1,spring,0.011750
...,...,...,...
116103,981,/,0.047619
116104,981,appendix/professional-development/t-block-resume,0.023810
116105,981,examples/bootstrap-grid/assets.zip,0.023810
116106,981,html-css/css-ii,0.023810


In [32]:
# Show the 100 (user, resource) pairs with the smallest within-user share.
resource_given_user.sort_values('resource_given_user').iloc[:100]

Unnamed: 0,User_ID,Resource,resource_given_user
3240,11,content/php_iv/classes-and-objects-ii/build-a-...,0.000056
3150,11,5-stats/2.8-feature-scaling,0.000056
3149,11,5-stats/2.5-distributions-and-testing,0.000056
3148,11,5-stats/2.4-power-analysis,0.000056
3147,11,5-stats/2.1-probability_and_event_interactions,0.000056
...,...,...,...
3230,11,content/php_i,0.000056
3231,11,content/php_i/gitbook/images/favicon.ico,0.000056
3232,11,content/php_ii,0.000056
3233,11,content/php_ii/functions-i/arguments.html,0.000056


In [30]:
# NOTE(review): the saved source of this cell was truncated to a stray `')`,
# which is not valid Python. The expression below is reconstructed from the
# saved output (a 50-row Series named `resource_given_user`, in ascending
# order) — confirm against the author's original intent.
resource_given_user.sort_values(by='resource_given_user')['resource_given_user'].head(50)

3240    0.000056
3150    0.000056
3149    0.000056
3148    0.000056
3147    0.000056
3146    0.000056
3145    0.000056
3144    0.000056
3143    0.000056
3142    0.000056
3141    0.000056
3140    0.000056
3139    0.000056
3138    0.000056
3137    0.000056
3136    0.000056
3151    0.000056
3152    0.000056
3153    0.000056
3154    0.000056
3170    0.000056
3169    0.000056
3168    0.000056
3167    0.000056
3166    0.000056
3165    0.000056
3164    0.000056
3134    0.000056
3163    0.000056
3161    0.000056
3160    0.000056
3159    0.000056
3158    0.000056
3157    0.000056
3156    0.000056
3155    0.000056
3162    0.000056
3133    0.000056
3132    0.000056
3131    0.000056
3110    0.000056
3109    0.000056
3108    0.000056
3107    0.000056
3106    0.000056
3105    0.000056
3104    0.000056
3111    0.000056
3103    0.000056
3101    0.000056
Name: resource_given_user, dtype: float64