## Imports

Imports

In [2]:
import pandas as pd

## Reading

Here we read data from file to pandas...

In [3]:
# The log is tab-separated; column names are supplied here via `names`,
# and the first column is parsed into timestamps.
df = pd.read_csv(
    '../data/feed-views.log',
    sep='\t',
    names=['datetime', 'user'],
    parse_dates=['datetime'],
)

...and print the dataframe samples

In [4]:
df.head()

Unnamed: 0,datetime,user
0,2020-04-17 12:01:08.463179,artem
1,2020-04-17 12:01:23.743946,artem
2,2020-04-17 12:27:30.646665,artem
3,2020-04-17 12:35:44.884757,artem
4,2020-04-17 12:35:52.735016,artem


In [5]:
df.tail()

Unnamed: 0,datetime,user
1071,2020-05-21 18:45:20.441142,valentina
1072,2020-05-21 23:03:06.457819,maxim
1073,2020-05-21 23:23:49.995349,pavel
1074,2020-05-21 23:49:22.386789,artem
1075,2020-05-22 10:36:14.662600,artem


## Make new cols from datetime

Work with each column separately...

In [6]:
# Split the timestamp into calendar/clock components.
# The Series `.dt` accessor exposes these fields directly, so there is no need
# to build a fresh DatetimeIndex from the same column for every component, and
# the results stay aligned with the frame's index.
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute
df['second'] = df['datetime'].dt.second

...and print the dataframe

In [7]:
df

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52
...,...,...,...,...,...,...,...,...
1071,2020-05-21 18:45:20.441142,valentina,2020,5,21,18,45,20
1072,2020-05-21 23:03:06.457819,maxim,2020,5,21,23,3,6
1073,2020-05-21 23:23:49.995349,pavel,2020,5,21,23,23,49
1074,2020-05-21 23:49:22.386789,artem,2020,5,21,23,49,22


## Insert new daytime column

Do it with pd.cut...

In [9]:
# Hour-of-day buckets. pd.cut uses right-closed intervals by default, so the
# edges below mean: (-1, 3] night, (3, 6] early morning, (6, 10] morning,
# (10, 16] afternoon, (16, 19] early evening, (19, 24] evening.
# The first edge is -1 so that hour 0 falls into 'night'.
bins = [-1, 3, 6, 10, 16, 19, 24]
labels = [
    'night',
    'early morning',
    'morning',
    'afternoon',
    'early evening',
    'evening',
]
df['daytime'] = pd.cut(df['hour'], bins=bins, labels=labels)

...and print an arbitrary slice of rows as a spot check

In [12]:
# Positional slice: rows 20 through 39.
df.iloc[20:40]

Unnamed: 0,datetime,user,year,month,day,hour,minute,second,daytime
20,2020-04-18 12:49:16.684426,artem,2020,4,18,12,49,16,afternoon
21,2020-04-18 17:45:27.234821,artem,2020,4,18,17,45,27,early evening
22,2020-04-18 18:14:53.813657,konstantin,2020,4,18,18,14,53,early evening
23,2020-04-18 18:20:05.419381,konstantin,2020,4,18,18,20,5,early evening
24,2020-04-18 18:20:32.196892,konstantin,2020,4,18,18,20,32,early evening
25,2020-04-18 18:20:37.860671,konstantin,2020,4,18,18,20,37,early evening
26,2020-04-18 18:24:59.632526,konstantin,2020,4,18,18,24,59,early evening
27,2020-04-18 18:50:15.409435,konstantin,2020,4,18,18,50,15,early evening
28,2020-04-18 19:20:27.487265,artem,2020,4,18,19,20,27,early evening
29,2020-04-18 21:01:33.970629,konstantin,2020,4,18,21,1,33,evening


## Assign User as index
in df

In [13]:
# Move 'user' from a column into the index.
# Rebinding (instead of inplace=True) leaves previously displayed frames
# untouched, and the guard lets this cell be re-run safely: once 'user' is
# already the index, a second set_index('user') would raise KeyError.
if 'user' in df.columns:
    df = df.set_index('user')

and print df

In [14]:
df.head()

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon


In [15]:
df.tail()

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening
pavel,2020-05-21 23:23:49.995349,2020,5,21,23,23,49,evening
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening
artem,2020-05-22 10:36:14.662600,2020,5,22,10,36,14,morning


## Calculating stats by count and value_counts

Number of non-null values in each column of the dataframe:

In [20]:
df.count()

datetime    1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64

In [19]:
df.value_counts('daytime')

daytime
evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
dtype: int64

## Sort df by hour, min and sec
In ascending order, by all three columns simultaneously

In [21]:
# Order rows by time of day (hour, then minute, then second), ascending.
# Rebind the result rather than sorting in place, so the cell reads as a
# plain transformation and does not mutate the previously shown frame.
df = df.sort_values(by=['hour', 'minute', 'second'])

In [22]:
df

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-15 00:00:13.222265,2020,5,15,0,0,13,night
valentina,2020-05-15 00:01:05.153738,2020,5,15,0,1,5,night
pavel,2020-05-12 00:01:27.764025,2020,5,12,0,1,27,night
pavel,2020-05-12 00:01:38.444917,2020,5,12,0,1,38,night
pavel,2020-05-12 00:01:55.395042,2020,5,12,0,1,55,night
...,...,...,...,...,...,...,...,...
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening
anatoliy,2020-05-09 23:53:55.599821,2020,5,9,23,53,55,evening
pavel,2020-05-09 23:54:54.260791,2020,5,9,23,54,54,evening
valentina,2020-05-14 23:58:56.754866,2020,5,14,23,58,56,evening


## Using min / max / mode

Calculate the maximum of hour for the rows where the time of day is night

In [27]:
# Latest hour among rows labelled 'night'.
df.loc[df['daytime'] == 'night', 'hour'].max()

3

Calculate the minimum of hour for the rows where the time of day is morning

In [25]:
# Earliest hour among rows labelled 'morning'.
df.loc[df['daytime'] == 'morning', 'hour'].min()

8

Find out who visited the page at those hours (make one example from that)

In [33]:
# All rows whose hour equals the latest hour labelled 'night'.
df.loc[
    df['hour'] == df.loc[df['daytime'] == 'night', 'hour'].max()
]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
konstantin,2020-04-19 03:23:35.471598,2020,4,19,3,23,35,night
konstantin,2020-04-19 03:23:55.473926,2020,4,19,3,23,55,night
konstantin,2020-04-19 03:33:07.757714,2020,4,19,3,33,7,night


In [32]:
# All rows whose hour equals the earliest hour labelled 'morning'.
df.loc[
    df['hour'] == df.loc[df['daytime'] == 'morning', 'hour'].min()
]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alexander,2020-05-15 08:16:03.918402,2020,5,15,8,16,3,morning
alexander,2020-05-15 08:35:01.471463,2020,5,15,8,35,1,morning


Calculate the mode for the hour and daytime

In [39]:
df.hour.mode()

0    22
Name: hour, dtype: int64

In [40]:
df.daytime.mode()

0    evening
Name: daytime, dtype: category
Categories (6, object): ['night' < 'early morning' < 'morning' < 'afternoon' < 'early evening' < 'evening']

## Using nsmallest() and nlargest()

to show the 3 earliest hours in the morning and the corresponding usernames and the
3 latest hours and the usernames

In [41]:
# Three largest 'morning' hours; the index carries the usernames.
df.loc[df['daytime'] == 'morning', 'hour'].nlargest(3)

user
konstantin    10
maxim         10
maxim         10
Name: hour, dtype: int64

In [42]:
# Three smallest 'morning' hours; the index carries the usernames.
df.loc[df['daytime'] == 'morning', 'hour'].nsmallest(3)

user
alexander    8
alexander    8
alexander    9
Name: hour, dtype: int64

## Basic stats with describe()

In [45]:
descriptions = df.describe()

In [46]:
descriptions

Unnamed: 0,year,month,day,hour,minute,second
count,1076.0,1076.0,1076.0,1076.0,1076.0,1076.0
mean,2020.0,4.870818,13.552974,16.249071,29.629182,29.500929
std,0.0,0.335557,4.906567,6.95549,17.689388,17.405506
min,2020.0,4.0,1.0,0.0,0.0,0.0
25%,2020.0,5.0,11.0,13.0,14.0,14.0
50%,2020.0,5.0,13.0,19.0,29.0,30.0
75%,2020.0,5.0,15.0,22.0,46.0,45.0
max,2020.0,5.0,30.0,23.0,59.0,59.0


Calculate the interquartile range for the hour by extracting values from the result of the describe() method

In [47]:
# Interquartile range of `hour`: the 75% row minus the 25% row of the
# describe() summary table.
iqr = descriptions.loc['75%', 'hour'] - descriptions.loc['25%', 'hour']

In [49]:
iqr

9.0