# Exercise 01: Basic Operations

In [1]:
import pandas as pd

In [2]:
# 1. Build the `views` dataframe from the raw feed log.
# `names=` supplies the column headers, so the first line of the file is
# read as data rather than as a header row.
views = pd.read_csv(
    '../data/feed-views.log',
    names=['datetime', 'user'],
    engine='python',
)
# Parse the timestamp text into real datetimes so the .dt accessor is available
views['datetime'] = pd.to_datetime(views['datetime'])
# Add one column per calendar/clock component, in this exact order
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    views[part] = getattr(views['datetime'].dt, part)
views.head()

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-01 00:08:54,pavel,2020,4,1,0,8,54
1,2020-04-01 01:04:56,pavel,2020,4,1,1,4,56
2,2020-04-01 01:24:15,anastasia,2020,4,1,1,24,15
3,2020-04-01 04:00:03,valentina,2020,4,1,4,0,3
4,2020-04-01 10:21:42,valentina,2020,4,1,10,21,42


In [3]:
# 2. Create column daytime
# pd.cut bins are right-inclusive by default, so these edges map hours as:
#   0-3 night, 4-6 early morning, 7-10 morning,
#   11-16 afternoon, 17-19 early evening, 20-23 evening.
# The lowest edge is -1 (not 0) so that hour 0 lands inside the first bin.
labels = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']
bins = [-1, 3, 6, 10, 16, 19, 23]
views['daytime'] = pd.cut(views['hour'], bins=bins, labels=labels)
# Assign user as index.
# Guarded so the cell can be re-run safely: after the first run 'user' lives in
# the index, is no longer a column, and an unconditional set_index('user')
# would raise KeyError.
if 'user' in views.columns:
    views = views.set_index('user')
views.head()

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
pavel,2020-04-01 00:08:54,2020,4,1,0,8,54,night
pavel,2020-04-01 01:04:56,2020,4,1,1,4,56,night
anastasia,2020-04-01 01:24:15,2020,4,1,1,24,15,night
valentina,2020-04-01 04:00:03,2020,4,1,4,0,3,early morning
valentina,2020-04-01 10:21:42,2020,4,1,10,21,42,morning


In [4]:
# 3. Calculate number of elements
# Non-null count for every column of the frame
count_elements = views.count()
print("Count elements:", count_elements, sep="\n")

# Number of visits falling into each daytime category
daytime_counts = views['daytime'].value_counts()
print("\nDaytime counts:", daytime_counts, sep="\n")

Count elements:
datetime    500
year        500
month       500
day         500
hour        500
minute      500
second      500
daytime     500
dtype: int64

Daytime counts:
daytime
afternoon        122
night             90
morning           90
evening           81
early evening     64
early morning     53
Name: count, dtype: int64


In [5]:
# 4. Sort values
# Order visits by time of day only (hour, then minute, then second);
# the calendar date plays no part in the ordering.
views_sorted = views.sort_values(['hour', 'minute', 'second'], ascending=True)
views_sorted.head()

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
maxim,2020-05-23 00:04:34,2020,5,23,0,4,34,night
oksana,2020-05-09 00:05:29,2020,5,9,0,5,29,night
artem,2020-05-24 00:06:04,2020,5,24,0,6,4,night
anastasia,2020-04-28 00:06:09,2020,4,28,0,6,9,night
pavel,2020-04-01 00:08:54,2020,4,1,0,8,54,night


In [6]:
# 5. Stats
# Boolean masks for the two daytime categories of interest, built once and reused
is_night = views['daytime'] == 'night'
is_morning = views['daytime'] == 'morning'

# Largest hour labelled night / smallest hour labelled morning
max_hour_night = views.loc[is_night, 'hour'].max()
min_hour_morning = views.loc[is_morning, 'hour'].min()
print(f"Max hour night: {max_hour_night}")
print(f"Min hour morning: {min_hour_morning}")

# Who visited: one example per boundary hour, namely the index label of the
# first matching row in frame order
night_visits = views.loc[is_night & (views['hour'] == max_hour_night)]
morning_visits = views.loc[is_morning & (views['hour'] == min_hour_morning)]
print(f"User at max hour night: {night_visits.index[0]}")
print(f"User at min hour morning: {morning_visits.index[0]}")

# Modes: mode() returns every tied value, so take the first one
mode_hour = views['hour'].mode().iloc[0]
mode_daytime = views['daytime'].mode().iloc[0]
print(f"Mode hour: {mode_hour}")
print(f"Mode daytime: {mode_daytime}")

Max hour night: 3
Min hour morning: 7
User at max hour night: artem
User at min hour morning: valentina
Mode hour: 8
Mode daytime: afternoon


In [7]:
# 6. Earliest and latest hours
# Same report twice: a header line, then the `hour` column of the three rows
# picked by nsmallest / nlargest respectively.
for header, pick_rows in (
    ("Earliest hours:", views.nsmallest),
    ("Latest hours:", views.nlargest),
):
    print(header)
    print(pick_rows(3, 'hour')['hour'])

Earliest hours:
user
pavel        0
valentina    0
anastasia    0
Name: hour, dtype: int32
Latest hours:
user
anastasia    23
oksana       23
maxim        23
Name: hour, dtype: int32


In [None]:
# 7. Describe and IQR
# Summary statistics for the frame; the quartiles of `hour` are read from it
desc = views.describe()
hour_stats = desc['hour']
# Interquartile range = third quartile minus first quartile
iqr = hour_stats['75%'] - hour_stats['25%']
print(f"IQR: {iqr}")