In [1]:
import warnings

# Disable warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning
# category (including deprecation warnings raised by libraries) is hidden for
# the rest of the session — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from rectools import Columns
import requests
import zipfile as zf
from pathlib import Path
import os

In [3]:
# Output directory for every CSV written in the "Saving prepared data" section
DATA_SAVE_PATH = Path('../../data/03_primary')

## Data loading

In [4]:
# The URL embeds a commit hash, so it always refers to the same archive revision
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# Stream the archive to disk in 1 MiB chunks. The response, the output file and
# the progress bar are all context-managed, so the connection is released and
# the bar is finalised even when the download fails part-way.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on an HTTP error instead of saving an error page as kion.zip
    req.raise_for_status()
    # Content-Length may be absent; tqdm then shows a bar without a total
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:
# Unpack the archive into the current working directory. The context manager
# closes the zip handle even if extraction raises.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()

In [6]:
# Load the three raw tables extracted from the archive
users, items, interactions = (
    pd.read_csv('data_original/users.csv'),
    pd.read_csv('data_original/items.csv'),
    pd.read_csv('data_original/interactions.csv'),
)

### Preparation of interactions dataframe

In [7]:
# Align the interaction column names with the rectools style
rectools_column_names = {'last_watch_dt': 'datetime', 'watched_pct': 'weight'}
interactions = interactions.rename(columns=rectools_column_names)

In [8]:
# Discard every row whose date string is not exactly 10 characters long,
# then parse the remaining strings (YYYY-MM-DD) into pandas datetimes
malformed_date = interactions[Columns.Datetime].str.len() != 10
interactions = interactions.loc[~malformed_date].copy()
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [9]:
# Inspect the time span covered by the dataset
interaction_dates = interactions[Columns.Datetime]
min_date, max_date = interaction_dates.min(), interaction_dates.max()
print((min_date, max_date))

(Timestamp('2021-03-13 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [10]:
# Upper bounds (exclusive of the next score) separating scores 1|2|3|4|5
_SCORE_THRESHOLDS = np.array([20., 40., 60., 80.])


def get_score(weight: float) -> int:
    """Convert a watched percentage into a 1-5 score.

    Thresholds are strict: a value must be *greater than* 20/40/60/80 to earn
    2/3/4/5. Everything else, including NaN (all comparisons are False),
    gets a score of 1.

    Args:
        weight: Watched percentage of a single interaction.

    Returns:
        Integer score between 1 and 5.
    """
    if weight > 80.:
        return 5
    if weight > 60.:
        return 4
    if weight > 40.:
        return 3
    if weight > 20.:
        return 2
    return 1


def vec_get_score(weights):
    """Array version of ``get_score``.

    Uses a single ``np.searchsorted`` call instead of ``np.vectorize``, which
    would invoke the Python function once per element.

    Args:
        weights: Scalar or array-like of watched percentages.

    Returns:
        NumPy integer array of scores (1-5) with the same shape as the input.
    """
    values = np.asarray(weights, dtype=float)
    # side='left' counts thresholds strictly below the value, which mirrors
    # the strict ">" comparisons of get_score
    scores = np.searchsorted(_SCORE_THRESHOLDS, values, side='left') + 1
    # searchsorted places NaN after every threshold (score 5); get_score
    # scores NaN as 1, so restore that explicitly
    return np.where(np.isnan(values), 1, scores)

In [11]:
# Replace the raw watched percentage with its 1-5 score.
# NOTE: not idempotent — running this cell a second time scores the scores.
watched_pct = interactions[Columns.Weight]
interactions[Columns.Weight] = vec_get_score(watched_pct)


## Adding artificial users

In [12]:
# Attach the item descriptions to every interaction (join on the item id)
itr_with_desc = pd.merge(interactions, items, on=Columns.Item)

### Case 1: Parent of a preschool child / primary school student

In [13]:
# Top 20 most viewed items with age rating 0 (infants)
is_rated_0 = itr_with_desc['age_rating'] == 0
itr_with_desc.loc[is_rated_0, Columns.Item].value_counts().head(20)

12743    6550
4718     6148
13159    5986
741      5128
2956     4565
9506     4297
7310     4284
9164     4066
15078    4026
6774     4010
12974    4000
10119    3934
8254     3575
14120    3575
11654    3477
1978     3451
12225    3327
565      3249
12837    2990
10876    2960
Name: item_id, dtype: int64

In [14]:
# Top 20 most viewed items with age rating 6 (kids)
is_rated_6 = itr_with_desc['age_rating'] == 6
itr_with_desc.loc[is_rated_6, Columns.Item].value_counts().head(20)

8636     35631
7571     28372
13018    14568
16166    13995
3784     13744
3182     11851
11310     9759
5411      8158
13915     7526
7582      7081
10761     6877
11749     6341
11756     5916
12537     5687
13243     5685
15266     5674
12965     5305
2954      5029
11985     4860
1554      4800
Name: item_id, dtype: int64

In [15]:
# Pick 5 items from the two top lists above (age ratings 0 and 6) to build an
# artificial user: the parent of a young child
parent_items_ids = [12743, 7310, 7571, 7582, 11985]

In [16]:
# Let's check what was chosen: one description row per selected item
# P.S.: ordinary films for kids, nothing special
items_by_id = itr_with_desc.drop_duplicates(Columns.Item).set_index(Columns.Item)
items_by_id.loc[parent_items_ids]

Unnamed: 0_level_0,user_id,datetime,total_dur,weight,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
12743,818936,2021-05-11,2477,3,film,Ледниковый период 4: Континентальный дрейф,Ice Age: Continental Drift,2012.0,"мультфильм, приключения, комедии",США,,0.0,,"Стив Мартино, Майк Тёрмайер","Рэй Романо, Джон Легуизамо, Куин Латифа, Дэнис...",После приключений под землей прошло семь лет. ...,"тюлень (животное), доисторические времена, таю..."
7310,413440,2021-07-23,1,1,film,Гадкий я 2,Despicable Me 2,2013.0,"мультфильм, приключения, фантастика, фэнтези, ...","США, Франция, Япония",,0.0,,"Пьер Коффан, Крис Рено","Стив Карелл, Кристен Уиг, Бенджамин Брэтт, Мир...","В то время как Грю, бывший суперзлодей, приспо...","отношения родитель-ребенок, секретный агент, п..."
7571,988709,2021-07-07,6558,5,film,100% волк,100% Wolf,2020.0,"мультфильм, приключения, семейное, фэнтези, ко...","Австралия, Бельгия",,6.0,,Алекс Стадерманн,"Илай Суинделлс, Самара Уивинг, Джай Кортни, Ру...",Наследник семьи оборотней Фредди Люпин отчаянн...,"пудель, подростковая тревога, оборотень, приня..."
7582,190790,2021-06-23,14808,5,film,Холодное сердце II,Frozen II,2019.0,"фэнтези, мультфильм, музыкальные",США,,6.0,,"Крис Бак, Дженнифер Ли","Идина Мензел, Кристен Белл, Джонатан Грофф, Дж...","Анна, Эльза, Кристоф, его верный олень Свен и ...","королева, магия, королевство, плотина, дух, же..."
11985,309469,2021-06-28,808,1,film,История игрушек 4,Toy Story 4,2019.0,"мультфильм, фэнтези, комедии",США,,6.0,,Джош Кули,"Том Хэнкс, Тим Аллен, Энни Поттс, Тони Хейл, К...","Космический рейнджер Баз Лайтер, ковбой Вуди, ...","игрушка, дружба, ковбой, история игрушек 4, , ..."


In [17]:
# Take the largest user id found in interactions and add 2, so the new id is
# greater than every id already in that frame.
# NOTE(review): uniqueness is only guaranteed against `interactions`; ids that
# appear solely in `users` are not checked — confirm.
parent_id = interactions[Columns.User].max() + 2

In [18]:
# Add the artificial user to the users table.
# DataFrame.append was removed in pandas 2.0, so the one-row frame is
# concatenated instead; ignore_index keeps the row labels unique, as before.
parent_profile = pd.DataFrame([{
    Columns.User: parent_id,
    'age': 'age_25_34',
    'income': 'income_60_90',
    'sex': 'Ж',
    'kids_flg': 1
}])
users = pd.concat([users, parent_profile], ignore_index=True)

In [19]:
# And her interactions: one (watch date, score) pair per entry of
# parent_items_ids, in the same order
parent_history = [
    ('2021-06-12', 5),
    ('2021-06-15', 4),
    ('2021-06-24', 3),
    ('2021-07-01', 5),
    ('2021-08-02', 5),
]
# Scalars (user id, total_dur) are broadcast to every row; the constructor
# raises if the item list and the history differ in length
parent_rows = pd.DataFrame({
    Columns.User: parent_id,
    Columns.Item: parent_items_ids,
    Columns.Datetime: pd.to_datetime([watched_on for watched_on, _ in parent_history]),
    'total_dur': 0,
    Columns.Weight: [score for _, score in parent_history],
})
# DataFrame.append was removed in pandas 2.0 -> use pd.concat. ignore_index
# avoids the duplicated row labels (0..4) that a plain append would introduce.
interactions = pd.concat([interactions, parent_rows], ignore_index=True)

### Case 2: A fan of superheroics

In [20]:
# Item ids for the superhero-fan user; populated in the next cell
fan_items_ids = []

In [21]:
# Five Marvel movies, listed in the order of their release
fan_titles = [
    'Мстители',
    'Первый мститель: Другая война',
    'Доктор Стрэндж',
    'Человек-паук: Возвращение домой',
    'Тор: Рагнарёк',
]
# For each title take the item id of its first matching interaction row.
# The list is rebuilt (not appended to), so re-running this cell cannot
# accumulate duplicate ids.
fan_items_ids = [
    itr_with_desc.loc[itr_with_desc['title'] == title, Columns.Item].iloc[0]
    for title in fan_titles
]

In [22]:
# Take the largest user id found in interactions and add 2, so the new id is
# greater than every id already in that frame.
# NOTE(review): uniqueness is only guaranteed against `interactions`; ids that
# appear solely in `users` are not checked — confirm.
fan_id = interactions[Columns.User].max() + 2

In [23]:
# Add the artificial user to the users table.
# DataFrame.append was removed in pandas 2.0, so the one-row frame is
# concatenated instead; ignore_index keeps the row labels unique, as before.
fan_profile = pd.DataFrame([{
    Columns.User: fan_id,
    'age': 'age_18_24',
    'income': 'income_20_40',
    'sex': 'М',
    'kids_flg': 0
}])
users = pd.concat([users, fan_profile], ignore_index=True)

In [24]:
# And his interactions: one (watch date, score) pair per entry of
# fan_items_ids, in the same order
fan_history = [
    ('2021-06-20', 4),
    ('2021-06-27', 2),
    ('2021-07-08', 5),
    ('2021-07-20', 5),
    ('2021-08-01', 5),
]
# Scalars (user id, total_dur) are broadcast to every row; the constructor
# raises if the item list and the history differ in length
fan_rows = pd.DataFrame({
    Columns.User: fan_id,
    Columns.Item: fan_items_ids,
    Columns.Datetime: pd.to_datetime([watched_on for watched_on, _ in fan_history]),
    'total_dur': 0,
    'weight': [score for _, score in fan_history],
})
# DataFrame.append was removed in pandas 2.0 -> use pd.concat. ignore_index
# avoids the duplicated row labels (0..4) that a plain append would introduce.
interactions = pd.concat([interactions, fan_rows], ignore_index=True)

### Case 3: Born in the USSR

In [25]:
# Let's choose from the most viewed Lenfilm films (top 20)
is_lenfilm = itr_with_desc['studios'] == 'Ленфильм'
itr_with_desc.loc[is_lenfilm, Columns.Item].value_counts().head(20)

1012     34
6843     23
14471    18
8016     16
15899    16
15242    16
6996     14
4328     13
13724    13
5393     13
8003     13
1202     12
2291     10
8248     10
12962     9
14939     9
2211      9
9512      8
8189      8
5659      7
Name: item_id, dtype: int64

In [26]:
# Five Lenfilm items picked from the top list above for the third artificial user
old_man_items_ids = [1012, 14471, 15899, 2291, 5659]

In [27]:
# Let's check what was chosen: one description row per selected item
# P.S.: typical films for an old Soviet grandfather
items_by_id = itr_with_desc.drop_duplicates(Columns.Item).set_index(Columns.Item)
items_by_id.loc[old_man_items_ids]

Unnamed: 0_level_0,user_id,datetime,total_dur,weight,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1012,865583,2021-04-14,6020,5,film,Свадьба в Малиновке,,1967.0,"музыкальные, советские, мелодрамы, семейное, к...",СССР,,12.0,Ленфильм,Андрей Тутышкин,"Александр Орлов, Алексей Смирнов, Андрей Абрик...",Великолепная экранизация оперетты Б.Александро...,"Свадьба, Малиновке, 1967, СССР, свадьбы, любов..."
14471,58766,2021-07-08,140,1,film,Афганский излом,,1991.0,"драмы, советские, военные",СССР,,16.0,Ленфильм,Владимир Бортко,"Алексей Серебряков, Алессандро Стефанелли, Арт...",Наконец-то мечта тысяч поклонников сериала «Сп...,"Афганский, излом, 1991, СССР, армия, преступле..."
15899,305747,2021-08-20,6993,5,film,На войне как на войне,,1968.0,"исторические, советские, военные",СССР,,12.0,Ленфильм,Виктор Трегубович,"Валентин Зубков, Виктор Павлов, Михаил Глузски...","Михаил Кононов и Олег Борисов в фильме, посвящ...","войне, как, войне, 1968, СССР, армия, Великая,..."
2291,177183,2021-08-15,2387,3,film,Укротительница тигров,,1954.0,"советские, комедии, мелодрамы",СССР,,0.0,Ленфильм,"Александр Ивановский, Надежда Кошеверова","Леонид Быков, Людмила Касаткина, Павел Кадочни...",Людмила Касаткина и Павел Кадочников в культов...,"Укротительница, тигров, 1954, СССР, животные, ..."
5659,682190,2021-04-05,3910,5,film,Планета бурь,,1961.0,"фантастика, семейное, советские",СССР,,12.0,Ленфильм,Павел Клушанцев,"Владимир Емельянов, Геннадий Вернов, Георгий Ж...",Научно-фантастический фильм о первой междунаро...,"Планета, бурь, 1961, СССР, выживание, диких, у..."


In [28]:
# Take the largest user id found in interactions and add 2, so the new id is
# greater than every id already in that frame.
# NOTE(review): uniqueness is only guaranteed against `interactions`; ids that
# appear solely in `users` are not checked — confirm.
old_man_id = interactions[Columns.User].max() + 2

In [29]:
# Add the artificial user to the users table.
# DataFrame.append was removed in pandas 2.0, so the one-row frame is
# concatenated instead; ignore_index keeps the row labels unique, as before.
old_man_profile = pd.DataFrame([{
    Columns.User: old_man_id,
    'age': 'age_65_inf',
    'income': 'income_20_40',
    'sex': 'М',
    'kids_flg': 0
}])
users = pd.concat([users, old_man_profile], ignore_index=True)

In [30]:
# And his interactions: one (watch date, score) pair per entry of
# old_man_items_ids, in the same order
old_man_history = [
    ('2021-06-12', 5),
    ('2021-06-15', 5),
    ('2021-06-24', 5),
    ('2021-07-01', 5),
    ('2021-08-02', 4),
]
# Scalars (user id, total_dur) are broadcast to every row; the constructor
# raises if the item list and the history differ in length
old_man_rows = pd.DataFrame({
    Columns.User: old_man_id,
    Columns.Item: old_man_items_ids,
    Columns.Datetime: pd.to_datetime([watched_on for watched_on, _ in old_man_history]),
    'total_dur': 0,
    'weight': [score for _, score in old_man_history],
})
# DataFrame.append was removed in pandas 2.0 -> use pd.concat. ignore_index
# avoids the duplicated row labels (0..4) that a plain append would introduce.
interactions = pd.concat([interactions, old_man_rows], ignore_index=True)

In [31]:
# Ids of the three artificial users created above, in creation order
artificial_users_ids = [parent_id, fan_id, old_man_id]

In [32]:
# Display the generated ids for reference
artificial_users_ids

[1097559, 1097561, 1097563]

### Preparation of users dataframe

In [33]:
# Keep only the users that still appear in interactions after preprocessing
has_interactions = users[Columns.User].isin(interactions[Columns.User])
users = users.loc[has_interactions]

In [34]:
# Reshape sex, age group and income group into long (id, value, feature) rows:
# one frame per feature, stacked on top of each other
user_features_frames = [
    users.reindex(columns=[Columns.User, feature])
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ("sex", "age", "income")
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## Saving prepared data

In [35]:
# to_csv does not create missing folders, so make sure the target directory
# exists before the first file is written
DATA_SAVE_PATH.mkdir(parents=True, exist_ok=True)

# Persist the prepared interactions and users (row labels are not written)
interactions.to_csv(DATA_SAVE_PATH / 'au_interactions_full.csv', index=False)
users.to_csv(DATA_SAVE_PATH / 'au_users_full.csv', index=False)

In [36]:
# Persist the long-format user features (row labels are not written)
featured_users_path = DATA_SAVE_PATH / 'au_featured_users_full.csv'
user_features.to_csv(featured_users_path, index=False)

In [37]:
# Persist the item descriptions without the columns listed below
dropped_item_columns = [
    'title_orig',
    'for_kids',
    'studios',
    'directors',
    'actors',
    'description',
    'keywords',
]
items_cleared = items.drop(columns=dropped_item_columns)
items_cleared.to_csv(DATA_SAVE_PATH / 'items_descriptions_cleared.csv', index=False)

## Deleting unnecessary files

In [38]:
# Delete the downloaded archive
os.remove('kion.zip')

# Delete the extracted CSV files, then the folder itself (rmdir requires it to be empty)
for extracted_csv in (
    'data_original/interactions.csv',
    'data_original/items.csv',
    'data_original/users.csv',
):
    os.remove(extracted_csv)
os.rmdir('data_original')