In [1]:
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and pandas copy/view
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from rectools import Columns
import requests
import zipfile as zf
from pathlib import Path
import os

In [3]:
# Directory that receives every prepared CSV written in the "Saving prepared data" section.
# The path is relative, so it is resolved against the kernel's current working directory.
DATA_SAVE_PATH = Path('../../data/03_primary')

## Data loading

In [4]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# Stream the archive to 'kion.zip' in 1 MiB chunks so it never has to fit in memory.
# The response and the progress bar are context-managed so the connection is
# released and the bar is finalised even if the download is interrupted.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on 4xx/5xx instead of silently saving an error page as the zip.
    req.raise_for_status()
    # A missing Content-Length header falls back to 0 (bar without a known total).
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive handle even if extraction fails.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()

In [6]:
# Read the three extracted tables (users, items, interactions) into DataFrames.
raw_csv_paths = (
    'data_original/users.csv',
    'data_original/items.csv',
    'data_original/interactions.csv',
)
users, items, interactions = map(pd.read_csv, raw_csv_paths)

### Preparation of interactions dataframe

In [7]:
# Map the dataset's own column names onto the rectools naming convention.
rectools_column_names = {
    'last_watch_dt': 'datetime',
    'watched_pct': 'weight',
}
interactions = interactions.rename(columns=rectools_column_names)

In [8]:
# Keep only rows whose date string is exactly 10 characters long (YYYY-MM-DD);
# missing dates fail the comparison and are discarded too.
# The surviving strings are then parsed into pandas datetimes.
has_ten_char_date = interactions[Columns.Datetime].str.len() == 10
interactions = interactions.loc[has_ten_char_date].copy()
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime],
    format='%Y-%m-%d',
)

In [9]:
# Latest interaction date in the dataset; the train/test split below is
# measured back from this value.
max_date = interactions[Columns.Datetime].max()

In [10]:
def get_score(weight: float) -> int:
    """Convert a watched percentage into a 1-5 score.

    Buckets use exclusive lower bounds: ``> 80`` -> 5, ``> 60`` -> 4,
    ``> 40`` -> 3, ``> 20`` -> 2, anything else -> 1. NaN fails every
    comparison and therefore also maps to 1.

    Args:
        weight: Watched percentage of the item.

    Returns:
        Integer score between 1 and 5.
    """
    if weight > 80.:
        return 5
    if weight > 60.:
        return 4
    if weight > 40.:
        return 3
    if weight > 20.:
        return 2
    return 1


# Element-wise version of get_score for arrays / Series.
# `otypes` is given explicitly: without it np.vectorize infers the output type
# by calling the function on the first element and raises ValueError when the
# input is empty.
vec_get_score = np.vectorize(get_score, otypes=[int])

In [11]:
# Replace the watched percentage in the weight column with its 1-5 score,
# overwriting the column.
# NOTE: not idempotent - running this cell a second time feeds the 1-5 scores
# back through get_score, and since none of them exceeds 20 every weight becomes 1.
interactions[Columns.Weight] = vec_get_score(interactions[Columns.Weight])


In [12]:
# Train/test split by time: everything on or after (max_date - 7 days) goes to
# test, everything earlier goes to train.
split_date = max_date - pd.Timedelta(days=7)
interaction_dates = interactions[Columns.Datetime]
train = interactions[interaction_dates < split_date].copy()
test = interactions[interaction_dates >= split_date].copy()

In [13]:
# Remove 'cold' users from test, i.e. users that never occur in train.
train_user_ids = set(train[Columns.User])
cold_users = set(test[Columns.User]).difference(train_user_ids)
is_cold = test[Columns.User].isin(cold_users)
test = test[~is_cold]

### Preparation of users dataframe

In [17]:
# Keep only the users that still have at least one row in the cleaned interactions.
has_interactions = users[Columns.User].isin(interactions[Columns.User])
users = users.loc[has_interactions]

In [18]:
# Build a long-format (id, value, feature) table from the sex, age and income
# columns: one frame per feature, stacked on top of each other.
user_features_frames = [
    users.reindex(columns=[Columns.User, name])
    .rename(columns={Columns.User: "id", name: "value"})
    .assign(feature=name)
    for name in ("sex", "age", "income")
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [19]:
# Restrict the user features to users that occur in train.
user_in_train = user_features['id'].isin(train[Columns.User])
train_user_features = user_features.loc[user_in_train]

### Preparation of items dataframe

In [20]:
# Keep only the items that still appear in the cleaned interactions.
# .copy() makes `items` an independent frame: later cells add columns to it
# ('genre', 'release_year_bin', 'country'), and assigning to a filtered
# selection is what triggers pandas' SettingWithCopyWarning.
items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

In [21]:
# Genre feature: lower-case the comma-separated genre string, split it into a
# list stored in items["genre"], then emit one (id, value, feature) row per genre.
normalized_genres = items["genres"].str.lower().str.replace(", ", ",", regex=False)
items["genre"] = normalized_genres.str.split(",")
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [22]:
# Content type feature: one (id, value, feature) row per item.
content_feature = (
    items
    .reindex(columns=[Columns.Item, "content_type"])
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


In [23]:
# Release year feature: items are cut into five quantile-based bins labelled 0-4.
# The builtin `int` replaces `np.int`, an alias for the same type that was
# deprecated in NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
items['release_year_bin'] = pd.qcut(items.release_year, q=5, labels=np.arange(5, dtype=int))
release_year_feature = items.reindex(columns=[Columns.Item, 'release_year_bin'])
release_year_feature.columns = ["id", "value"]
release_year_feature["feature"] = "release_year_bin"
release_year_feature.head()

Unnamed: 0,id,value,feature
0,10711,0,release_year_bin
1,2508,2,release_year_bin
2,10716,1,release_year_bin
3,7868,2,release_year_bin
4,16268,0,release_year_bin


In [24]:
# Country feature: lower-case the comma-separated country string, split it into
# a list stored in items['country'], then emit one row per (item, country) pair.
country_lists = (
    items["countries"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
items['country'] = country_lists
country_feature = items[["item_id", "country"]].explode("country")
country_feature = country_feature.rename(columns={"item_id": "id", "country": "value"})
country_feature["feature"] = "country"
country_feature.head()

Unnamed: 0,id,value,feature
0,10711,испания,country
1,2508,сша,country
2,10716,канада,country
3,7868,великобритания,country
4,16268,ссср,country


In [25]:
# Age rating feature: one (id, value, feature) row per item.
rating_source = items.reindex(columns=[Columns.Item, 'age_rating'])
rating_feature = rating_source.rename(
    columns={Columns.Item: "id", 'age_rating': "value"}
).assign(feature="age_rating")
rating_feature.head()

Unnamed: 0,id,value,feature
0,10711,16.0,age_rating
1,2508,16.0,age_rating
2,10716,16.0,age_rating
3,7868,16.0,age_rating
4,16268,12.0,age_rating


In [26]:
# Stack every per-feature table into a single long (id, value, feature) frame.
item_feature_tables = [
    genre_feature,
    content_feature,
    release_year_feature,
    country_feature,
    rating_feature,
]
item_features = pd.concat(item_feature_tables)
item_features.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [27]:
# Restrict the item features to items that occur in train.
item_in_train = item_features['id'].isin(train[Columns.Item])
train_item_features = item_features.loc[item_in_train]

## Saving prepared data

In [28]:
# Create the output directory (and any missing parents) before the first write:
# DataFrame.to_csv does not create directories and fails if the target is missing.
DATA_SAVE_PATH.mkdir(parents=True, exist_ok=True)

# Persist the full prepared tables.
interactions.to_csv(DATA_SAVE_PATH / 'prepared_interactions_full.csv', index=False)
items.to_csv(DATA_SAVE_PATH / 'prepared_items_full.csv', index=False)
users.to_csv(DATA_SAVE_PATH / 'prepared_users_full.csv', index=False)

In [29]:
# Persist the long-format feature tables covering all items and users.
for file_name, feature_table in (
    ('prepared_featured_items_full.csv', item_features),
    ('prepared_featured_users_full.csv', user_features),
):
    feature_table.to_csv(DATA_SAVE_PATH / file_name, index=False)

In [30]:
# Persist the time-based train/test split of the interactions.
train.to_csv(DATA_SAVE_PATH.joinpath('prepared_interactions_train.csv'), index=False)
test.to_csv(DATA_SAVE_PATH.joinpath('prepared_interactions_test.csv'), index=False)

In [31]:
# Persist the feature tables restricted to users / items seen in train.
train_feature_tables = {
    'prepared_featured_users_train.csv': train_user_features,
    'prepared_featured_items_train.csv': train_item_features,
}
for file_name, feature_table in train_feature_tables.items():
    feature_table.to_csv(DATA_SAVE_PATH / file_name, index=False)

## Deleting unnecessary files

In [32]:
# Delete the downloaded archive and the three extracted CSV files, then remove
# the extraction directory (os.rmdir only succeeds once it is empty).
leftover_files = (
    'kion.zip',
    'data_original/interactions.csv',
    'data_original/items.csv',
    'data_original/users.csv',
)
for leftover in leftover_files:
    os.remove(leftover)
os.rmdir('data_original')