In [1]:
import os
import gzip
import json
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
# Absolute project root on the author's machine; adjust it when running elsewhere.
# The trailing slash matters: DATA_DIR is built by plain string concatenation.
WORKING_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/'
# Directory prefix for the gzipped JSON inputs read by the loading cells.
DATA_DIR = f'{WORKING_DIR}data/zen/'
# Make the project root the current working directory for the rest of the session.
os.chdir(WORKING_DIR)

# Getting The Data

In [3]:
# Read the item catalogue: one JSON object per line of the gzipped file.
item_records = []
# The context manager closes the gzip handle even if parsing fails midway.
with gzip.open(f"{DATA_DIR}items.json.gz", "rb") as items_file:
    for line in tqdm(items_file):
        record = json.loads(line)
        # Text fields are kept as UTF-8 bytes (storing in utf8 saves RAM).
        record["content"] = record["content"].encode("utf8")
        record["title"] = record["title"].encode("utf8")
        item_records.append(record)
# Build the frame once from the collected records.
items_df = pd.DataFrame(item_records)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
def utf8_preview(d):
    """Return ``d`` with every element decoded from UTF-8 bytes to ``str``.

    Intended to be passed to ``DataFrame.apply``, which calls it once per
    column. If any element cannot be decoded (it has no ``decode`` method, or
    its bytes are not valid UTF-8), ``d`` itself is returned unchanged.
    """
    try:
        return d.apply(lambda x: x.decode("utf8"))
    except (AttributeError, UnicodeDecodeError):
        # Only the expected "not decodable" failures are absorbed; a bare
        # ``except`` would also hide KeyboardInterrupt and genuine bugs.
        return d
# DataFrame.apply hands each column to utf8_preview; the result replaces items_df,
# so the byte-encoded text columns are held as decoded strings from this point on.
items_df = items_df.apply(utf8_preview)
items_df.head()

Unnamed: 0,content,image,itemId,title
0,"Согласитесь, дорогие любители собак, до чего ж...","[-0.169, 0.129, 0.067, 0.019, 0.281, -0.245, 0...",0,Пять забавных «морщинистых» пород собак
1,"Контуры Третьей Поперечной улицы, состоявшей ...","[-0.158, -0.112, -0.325, 0.05, -0.114, 0.002, ...",1,История улицы Ирининской в Гомеле
2,Источник: http://infodays.ru Вообще он как-то ...,"[0.084, -0.181, 0.008, 0.34, -0.03, -0.197, -0...",2,Зачем Дудь всё время спрашивает гостей програм...
3,41-летняя Светлана Зейналова решила окрестить ...,"[0.034, -0.119, -0.062, 0.025, 0.128, -0.041, ...",3,Светлана Зейналова крестила младшую дочь
4,«Организованные преступные группировки ГБАО де...,"[-0.061, -0.015, -0.198, -0.047, 0.054, 0.029,...",4,"ГКНБ: бандиты в ГБАО делают вид, что рассталис..."


In [5]:
# Read per-user training feedback: one JSON object per line of the gzipped file.
user_records = []
# The context manager closes the gzip handle even if parsing fails midway.
with gzip.open(f"{DATA_DIR}train.json.gz", "rb") as train_file:
    for line in tqdm(train_file):
        record = json.loads(line)
        # "trainRatings" maps item id -> rating; both sides are converted with int().
        train_ratings = record["trainRatings"]
        user_records.append({
            "userId": record["userId"],
            # Keys and values of a dict iterate in the same order, so the two
            # arrays stay aligned position by position.
            "userItems": np.array([int(item) for item in train_ratings]),
            "userRatings": np.array([int(rating) for rating in train_ratings.values()]),
        })
# Build the frame once from the collected records.
users_df = pd.DataFrame(user_records)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# Preview the first rows of the per-user feedback table.
users_df.head()

Unnamed: 0,userId,userItems,userRatings
0,0,"[206495, 279694, 19718, 74707, 221548, 127012,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[180165, 286761, 127012, 117072, 86362, 4023, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[127873, 172799, 27499, 286761, 41012, 276191,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,"[127873, 127011, 248445, 286761, 127012, 13046...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ..."
4,4,"[201809, 213372, 188756, 188751, 26379, 240855...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


## Split data

In [58]:
def split_ratings(user_items, user_ratings, train_size=0.8):
    """Split one user's feedback into train and test parts, per rating class.

    Items are paired with their ratings and separated into non-zero and zero
    ratings. The first ``train_size`` fraction of each group (rounded down, in
    the original order, no shuffling) goes to train and the remainder to test.
    Within each part, non-zero ratings come before zero ratings.

    Args:
        user_items: iterable of item ids.
        user_ratings: iterable of ratings, aligned with ``user_items``.
        train_size: fraction of each rating group assigned to train.

    Returns:
        A ``(train, test)`` tuple. Each element is a two-element list
        ``[items, ratings]`` of tuples. A part with no feedback is returned as
        ``[(), ()]`` so callers can always index ``[0]`` and ``[1]``.
    """
    user_feedback = list(zip(user_items, user_ratings))
    nonzero_feedback = [pair for pair in user_feedback if pair[1] != 0]
    zero_feedback = [pair for pair in user_feedback if pair[1] == 0]

    nonzero_cut = int(len(nonzero_feedback) * train_size)
    zero_cut = int(len(zero_feedback) * train_size)
    train_pairs = nonzero_feedback[:nonzero_cut] + zero_feedback[:zero_cut]
    test_pairs = nonzero_feedback[nonzero_cut:] + zero_feedback[zero_cut:]

    def _unzip(pairs):
        # zip(*[]) yields nothing, which would leave the caller with an empty
        # list instead of an (items, ratings) pair; return empty tuples instead.
        if not pairs:
            return [(), ()]
        return list(zip(*pairs))

    return _unzip(train_pairs), _unzip(test_pairs)

In [65]:
# Split every user's feedback and collect one record per user for each side.
# NOTE: these two names start as lists and are rebound to DataFrames in a later cell.
users_train_df, users_test_df = [], []
# Columns are read by name so the loop does not depend on column order.
user_columns = zip(users_df['userId'], users_df['userItems'], users_df['userRatings'])
# Passing ``total`` lets the progress bar show completion for the zip iterator.
for idd, items, ratings in tqdm(user_columns, total=len(users_df)):
    user_train, user_test = split_ratings(items, ratings)
    # Guard against a side with no feedback, where there is nothing to unpack
    # into items and ratings.
    train_items, train_ratings = user_train if len(user_train) == 2 else ((), ())
    test_items, test_ratings = user_test if len(user_test) == 2 else ((), ())
    users_train_df.append({
        'userId': idd,
        'userItems': np.array(train_items),
        'userRatings': np.array(train_ratings),
    })
    users_test_df.append({
        'userId': idd,
        'userItems': np.array(test_items),
        'userRatings': np.array(test_ratings),
    })

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [69]:
# Turn the two per-user record lists into DataFrames, rebinding the same names.
users_train_df, users_test_df = map(pd.DataFrame, (users_train_df, users_test_df))

In [70]:
# Preview the first rows of the training split.
users_train_df.head()

Unnamed: 0,userId,userItems,userRatings
0,0,"[93250, 304018, 213209, 101495, 82995, 117892,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,"[286761, 127012, 114176, 56905, 320506, 146014...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,"[227848, 240616, 43774, 237709, 179355, 213105...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,3,"[165774, 214890, 92304, 204146, 60130, 117329,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,4,"[315060, 200417, 249471, 107092, 208941, 11194...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [71]:
# Preview the first rows of the test split.
users_test_df.head()

Unnamed: 0,userId,userItems,userRatings
0,0,"[24166, 17469, 124365, 165117, 79860, 281354, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,"[106948, 83007, 303304, 28005, 314928, 124158,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,"[218928, 221035, 149971, 56705, 253616, 220349...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,"[102278, 176638, 208449, 211761, 269999, 10677...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,4,"[211569, 256481, 186819, 281578, 185040, 52078...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [72]:
# Full rating vector of the first user in the test split. split_ratings puts the
# non-zero ratings ahead of the zero ratings, hence the grouping in the output.
users_test_df['userRatings'].iloc[0]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Save CSV files