In [1]:
import pandas as pd
from tqdm import tqdm
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from RS.utils.dictutils import *
from RS.utils.dataset import transpose_df


# Root folder for every dataset, plus one sub-folder per data source.
dataroot = "data"
course_dataroot = os.path.join(dataroot, "course")
book_dataroot = os.path.join(dataroot, "book")

## Course data (2010-2021)

In [None]:
# Load the raw course records; the grouping step reads the 'uid' and 'iid'
# columns of this frame.
course_record_csv = os.path.join(course_dataroot, "record_2010_to_2021.csv")
c = pd.read_csv(course_record_csv, encoding="utf-8")

c.head()

### Group by users

In [None]:
# Collapse the course records to one row per user: 'uid' plus a 'course'
# column holding the distinct 'iid' values recorded for that user.
#
# Named aggregation is used instead of passing a {'course': func} dict to a
# SeriesGroupBy: that dict ("nested renamer") form was removed in pandas 1.0
# and raises SpecificationError there.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result is
# reproducible across runs (list(set(...)) has no guaranteed order).
#
# NOTE: the variable is named *_2010_2011 although the input file is
# record_2010_to_2021.csv; the name is kept because later cells refer to it.
usercourse_2010_2011 = c.groupby('uid', as_index=False).agg(
    course=('iid', lambda iids: list(dict.fromkeys(iids)))
)

usercourse_2010_2011.head()

In [None]:
# Store uid as str so it can be matched against the string uids of the
# lending data when the common users are computed.
usercourse_2010_2011['uid']=usercourse_2010_2011['uid'].astype(str)

## Book lending data (2010-2021)

In [None]:
# Load the raw book-lending records and keep uid as a string.
book_record_csv = os.path.join(book_dataroot, "record_2010_to_2021.csv")
b = pd.read_csv(book_record_csv, encoding="utf-8")
b["uid"] = b["uid"].astype(str)
b.head()

### Group by user

In [None]:
# Collapse the lending records to one row per user: 'uid' plus a 'c' column
# holding the list of every 'category' value for that user (duplicates and
# original order are kept).
#
# Named aggregation replaces the {'c': func} dict passed to a SeriesGroupBy;
# that "nested renamer" form raises SpecificationError on pandas >= 1.0.
userbook_2010_2021 = b.groupby('uid', as_index=False).agg(
    c=('category', list)
)

print(userbook_2010_2021.shape)

In [None]:
# Add a 'pref3' column holding the first three characters of each uid.
user = userbook_2010_2021['uid'].tolist()
pre3 = [str(uid)[:3] for uid in user]
userbook_2010_2021['pref3'] = pre3
userbook_2010_2021.head()

### Popular books (2021)

In [None]:
# Load the 2021 popular-book table from the book data folder.
popular_csv = os.path.join(book_dataroot, "popular", "popular_2021.csv")
popular = pd.read_csv(popular_csv)
popular.head()

In [None]:
# String form of every value in the 'cate3' column, in row order.
popular_cate3_list = [str(cate) for cate in popular.cate3.tolist()]
print(popular_cate3_list[:5])

In [None]:
# Build the ordered list of distinct popular categories, keeping only the
# entries whose text parses as an integer (anything else is skipped).
# Despite its name, pop_set is a list: first-seen order is preserved.
pop_set = []
seen_cates = set()  # O(1) duplicate check instead of scanning the list
for cate in popular_cate3_list:
    if cate in seen_cates:
        continue
    try:
        int(cate)  # validation only; the original string is what gets stored
    except (TypeError, ValueError):
        # Not an integer literal -> skip. Only conversion failures are
        # swallowed, so KeyboardInterrupt and real bugs still surface.
        continue
    seen_cates.add(cate)
    pop_set.append(cate)
print(pop_set[:5])
print(len(pop_set))

In [None]:
# Persist the popular-category list next to the popular CSV.
# NOTE(review): the key is spelled 'popluar_2021' (sic); it is left untouched
# because readers of popular.json may look it up under that exact spelling.
popular_json_path = os.path.join(dataroot, "book", "popular", "popular.json")
writejson({'popluar_2021': pop_set}, popular_json_path)

## Filter out users that are not common to both datasets

### Get common users

In [None]:
# uid lists of both datasets as plain Python strings
# (lending uids are additionally stripped of surrounding whitespace).
lendusers = [uid.strip() for uid in tqdm(userbook_2010_2021['uid'].tolist())]
print(lendusers[0])

courseusers = [str(uid) for uid in tqdm(usercourse_2010_2011['uid'].tolist())]
print(courseusers[0])

In [None]:
# Union of the uids seen in either dataset, materialised as a list.
user_pool = set()
user_pool.update(tqdm(lendusers))
user_pool.update(tqdm(courseusers))

all_users = list(user_pool)
print(len(all_users))

In [None]:
# Keep the users that appear in BOTH the course data and the lending data.
# Membership is tested against sets: testing against the original lists makes
# every lookup a linear scan and the whole loop quadratic in the user count.
course_user_set = set(courseusers)
lend_user_set = set(lendusers)
common_user_list = [
    uid for uid in tqdm(all_users)
    if uid in course_user_set and uid in lend_user_set
]
print(len(common_user_list))

In [None]:
# Save the common uids, one per line, so they can be reloaded without
# recomputing them.
commonuser_path = os.path.join(dataroot, "commonuser.txt")
with open(commonuser_path, "w+") as f:
    f.writelines(f"{cu}\n" for cu in common_user_list)

### Filter out non-common users

In [3]:
# Reuse common_user_list when it is already defined in this kernel; otherwise
# reload it from commonuser.txt (one uid per line).
# Only NameError is caught, so a failure while reading the file is not
# mistaken for "variable missing". No scratch variable named `c` is used, so
# the course DataFrame `c` is neither overwritten nor deleted by this cell.
try:
    common_user_list
except NameError:
    print("from disk")
    with open(os.path.join(dataroot, "commonuser.txt"), "r") as f:
        common_user_list = [line.strip() for line in f]

print(len(common_user_list))

from disk
21829


In [None]:
# Lending rows restricted to the common users, ordered by uid.
lend_is_common = userbook_2010_2021['uid'].isin(common_user_list)
lend_common_user_df = userbook_2010_2021[lend_is_common].sort_values(by=['uid'])

In [None]:
# Course rows restricted to the common users, ordered by uid.
course_is_common = usercourse_2010_2011['uid'].isin(common_user_list)
course_common_user_df = usercourse_2010_2011[course_is_common].sort_values(by=['uid'])

In [None]:
# Show the (rows, columns) shape of both filtered frames side by side.
print(f"{lend_common_user_df.shape}, {course_common_user_df.shape}")

In [None]:
### Check if user order is correct
# Walk both uid columns in parallel and report the first row position where
# the (whitespace-stripped) uids differ.
# The columns are pulled out once instead of calling .iloc[i]['uid'] for every
# row, and a row-count mismatch is reported explicitly rather than surfacing
# as an IndexError or going unnoticed.
lend_uids = lend_common_user_df['uid'].tolist()
course_uids = course_common_user_df['uid'].tolist()

if len(lend_uids) != len(course_uids):
    print(f"row count mismatch: {len(lend_uids)} lend vs {len(course_uids)} course")

for i, (lendui, courseui) in enumerate(
    tqdm(zip(lend_uids, course_uids), total=min(len(lend_uids), len(course_uids)))
):
    if courseui.strip() != lendui.strip():
        print(f"{i}  go wrong")
        break

### Store as JSON files

#### Course

In [2]:
# Reload the per-user course table from its CSV export.
# NOTE(review): no cell shown here writes course/commonuser.csv; it is
# presumably produced elsewhere, so confirm it exists before Run All.
course_commonuser_csv = os.path.join(course_dataroot, "commonuser.csv")
course_common_user_df = pd.read_csv(course_commonuser_csv, encoding="utf-8")

In [None]:
# Preview the first rows of the reloaded course table.
course_common_user_df.head()

In [6]:
# Build {uid (int): [course, ...]} from the reloaded course table and dump it
# as JSON.
# The 'course' column is text after the CSV round trip (it is sliced and split
# like a string below): the surrounding brackets are dropped with [1:-1] and
# the rest is split on commas. Items keep whatever quoting they had in the CSV.
# The dict is built BEFORE the output file is opened, so a failure while
# parsing cannot leave behind a truncated/empty JSON file, and the two columns
# are iterated directly rather than through a per-row .iloc lookup.
user_course_dict = {}
course_rows = zip(course_common_user_df['uid'], course_common_user_df['course'])
for userid, usercourses in tqdm(course_rows, total=course_common_user_df.shape[0]):
    inner = usercourses[1:-1]
    # An empty list ("[]") must give [], not [""].
    user_course_dict[int(userid)] = (
        [item.strip() for item in inner.split(",")] if inner.strip() else []
    )

with open(os.path.join(dataroot, "course", "commonuser_course.json"), "w") as f:
    json.dump(user_course_dict, f, indent=4)


100%|██████████| 21829/21829 [00:08<00:00, 2663.63it/s]


#### Book

In [7]:
# Reload the per-user lending table from its CSV export.
# NOTE(review): no cell shown here writes book/commonuser.csv; it is
# presumably produced elsewhere, so confirm it exists before Run All.
book_commonuser_csv = os.path.join(book_dataroot, "commonuser.csv")
lend_common_user_df = pd.read_csv(book_commonuser_csv, encoding="utf-8")

In [None]:
# Preview the first rows of the reloaded lending table.
lend_common_user_df.head()

In [None]:
# Spot-check: raw 'c' value(s) stored for one sample uid.
t = lend_common_user_df.loc[lend_common_user_df['uid'] == 400110002, 'c']
print(t.values)

In [13]:
# Build {uid (int): [category, ...]} from the reloaded lending table and dump
# it as JSON.
# The 'c' column is text after the CSV round trip: the surrounding brackets
# are dropped with [1:-1], the rest is split on commas, and each item loses
# its first and last character after stripping whitespace.
# NOTE(review): that last step presumably removes the quotes around each
# category; an unquoted item would lose real characters — confirm against
# the data.
# The dict is built BEFORE the output file is opened, so a failure while
# parsing cannot leave behind a truncated/empty JSON file, and the two columns
# are iterated directly rather than through a per-row .iloc lookup.
user_book_dict = {}
lend_rows = zip(lend_common_user_df['uid'], lend_common_user_df['c'])
for userid, userbooks in tqdm(lend_rows, total=lend_common_user_df.shape[0]):
    inner = userbooks[1:-1]
    # An empty list ("[]") must give [], not [""].
    user_book_dict[int(userid)] = (
        [item.strip()[1:-1] for item in inner.split(",")] if inner.strip() else []
    )

with open(os.path.join(dataroot, "book", "commonuser_book.json"), "w") as f:
    json.dump(user_book_dict, f, indent=4)

100%|██████████| 21829/21829 [00:08<00:00, 2487.57it/s]
