### Data pre-processing for Amazon books
This notebook is going to transfer the raw dataset to .pickle file that is fit for HGN model.

The test datasets is Amazon books

In [1]:
import pandas as pd
import numpy as np

dir = 'E:\Sebnewrepo\Data\hgnData/'
rating_file = 'ratings_Books.csv'

In [2]:
# read the raw csv data
col = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(dir + rating_file, sep = ',', names = col)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,AH2L9G3DQHHAJ,116,4.0,1019865600
1,A2IIIDRK3PRRZY,116,1.0,1395619200
2,A1TADCM7YWPQ8M,868,4.0,1031702400
3,AWGH7V0BDOJKB,13714,4.0,1383177600
4,A3UTQPQPM4TQO0,13714,5.0,1374883200


In [3]:
# count the user amount and item amount

print('Amount of Users: ', len(df['user_id'].unique()))
print('Amount of Items: ', len(df['item_id'].unique()))     

Amount of Users:  8026324
Amount of Items:  2330066


In [4]:
# delete the records with rating smaller than 4

df = df[df.rating >= 4]
print('Amount of Users: ', len(df['user_id'].unique()))
print('Amount of Items: ', len(df['item_id'].unique())) 

Amount of Users:  7118528
Amount of Items:  2138299


In [5]:
# remove infrequent items and users
from copy import deepcopy 
def rm_infrequent_items(data, min_counts):
    df = deepcopy(data)
    counts = df['item_id'].value_counts()
    df = df[df['item_id'].isin(counts[counts >= min_counts].index)]
    print("items with < {} interactoins are removed".format(min_counts))
    return df

def rm_infrequent_users(data, min_counts):
    df = deepcopy(data)
    counts = df['user_id'].value_counts()
    df = df[df["user_id"].isin(counts[counts >= min_counts].index)]
    print("users with < {} interactoins are removed".format(min_counts))
    return df

filtered_df = rm_infrequent_users(df, 20)
filtered_df = rm_infrequent_items(filtered_df, 20)
print('num of users:{}, num of items:{}'.format(len(filtered_df['user_id'].unique()), len(filtered_df['item_id'].unique())))

users with < 20 interactoins are removed
items with < 20 interactoins are removed
num of users:76696, num of items:41265


In [6]:
# Parse the item review that in itemset  
item_list = filtered_df['item_id'].unique()
item_set = set(item_list)
# print(item_list[:10])

review_file = 'reviews_Books_5.json.gz'

import json
import gzip

def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
      
review_dict = dict()  # [review_id] = review_text
review_helpful = dict()

# store the review into dictionary and delete the item without any review that longer than 10 words
for l in parse(dir + review_file):
    if l['asin'] in item_set:
        if l['asin'] in review_dict:
            if l['helpful'][0] / float(l['helpful'][1] + 0.01) > review_helpful[l['asin']] and len(l['reviewText']) > 10:
                review_dict[l['asin']] = l['reviewText']
                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)
        else:
            if len(l['reviewText']) > 10:
                review_dict[l['asin']] = l['reviewText']
                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)
            


In [7]:
review_dict['000100039X']

'It is a true masterpiece in which almost every, if not every word counts.  I have read it over and over again, since a girlfriend gave me a copy just before I left for Berkeley to attend law school.  I have given copies to many friends, and recommended to others that they buy it and read it carefully.There is enormous wisdom in each chapter, especially given the materialistic and secular world in which we live - which often seems for many people to be devoid of meaning or any spiritual underpinnings.  Gibran cuts through to the beauty and essence of Life, and his words are just as profound today as when they were written in the early years of the last century.This book should be recommended reading for any young person who is old enough to wonder what Life is really all about.  Indeed, it can be picked up at any age, and it offers insights that are brilliant and beautifully written.I urge anyone, who is thinking about buying it, to do so.  Just peruse a copy at a local bookstore or li

In [8]:
review_helpful['000100039X']

0.9980039920159681

In [9]:
# convert to sequential data per user
def convert_data(data):
    df = deepcopy(data)
    df_ordered = df.sort_values(['timestamp'], ascending=True)
    data = df_ordered.groupby('user_id')['item_id'].apply(list)
    unique_data = df_ordered.groupby('user_id')['item_id'].nunique()
    # delete users whose items are less than 10
    data = data[unique_data[unique_data >= 10].index]
    print(data[:10])
    print(len(data))
    return data

In [10]:
seq_data = convert_data(filtered_df)

user_id
A002359833QJM7OQHCXWY    [B00BUKRALG, B00BWY3UKU, B004OEKH7Y, 076420477...
A00463782V7TKAP9EMNL     [B00ES4C28C, 1941450008, B004XJ6922, 148481477...
A00579222Q4YKY0J53RLA    [193415766X, 0345492641, 1451608160, 193639924...
A006458827ALF2J23JJTO    [1489539042, 1482616319, B00DBE8QDU, B00DUFCJ1...
A0092581WFYQNV4KMUZ3     [0425263916, 0615744257, 0060734019, 032147404...
A0099735VDZ3HDCAAYKL     [0451228219, 0451229444, B008XOWVVG, B006BFX4U...
A010971113OD625HDB6X8    [0606262520, 0985023058, 0373210515, 148270683...
A010997525FU27TAPMJCG    [0451165209, 1442346272, 0470101474, 039331929...
A01628721NLXK7ENDWDC9    [B00558WOZG, 1425746845, B004WLOGYE, B004Z1N2G...
A01631062UX24GI4LJKF     [B00EOUWEW4, B00D6IAJHM, B00FXSDNX0, B00G002LH...
Name: item_id, dtype: object
52406


In [11]:
# user sequential data to dict
user_item_dict = seq_data.to_dict()
user_mapping = []
item_set = set()

# create user and item mapping table
for user_id, item_list in seq_data.iteritems():
    user_mapping.append(user_id)
    for item_id in item_list:
        item_set.add(item_id)
item_mapping = list(item_set)

print(len(user_mapping), len(item_mapping))

52406 41264


In [12]:
# create mapping index
def generate_inverse_mapping(data_list):
    inverse_mapping = dict()
    for inner_id, true_id in enumerate(data_list):
        inverse_mapping[true_id] = inner_id
    return inverse_mapping

def convert_to_inner_index(user_records, user_mapping, item_mapping):
    inner_user_records = []
    user_inverse_mapping = generate_inverse_mapping(user_mapping)
    item_inverse_mapping = generate_inverse_mapping(item_mapping)

    for user_id in range(len(user_mapping)):
        real_user_id = user_mapping[user_id]
        item_list = list(user_records[real_user_id])
        for index, real_item_id in enumerate(item_list):
            item_list[index] = item_inverse_mapping[real_item_id]
        inner_user_records.append(item_list)

    return inner_user_records, user_inverse_mapping, item_inverse_mapping

inner_data_records, user_inverse_mapping, item_inverse_mapping = convert_to_inner_index(user_item_dict, user_mapping, item_mapping)
print(inner_data_records[:5])

[[30760, 24286, 31426, 25106, 25128, 34254, 35686, 24214, 30357, 16476, 11606], [37352, 32642, 8719, 27827, 10324, 8990, 6884, 24739, 21241, 21013, 12179, 16461, 25351, 12894, 20743, 26668, 28915, 15250, 5754, 10682], [26150, 30666, 27835, 23353, 39608, 3783, 33181, 10604, 35354, 14855, 7776, 13858, 9705], [14876, 985, 11882, 29715, 19554, 34565, 34633, 33480, 15273, 41192, 36058, 56, 16117, 11583, 15022, 19315, 27686, 30378, 27991, 38207, 16380, 13595, 3976, 35546, 24128, 24605, 11486, 26977, 2593, 40227, 32990, 18319, 29179, 36062, 3029, 24897, 32574, 28105], [30660, 11463, 30343, 17479, 20555, 22125, 15747, 23059, 38904, 19705, 23941, 21548, 26676, 34847, 31675, 32303, 21446, 18125, 40249]]


In [13]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

In [14]:
save_obj(inner_data_records, 'Books_item_sequences')
save_obj(user_mapping, 'Books_user_mapping')
save_obj(item_mapping, 'Books_item_mapping') 