In [4]:
from __future__ import absolute_import, division, print_function

import os
import pickle
import gzip
import argparse

In [5]:
from utils import *
from data_utils import AmazonDataset
from knowledge_graph import KnowledgeGraph

In [6]:
def generate_labels(dataset, mode='train'):
    review_file = '{}/{}.txt.gz'.format(DATASET_DIR[dataset], mode)
    print('review_file:',review_file)
    user_products = {}  # {uid: [pid,...], ...}
    with gzip.open(review_file, 'r') as f:
        for line in f:
            line = line.decode('utf-8').strip()
            arr = line.split('\t')
            user_idx = int(arr[0])
            product_idx = int(arr[1])
            if user_idx not in user_products:
                user_products[user_idx] = []
            user_products[user_idx].append(product_idx)
    print('user_products:',user_products[0])
    save_labels(dataset, user_products, mode=mode)

# CLOTH

In [8]:
if not os.path.isdir(TMP_DIR[CLOTH]):
    os.makedirs(TMP_DIR[CLOTH])
dataset = AmazonDataset(DATASET_DIR[CLOTH])
save_dataset(CLOTH, dataset)
print('--------')
print(CLOTH)
print('--------')
print(dataset)

Load user of size 39387
Load product of size 23033
Load word of size 21366
Load related_product of size 339367
Load brand of size 1182
Load category of size 1193
Load produced_by of size 23033
Load belongs_to of size 23033
Load also_bought of size 23033
Load also_viewed of size 23033
Load bought_together of size 23033
word_indices: [5]
Load review of size 194439 word count= 194439
Create word sampling rate
--------
cloth
--------
<data_utils.AmazonDataset object at 0x000001A7702090B8>


In [9]:
# Generate knowledge graph instance.
# ========== BEGIN ========== #
print('Create', CLOTH, 'knowledge graph from dataset...')
dataset_cloth = load_dataset(CLOTH)
#print(dataset)
kg_cloth = KnowledgeGraph(dataset_cloth)
#print(kg)
kg_cloth.compute_degrees()
save_kg(CLOTH, kg_cloth)
# =========== END =========== #


Create cloth knowledge graph from dataset...
dataset_file: ./tmp/Amazon_Clothing/dataset.pkl
Load entities...
Total 425528 nodes.
Load reviews...
Total 0 review edges.
uid pid reamained_words. 19768 ; 6191 ; []
type of remained words: <class 'list'>
Load knowledge produced_by...
Total 7928 produced_by edges.
Load knowledge belongs_to...
Total 309666 belongs_to edges.
Load knowledge also_bought...
Total 2826284 also_bought edges.
Load knowledge also_viewed...
Total 289588 also_viewed edges.
Load knowledge bought_together...
Total 31674 bought_together edges.
Remove duplicates...
Compute node degrees...


In [10]:
# Genereate train/test labels.
# ========== BEGIN ========== #
print('Generate', CLOTH, 'train/test labels.')
generate_labels(CLOTH, 'train')#表示第一个用户买的商品的编号
generate_labels(CLOTH, 'test')
# =========== END =========== #

Generate cloth train/test labels.
review_file: ./data/Amazon_Clothing/train.txt.gz
user_products: [22034, 17059, 8668, 469, 21869]
review_file: ./data/Amazon_Clothing/test.txt.gz
user_products: [3375]


# CD

In [11]:
if not os.path.isdir(TMP_DIR[CD]):
    os.makedirs(TMP_DIR[CD])
dataset = AmazonDataset(DATASET_DIR[CD])
save_dataset(CD, dataset)

Load user of size 75258
Load product of size 64443
Load word of size 202959
Load related_product of size 236255
Load brand of size 1414
Load category of size 770
Load produced_by of size 64443
Load belongs_to of size 64443
Load also_bought of size 64443
Load also_viewed of size 64443
Load bought_together of size 64443
word_indices: [5]
Load review of size 194439 word count= 194439
Create word sampling rate


In [12]:
# Generate knowledge graph instance.
# ========== BEGIN ========== #
print('Create', CD, 'knowledge graph from dataset...')
dataset_cd = load_dataset(CD)
print('--')
kg_cd = KnowledgeGraph(dataset_cd)
print('--')
kg_cd.compute_degrees()
print('--')
save_kg(CD, kg_cd)
# =========== END =========== #

Create cd knowledge graph from dataset...
dataset_file: ./tmp/Amazon_CDs/dataset.pkl
--
Load entities...
Total 581099 nodes.
Load reviews...
Total 0 review edges.
uid pid reamained_words. 19768 ; 6191 ; []
type of remained words: <class 'list'>
Load knowledge produced_by...
Total 26762 produced_by edges.
Load knowledge belongs_to...
Total 933902 belongs_to edges.
Load knowledge also_bought...
Total 7383192 also_bought edges.
Load knowledge also_viewed...
Total 34848 also_viewed edges.
Load knowledge bought_together...
Total 88152 bought_together edges.
Remove duplicates...
--
Compute node degrees...
--


In [9]:
# Genereate train/test labels.
# ========== BEGIN ========== #
print('Generate', CD, 'train/test labels.')
generate_labels(CD, 'train')#表示第一个用户买的商品的编号
generate_labels(CD, 'test')
# =========== END =========== #

Generate cd train/test labels.
review_file: ./data/Amazon_CDs/train.txt.gz
user_products: [26837, 40754, 32754, 7929, 1296]
review_file: ./data/Amazon_CDs/test.txt.gz
user_products: [13551]


# BEAUTY

In [7]:
if not os.path.isdir(TMP_DIR[BEAUTY]):
    os.makedirs(TMP_DIR[BEAUTY])
dataset = AmazonDataset(DATASET_DIR[BEAUTY])
save_dataset(BEAUTY, dataset)

Load user of size 22363
Load product of size 12101
Load word of size 22564
Load related_product of size 164721
Load brand of size 2077
Load category of size 248
Load produced_by of size 12101
Load belongs_to of size 12101
Load also_bought of size 12101
Load also_viewed of size 12101
Load bought_together of size 12101
word_indices: [5]
Load review of size 194439 word count= 194439
Create word sampling rate


In [8]:
# Generate knowledge graph instance.
# ========== BEGIN ========== #
print('Create', BEAUTY, 'knowledge graph from dataset...')
dataset_beauty = load_dataset(BEAUTY)
print('--')
kg_beauty = KnowledgeGraph(dataset_beauty)
print('--')
kg_beauty.compute_degrees()
print('--')
save_kg(BEAUTY, kg_beauty)
# =========== END =========== #

Create beauty knowledge graph from dataset...
dataset_file: ./tmp/Amazon_Beauty/dataset.pkl
--
Load entities...
Total 224074 nodes.
Load reviews...
Total 0 review edges.
uid pid reamained_words. 19768 ; 6191 ; []
type of remained words: <class 'list'>
Load knowledge produced_by...
Total 20042 produced_by edges.
Load knowledge belongs_to...
Total 99512 belongs_to edges.
Load knowledge also_bought...
Total 1782364 also_bought edges.
Load knowledge also_viewed...
Total 310700 also_viewed edges.
Load knowledge bought_together...
Total 18042 bought_together edges.
Remove duplicates...
--
Compute node degrees...
--


In [9]:
# Genereate train/test labels.
# ========== BEGIN ========== #
print('Generate', BEAUTY, 'train/test labels.')
generate_labels(BEAUTY, 'train')#表示第一个用户买的商品的编号
generate_labels(BEAUTY, 'test')
# =========== END =========== #

Generate beauty train/test labels.
review_file: ./data/Amazon_Beauty/train.txt.gz
user_products: [1086, 2725, 2136, 535, 8179]
review_file: ./data/Amazon_Beauty/test.txt.gz
user_products: [7745]


# CELL

In [13]:
#parser = argparse.ArgumentParser()
#parser.add_argument('--dataset', type=str, default=BEAUTY, help='One of {BEAUTY, CELL, CD, CLOTH}.')
#args = parser.parse_args()

# Create AmazonDataset instance for dataset.
# ========== BEGIN ========== #
#print('Load', args.dataset, 'dataset from file...')
if not os.path.isdir(TMP_DIR[CELL]):
    os.makedirs(TMP_DIR[CELL])
dataset = AmazonDataset(DATASET_DIR[CELL])
save_dataset(CELL, dataset)

Load user of size 27879
Load product of size 10429
Load word of size 22493
Load related_product of size 101287
Load brand of size 955
Load category of size 206
Load produced_by of size 10429
Load belongs_to of size 10429
Load also_bought of size 10429
Load also_viewed of size 10429
Load bought_together of size 10429
word_indices: [5]
Load review of size 194439 word count= 194439
Create word sampling rate


In [14]:
# Generate knowledge graph instance.
# ========== BEGIN ========== #
print('Create', CELL, 'knowledge graph from dataset...')
dataset = load_dataset(CELL)
#print(dataset)
kg = KnowledgeGraph(dataset)
#print(kg)
kg.compute_degrees()
save_kg(CELL, kg)
# =========== END =========== #

Create cell knowledge graph from dataset...
dataset_file: ./tmp/Amazon_Cellphones/dataset.pkl
Load entities...
Total 163249 nodes.
Load reviews...
Total 1166634 review edges.
uid pid reamained_words. 19768 ; 6191 ; [5]
type of remained words: <class 'list'>
Load knowledge produced_by...
Total 10836 produced_by edges.
Load knowledge belongs_to...
Total 72786 belongs_to edges.
Load knowledge also_bought...
Total 1179008 also_bought edges.
Load knowledge also_viewed...
Total 25860 also_viewed edges.
Load knowledge bought_together...
Total 16888 bought_together edges.
Remove duplicates...
Compute node degrees...


In [15]:
# Genereate train/test labels.
# ========== BEGIN ========== #
print('Generate', CELL, 'train/test labels.')
generate_labels(CELL, 'train')
generate_labels(CELL, 'test')
# =========== END =========== #

Generate cell train/test labels.
review_file: ./data/Amazon_Cellphones/train.txt.gz
user_products: [6155, 6837, 4833, 9995, 10030, 4170, 3955, 7043, 3819, 9863, 6520, 10206, 6532, 8731, 9325]
review_file: ./data/Amazon_Cellphones/test.txt.gz
user_products: [8014, 2989, 1816, 2429, 7437, 1813]


In [None]:
#main
'''
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default=BEAUTY, help='One of {BEAUTY, CELL, CD, CLOTH}.')
    args = parser.parse_args()

    # Create AmazonDataset instance for dataset.
    # ========== BEGIN ========== #
    print('Load', args.dataset, 'dataset from file...')
    
    if not os.path.isdir(TMP_DIR[args.dataset]):
        os.makedirs(TMP_DIR[args.dataset])
    dataset = AmazonDataset(DATASET_DIR[args.dataset])
    save_dataset(args.dataset, dataset)

    # Generate knowledge graph instance.
    # ========== BEGIN ========== #
    print('Create', args.dataset, 'knowledge graph from dataset...')
    dataset = load_dataset(args.dataset)
    kg = KnowledgeGraph(dataset)
    kg.compute_degrees()
    save_kg(args.dataset, kg)
    # =========== END =========== #

    # Genereate train/test labels.
    # ========== BEGIN ========== #
    print('Generate', args.dataset, 'train/test labels.')
    generate_labels(args.dataset, 'train')
    generate_labels(args.dataset, 'test')
    # =========== END =========== #


if __name__ == '__main__':
    main()
'''

In [20]:
class node():
    def _init_(self,val):
        self.left = None
        self.right = None
        self.val = val

root = node(1)
root

TypeError: node() takes no arguments