# Process momo product details

In [1]:
import os
import json
import re
from collections import defaultdict

In [3]:
class Info:
    """
    Convert one raw crawled product record into a flat info dictionary.

    The record is read through the keys 'title', '折扣後價格', 'spec',
    'category', 'page_URL' and (as a fallback brand) '品牌名稱'.
    """

    def __init__(self, product):
        """
        :type product: dictionary
        """
        self.product = product
        self.info = {}

    def find_attributes(self, html):
        """
        Capture data attributes from spec

        The i-th <th> is treated as the attribute name of the i-th <td>;
        every <li> inside that <td> that holds a plain string becomes one
        value of the attribute.

        :type html: string
        :rtype attributes: dictionary
        """

        # NOTE(review): BeautifulSoup is not imported in any visible cell;
        # `from bs4 import BeautifulSoup` must have been run beforehand.
        soup = BeautifulSoup(html, 'lxml')
        headers = soup.find_all('th')
        # Collect the cells once instead of re-scanning the document for
        # every header.
        cells = soup.find_all('td')

        attributes = defaultdict(list)

        # zip() stops at the shorter sequence, so a header without a matching
        # cell is ignored instead of raising IndexError.
        for header, cell in zip(headers, cells):
            name = header.string
            for item in cell.find_all('li'):
                if item.string is not None:
                    attributes[name].append(item.string.strip())

        # Make sure the brand is always available: fall back to the brand
        # stored on the raw product when the spec table does not list one.
        if '品牌名稱' not in attributes:
            attributes['品牌名稱'].append(self.product['品牌名稱'])

        return dict(attributes)

    def find_categories(self, html):
        """
        Capture hierarchical categories of a product

        The breadcrumb text is split on '\\xa0>\\xa0'; each non-blank segment
        becomes 'level_<i>', numbered from 0 in breadcrumb order.

        :type html: string
        :rtype categories: dictionary
        """

        segments = html.replace('\n', '').split('\xa0>\xa0')
        # Strip before filtering so whitespace-only segments are dropped
        # rather than stored as empty category names.
        stripped = (segment.strip() for segment in segments)
        kept = [segment for segment in stripped if segment]

        return {f'level_{i}': segment for i, segment in enumerate(kept)}

    def wrap_info(self):
        """
        Process information of a product

        :rtype self.info: dictionary
        """

        self.info['Product'] = self.product['title']
        self.info['Price'] = self.product['折扣後價格']

        self.info['Attributes'] = self.find_attributes(self.product['spec'])
        self.info['Categories'] = self.find_categories(self.product['category'])

        self.info['Url'] = self.product['page_URL']

        return self.info

In [None]:
# Build `all_products`: one processed info dictionary per crawled product
# that has a title, taken from every file in ROOT whose name contains '0310'.
ROOT = '../ecommerce/detail_momo'
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# all_products reproducible between runs.
files = sorted(os.listdir(ROOT))

all_products = []
COUNT = 0

for file in files:

    # select only data crawled on March 10th
    if '0310' not in file:
        continue

    # Progress indicator: print the number of files handled so far,
    # once every 10 files.
    if COUNT % 10 == 0:
        print(COUNT)
    COUNT += 1

    # Read as UTF-8 explicitly so the Chinese keys and values do not depend
    # on the platform's default encoding.
    with open(os.path.join(ROOT, file), 'r', encoding='utf-8') as f:
        data = json.load(f)

    for product in data:
        # Remove data without title (missing key or empty value)
        if not product.get('title'):
            continue

        all_products.append(Info(product).wrap_info())

In [None]:
# Write every processed product to one JSON file in the working directory.

with open('0310_all_products.json', 'w') as out_file:
    out_file.write(json.dumps(all_products))

# Connect to neo4j and create the knowledge graph

In [None]:
from py2neo import Graph

import json
import re
from collections import defaultdict

from py2neo import Graph

import json
import re
from collections import defaultdict

class KG:
    """
    Build a product knowledge graph in neo4j from the processed product file.

    Products, prices and a selected set of attributes become nodes; every
    (product, attribute value) pair becomes an edge whose type is the
    upper-cased English label and whose `name` property is the original
    Chinese attribute name.
    """

    # Relationship labels whose edges do not start from a Product node.
    # 品牌定位 (BrandType) relationships are built from the brand name, so the
    # start node has to be looked up among the Brand nodes.
    _EDGE_START_LABELS = {'BrandType': 'Brand'}

    def __init__(self):
        """
        Open the neo4j connection and set the input path and the mapping
        from Chinese attribute names to English node labels.
        """

        # NOTE(review): 7687 is neo4j's default Bolt port but is passed as
        # http_port here — confirm against the py2neo version in use.
        # NOTE(review): host and credentials are hard-coded; consider reading
        # them from the environment.
        self.g = Graph(
            host="13.112.207.201",  # IP of the neo4j server
            http_port=7687,  # port of the neo4j server
            user="neo4j",
            password="123456")

        ############ CAUTION WITH THIS LINE !!! ############
        # Only run this line if you really want to erase everything that is
        # stored in the neo4j database.
        # self.g.delete_all()

        self.path = '0310_all_products.json'

        # Only attributes listed here are added to the graph. Several Chinese
        # names may share one English label (e.g. 顏色 / 色調 -> Color).
        self.ch2en = {
            '品牌名稱': 'Brand',
            '品牌定位': 'BrandType',

            '款式': 'Style',
            '類型': 'Type',
            '產地': 'Origin',
            '尺寸': 'Size',
            '材質': 'Material',
            '適用於': 'Usage',

            '顏色': 'Color',
            '色調': 'Color',

            '功效': 'Function',
            '功能': 'Function',

            '對象與族群': 'Audience',
            '成份': 'Ingredient',
            '口味': 'Flavor',
            '香味': 'Smell',

            '圖案': 'Design',
            '形狀': 'Design',

            '價格': 'Price',
        }

    def read_data(self, limit=20000):
        """
        Load the processed products from self.path

        :type limit: int or None (None loads everything)
        :rtype data: list
        """

        with open(self.path, 'rb') as f:
            data = json.load(f)

        # Only the first `limit` products are used (20000 by default).
        return data[:limit]

    def build_relationship(self, data, attribute, rel_name, concepts_dict, relationships_dict):
        """
        Build relationship between products and attributes for later usage (create_edges)
        convert the label of relationship into uppercase (neo4j naming convention)

        The relationship is stored as the string
        '<start>@<LABEL>@<rel_name>@<end>' under its English label.

        :type data: string (name of the start node)
        :type attribute: string (name of the end node)
        :type rel_name: string (Chinese attribute name, a key of self.ch2en)
        :rtype: tuple (concepts_dict, relationships_dict)
        """

        # Apostrophes are removed from node names everywhere (see create_node)
        # so that both sides of an edge match the stored node names.
        data = data.replace("'", '')
        attribute = attribute.replace("'", '')

        rel_label = self.ch2en[rel_name]

        concepts_dict[rel_label].add(attribute)
        relationships_dict[rel_label].add('@'.join([data, rel_label.upper(), rel_name, attribute]))

        return concepts_dict, relationships_dict

    def process_data(self, data):
        """
        Collect node names and relationships from the processed products

        :type data: list of dictionaries with 'Product', 'Price', 'Attributes'
        :rtype: tuple (concepts_dict, relationships_dict), both mapping an
                English label to a set of strings
        """

        relationships_dict = defaultdict(set)
        concepts_dict = defaultdict(set)

        for product in data:

            title = product['Product']
            # Keep only the digits of the price text.
            price = re.sub("[^0-9]", "", product['Price'])

            concepts_dict['Product'].add(title)
            self.build_relationship(title, price, '價格', concepts_dict, relationships_dict)

            attributes = product['Attributes']

            for attribute, values in attributes.items():
                # we only construct the graph with some selected attributes
                if attribute not in self.ch2en:
                    continue

                if attribute == '品牌定位':
                    # 品牌定位 is only connected to the corresponding brand
                    start = attributes['品牌名稱'][0]
                    end = values[0]
                    self.build_relationship(start, end, attribute, concepts_dict, relationships_dict)
                    continue

                for value in values:
                    self.build_relationship(title, value, attribute, concepts_dict, relationships_dict)

        return dict(concepts_dict), dict(relationships_dict)

    def count_attributes(self, data):
        """
        Count, for every attribute name, how many products carry it and
        which distinct values it takes

        :type data: list of dictionaries with an 'Attributes' entry
        :rtype: dictionary {attribute: {'count': int, 'value': set}}
        """
        all_attributes_dict = defaultdict(lambda: {'count': 0, 'value': set()})

        for product in data:
            for attribute, values in product['Attributes'].items():
                stats = all_attributes_dict[attribute]
                stats['count'] += 1
                stats['value'].update(values)

        return dict(all_attributes_dict)

    @staticmethod
    def _clean_names(nodes):
        """Return the non-empty names without apostrophes, de-duplicated in first-seen order."""
        cleaned = (name.replace("'", '') for name in nodes if name)
        return list(dict.fromkeys(name for name in cleaned if name))

    @staticmethod
    def _chunks(items, size):
        """Yield consecutive slices of `items` holding at most `size` elements."""
        for start in range(0, len(items), size):
            yield items[start:start + size]

    @staticmethod
    def _split_relationship(record):
        """Split a '<start>@<LABEL>@<rel_name>@<end>' record into its four parts."""
        # Split from the right so an '@' inside the start name (e.g. a product
        # title) does not shift the remaining fields.
        return record.rsplit('@', 3)

    def _edge_start_label(self, rel_label):
        """Return the node label that edges of `rel_label` start from."""
        return self._EDGE_START_LABELS.get(rel_label, 'Product')

    def create_node(self, label, nodes, batch_size=500):
        """
        Create nodes in neo4j first

        One query is sent per batch of at most `batch_size` names, including
        the final, possibly shorter, batch.

        :type label: string (node label)
        :type nodes: iterable of strings (node names)
        :type batch_size: int
        """

        names = self._clean_names(nodes)
        total = len(names)
        batch_all = -(-total // batch_size)  # ceiling division
        print(f'{label}: {batch_all} batches')

        # Labels cannot be passed as Cypher parameters, so only the label is
        # interpolated; the names travel as a query parameter.
        query = f'UNWIND $names AS name CREATE (:{label} {{name: name}})'

        finished = 0
        for batch in self._chunks(names, batch_size):
            self.g.run(query, {'names': batch})
            finished += len(batch)
            print(f'[{label}]  {finished}/{total} finished')

    def create_edges(self, relationship, start_type, end_type, batch_size=100):
        """
        Create edges in neo4j with relationship

        :type relationship: iterable of '<start>@<LABEL>@<rel_name>@<end>' strings
        :type start_type: string (label of the start nodes)
        :type end_type: string (label of the end nodes)
        :type batch_size: int (progress is printed every `batch_size` edges)
        """

        for count, record in enumerate(relationship):
            start_name, r_type, r_name, end_name = self._split_relationship(record)

            # Labels and relationship types cannot be parameters; the names
            # are passed as parameters so quotes or backslashes in scraped
            # text cannot break the query.
            sql = (
                f'MATCH (m:{start_type}), (n:{end_type}) '
                'WHERE m.name = $start AND n.name = $end '
                f'CREATE (m)-[:{r_type} {{name: $rel_name}}]->(n)'
            )
            params = {
                'start': start_name.replace("'", ''),
                'end': end_name.replace("'", ''),
                'rel_name': r_name,
            }
            # Best effort: a failing edge is reported and the loop continues.
            try:
                self.g.run(sql, params)
            except Exception as e:
                print(e)

            if count % batch_size == 0:
                print(f'\t batch = {count}')

    def create_graph(self):
        """
        Load the data, then create all nodes and finally all edges in neo4j
        """

        print('Loading data...')
        data = self.read_data()
        concepts_dict, relationships_dict = self.process_data(data)

        print('Creating attribute nodes...')
        # Merge the values of attributes that share one label before creating
        # nodes, so a value is not created once per Chinese attribute name.
        values_by_label = defaultdict(set)
        for attribute, stats in self.count_attributes(data).items():
            label = self.ch2en.get(attribute)
            if label is not None:
                values_by_label[label].update(stats['value'])

        # Price nodes are created together with the prices parsed in
        # process_data (below), again to avoid duplicates.
        price_values = values_by_label.pop('Price', set()) | concepts_dict.get('Price', set())

        for label, values in values_by_label.items():
            self.create_node(label, values)

        print('Creating product nodes...')
        self.create_node('Product', concepts_dict.get('Product', set()))

        print('Creating price nodes...')
        self.create_node('Price', price_values)

        print('Creating edges...')
        # create edges
        for r_key, relationships in relationships_dict.items():
            print(r_key)
            self.create_edges(relationships, self._edge_start_label(r_key), r_key)

In [None]:
kg = KG()
kg.create_graph()