# Convert Momo database to OpenKE format

In [None]:
import json
import os
import re
from collections import defaultdict

import pandas as pd
from sklearn.model_selection import train_test_split

In [1]:
class PKG:
    
    def __init__(self):
                        
        self.data_path = '0310_all_products.json'
        self.save_root = 'OpenKE/benchmarks/momo'
        
        self.ch2en = {
            '品牌名稱': 'Brand',
#             '品牌定位': 'BrandType',
            
            '款式': 'Style',
            '類型': 'Type',
            '產地': 'Origin',
            '尺寸': 'Size',
            '材質': 'Material',
            '適用於': 'Usage',

            '顏色': 'Color',
            '色調': 'Color',

            '功效': 'Function',
            '功能': 'Function',

            '對象與族群': 'Audience',
            '成份': 'Ingredient',
            '口味': 'Flavor',

            '圖案': 'Design',
            '形狀': 'Design'
        }
        
        return
    
    
    def read_data(self, size=10000):
        
        with open(self.data_path, 'rb') as f:
            data = json.load(f)
        
        return data[:size]
    
    
    
    def build_relationship(self, data, attribute, rel_name, concepts_dict, relationships_dict):
        """
        Build relationship between products and attributes for later usage (create_edges)
        convert the label of relationship into uppercase (neo4j naming convention)
        """
        
        data = data.replace("'",'')
        attribute = attribute.replace("'",'')

        rel_label = self.ch2en[rel_name]

        concepts_dict[rel_label].add(attribute)
        relationships_dict[rel_label].append([data, attribute, rel_label.upper()])

        return concepts_dict, relationships_dict
    
    
    def process_data(self, data):
        
        relationships_dict = defaultdict(list)
        concepts_dict = defaultdict(set)

        for product in data:

            title = product['Product']

            concepts_dict['Product'].add(title)

            attributes_keys = product['Attributes'].keys()

            for attribute in attributes_keys:
                # we only construct the graph with some selected attributes
                if attribute not in self.ch2en:
                    continue

                if attribute == '品牌定位':
                    # 品牌定位 is only connected to the corresponding brand
                    start = product['Attributes']['品牌名稱'][0]
                    end = product['Attributes']['品牌定位'][0]
                    concepts_dict, relationships_dict = self.build_relationship(start, end, attribute, concepts_dict, relationships_dict)    
                    continue

                for value in product['Attributes'][attribute]:
                    concepts_dict, relationships_dict = self.build_relationship(title, value, attribute, concepts_dict, relationships_dict)    
        
        return dict(concepts_dict), dict(relationships_dict)
        
    
    def entity2id(self, concepts_dict):
        
        concepts_id = [entity for key in concepts_dict for entity in concepts_dict[key]]
        concepts_df = pd.DataFrame(concepts_id).reset_index()
        
        # change the order of columns
        concepts_df.columns = ['index', 'entity']
        concepts_df = concepts_df[['entity', 'index']]

        # set the first column name to the numeber of lines
        concepts_df.columns = [concepts_df.shape[0], None]

        concepts_df.to_csv(os.path.join(self.save_root, 'entity2id.txt'), sep ='\t', index=False)
        
        return concepts_df
    
    
    def relation2id(self):
        
        relation_id = [s.upper() for s in set(self.ch2en.values())]
        relation_df = pd.DataFrame(relation_id).reset_index()
        
        # change the order of columns
        relation_df.columns = ['index', 'relation']
        relation_df = relation_df[['relation', 'index']]

        # set the first column name to the numeber of lines
        relation_df.columns = [relation_df.shape[0], None]
        
        relation_df.to_csv(os.path.join(self.save_root, 'relation2id.txt'), sep ='\t', index=False)
        
        return relation_df
        

    def build_dataset_df(self, relationships_dict, concepts_df, relation_df):
        
        relationships_list = [r for key in relationships_dict for r in relationships_dict[key]]
        df = pd.DataFrame(relationships_list)
        df.columns = ['e1', 'e2', 'rel']
        df.to_csv(os.path.join(self.save_root, 'relationships_raw.txt'), sep ='\t')
        
        # convert to ids
        concept_id_dict = {v: k for k, v in concepts_df.iloc[:, 0].to_dict().items()}
        relation_id_dict = {v: k for k, v in relation_df.iloc[:, 0].to_dict().items()}

        relationships_list_ids = []
        for e1, e2, rel in relationships_list:
            relationships_list_ids.append([concept_id_dict[e1], concept_id_dict[e2], relation_id_dict[rel]])
            
        relationships_ids_df = pd.DataFrame(relationships_list_ids)
        relationships_ids_df.to_csv(os.path.join(self.save_root, 'relationships_ids.txt'), sep ='\t')
        
        return relationships_ids_df
    
    
    def build_train_test(self, dataset, test_size=0.2, valid_size=0.25):
        
        train, test = train_test_split(dataset, test_size=test_size, random_state=24)
        train, valid = train_test_split(train, test_size=valid_size, random_state=24)
        
        train.columns = [train.shape[0], None, None]
        test.columns = [test.shape[0], None, None]
        valid.columns = [valid.shape[0], None, None]
        
        train.to_csv(os.path.join(self.save_root, 'train2id.txt'), sep ='\t', index=False)
        test.to_csv(os.path.join(self.save_root, 'test2id.txt'), sep ='\t', index=False)
        valid.to_csv(os.path.join(self.save_root, 'valid2id.txt'), sep ='\t', index=False)
        
        return train, test, valid
        
    
    def main(self):
        
        data = self.read_data()
        concepts_dict, relationships_dict = self.process_data(data)
        
        relation_df = self.relation2id()
        concepts_df = self.entity2id(concepts_dict)
        
        dataset = self.build_dataset_df(relationships_dict, concepts_df, relation_df)
        
        train, test, valid = self.build_train_test(dataset)
        
        return train, test, valid

In [2]:
# Create the converter; its constructor sets the input path, the output
# directory and the attribute-name mapping.
pkg = PKG()

In [None]:
# Load the product records (read_data keeps the first 10000 by default).
data = pkg.read_data()
# Group entity names and [head, tail, RELATION] triples by relation label.
concepts_dict, relationships_dict = pkg.process_data(data)