# Data conversion

This script converts the Cypher query that imports the Movie dataset into Neo4j into two dataframes (nodes and edges), which can then be imported into JanusGraph.

In [1]:
# Read the Cypher creation script into memory, one list entry per line
# (trailing newlines are kept).
with open("movieCreationQuery.txt", "r") as fid:
    lines = list(fid)

In [2]:
import re

# A relationship line is recognised by a bracketed section such as ``[:ACTED_IN]``.
# Raw string: "\[" is not a valid Python string escape and warns otherwise.
is_edge_regex = re.compile(r"\[.*\]")


def is_edge(line):
    """Tell whether *line* contains a ``[...]`` section, i.e. looks like an edge.

    Args:
        line: One line of the Cypher script.

    Returns:
        bool: True if a ``[`` is followed later in the line by a ``]``.
    """
    return is_edge_regex.search(line) is not None

# A line is worth parsing only if it contains a parenthesised pattern ``(...)``.
# Raw string: "\(" is not a valid Python string escape and warns otherwise.
is_valid_regex = re.compile(r"\(.*\)")


def is_valid(line):
    """Tell whether *line* contains a ``(...)`` section.

    Args:
        line: One line of the Cypher script.

    Returns:
        bool: True if a ``(`` is followed later in the line by a ``)``.
    """
    return is_valid_regex.search(line) is not None

In [3]:
import json 

def dict_serializer(input_dict):
    """Return a new dict in which nested containers are JSON-encoded.

    Values that are dicts or lists are replaced by their ``json.dumps``
    string; every other value is passed through unchanged.
    """
    serialized = {}
    for key, value in input_dict.items():
        if isinstance(value, (dict, list)):
            serialized[key] = json.dumps(value)
        else:
            serialized[key] = value
    return serialized

def single_quote_parser(props):
    """Parse a Cypher property map whose strings are single-quoted.

    Bare keys (``name:``) are wrapped in quotes, every single quote is then
    replaced by a double quote so the text can be read as JSON, and the
    decoded dict is passed through ``dict_serializer``.

    Note: the key pattern matches any ``word:`` sequence and every ``'`` is
    replaced, so a value containing such a sequence or an apostrophe may
    produce invalid JSON.

    Args:
        props: Property map text, e.g. ``{name:'Keanu Reeves', born:1964}``.

    Returns:
        dict: The properties, with list/dict values JSON-encoded as strings.
    """
    # Raw strings: "\w" is not a valid Python string escape.
    quoted_keys = re.sub(r"(\w+):", r"'\1':", props)
    return dict_serializer(json.loads(quoted_keys.replace("'", "\"")))


def double_quote_parser(props):
    """Parse a Cypher property map whose strings are double-quoted.

    Bare keys (``name:``) are wrapped in double quotes, the text is decoded
    as JSON and the result is passed through ``dict_serializer``.

    Args:
        props: Property map text, e.g. ``{roles:["Emil"]}``.

    Returns:
        dict: The properties, with list/dict values JSON-encoded as strings.
    """
    return dict_serializer(json.loads(re.sub(r"(\w+):", r'"\1":', props)))

In [4]:
from dataclasses import dataclass

@dataclass
class Node:
    """A graph node parsed from one line of the Cypher script."""
    # Variable name that precedes the colon in ``(name:Label {...})``.
    id: str
    # Label that follows the colon, e.g. ``Movie`` or ``Person``.
    label: str
    # Parsed property map; list/dict values are stored as JSON strings.
    props: dict

@dataclass
class Edge:
    """A directed relationship parsed from one line of the Cypher script."""
    # ``(source, target)`` pair of node variable names.
    id: tuple[str, str]
    # Relationship type, e.g. ``ACTED_IN``.
    label: str
    # Parsed property map (empty when the relationship has no properties);
    # list/dict values are stored as JSON strings.
    props: dict

# Captures, in order: the node variable name, its label and the ``{...}``
# property map.
node_regex = re.compile(r'.*\((\w*):(\w*).*({.*})\).*')


def parse_node_line(line):
    """Turn a node-creation line into a ``Node``.

    Args:
        line: Text containing ``(name:Label {key: value, ...})``.

    Returns:
        Node: Built from the variable name, the label and the parsed
        property map.

    Raises:
        ValueError: If *line* does not match ``node_regex``. Without this
            check the failure surfaced as an ``AttributeError`` on ``None``
            that did not say which line was at fault.
    """
    match = node_regex.match(line)
    if match is None:
        raise ValueError(f"line is not a node definition: {line!r}")
    name, label, props = match.groups()

    # A property map containing a double quote is treated as double-quoted.
    parser = double_quote_parser if '"' in props else single_quote_parser

    return Node(name, label, parser(props))

# Relationship carrying a property map: ``(source)-[:TYPE {...}]->(target)``.
edge_regex = re.compile(r'^[a-zA-Z\s]*\((\w+)\)-\[:(\w+) ({.*})\]->\((\w+)\).*')
# Relationship without properties: ``(source)-[:TYPE]->(target)``.
edge_regex_noprop = re.compile(r'^[a-zA-Z\s]*\((\w+)\)-\[:(\w+)]->\((\w+)\).*')


def parse_edge_line(line):
    """Turn a relationship line into an ``Edge``.

    The pattern with a property map is tried first; if it does not match,
    the property-less pattern is tried and the properties default to ``{}``.

    Args:
        line: Text starting (after optional letters/whitespace) with
            ``(source)-[:TYPE {...}]->(target)`` or
            ``(source)-[:TYPE]->(target)``.

    Returns:
        Edge: Identified by ``(source, target)``, labelled with the
        relationship type and carrying the parsed property map.

    Raises:
        ValueError: If *line* matches neither pattern.
    """
    # Explicit match checks instead of a bare ``except:``, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    match = edge_regex.match(line)
    if match is not None:
        source, rel_type, props, target = match.groups()
    else:
        match = edge_regex_noprop.match(line)
        if match is None:
            raise ValueError(f"line is not an edge definition: {line!r}")
        source, rel_type, target = match.groups()
        props = "{}"

    # A property map containing a double quote is treated as double-quoted.
    parser = double_quote_parser if '"' in props else single_quote_parser

    return Edge((source, target), rel_type, parser(props))


def parse_line(line: str):
    """Parse one line of the Cypher script.

    Returns None when the line has no ``(...)`` pattern; otherwise returns
    the result of ``parse_edge_line`` for lines that contain a ``[...]``
    section and of ``parse_node_line`` for all other lines.
    """
    if is_valid(line):
        handler = parse_edge_line if is_edge(line) else parse_node_line
        return handler(line)
    return None


In [5]:
# Pick a single line of the script (index 18) to spot-check the parser on.
line=lines[18]

In [6]:
# Parse the sample line; as the cell's last expression, the result is displayed.
parse_line(line)

Edge(id=('Emil', 'TheMatrix'), label='ACTED_IN', props={'roles': '["Emil"]'})

In [7]:
# Parse every line of the script; lines without a ``(...)`` pattern give None.
parsed_outout = list(map(parse_line, lines))

In [8]:
# Split the parsed results by type; None entries match neither filter.
nodes = [parsed for parsed in parsed_outout if isinstance(parsed, Node)]
edges = [parsed for parsed in parsed_outout if isinstance(parsed, Edge)]

In [9]:
# Number of nodes extracted from the script.
len(nodes)

171

In [10]:
# Number of edges extracted from the script.
len(edges)

254

In [11]:
import pandas as pd

In [12]:
# Preview the first ten parsed nodes.
nodes[:10]

[Node(id='TheMatrix', label='Movie', props={'title': 'The Matrix', 'released': 1999, 'tagline': 'Welcome to the Real World'}),
 Node(id='Keanu', label='Person', props={'name': 'Keanu Reeves', 'born': 1964}),
 Node(id='Carrie', label='Person', props={'name': 'Carrie-Anne Moss', 'born': 1967}),
 Node(id='Laurence', label='Person', props={'name': 'Laurence Fishburne', 'born': 1961}),
 Node(id='Hugo', label='Person', props={'name': 'Hugo Weaving', 'born': 1960}),
 Node(id='LillyW', label='Person', props={'name': 'Lilly Wachowski', 'born': 1967}),
 Node(id='LanaW', label='Person', props={'name': 'Lana Wachowski', 'born': 1965}),
 Node(id='JoelS', label='Person', props={'name': 'Joel Silver', 'born': 1952}),
 Node(id='Emil', label='Person', props={'name': 'Emil Eifrem', 'born': 1978}),
 Node(id='TheMatrixReloaded', label='Movie', props={'title': 'The Matrix Reloaded', 'released': 2003, 'tagline': 'Free your mind'})]

In [13]:
# One record per node, indexed by node id, pickled to "nodes.pkl".
node_records = [
    {"id": node.id, "label": node.label, "props": node.props}
    for node in nodes
]
nodes_frame = pd.DataFrame.from_records(node_records).set_index("id")
nodes_frame.to_pickle("nodes.pkl")

In [14]:
# One record per edge, indexed by its (source, target) id, pickled to "edges.pkl".
edge_records = [
    {"id": edge.id, "label": edge.label, "props": edge.props}
    for edge in edges
]
edges_frame = pd.DataFrame.from_records(edge_records).set_index("id")
edges_frame.to_pickle("edges.pkl")