In [1]:
#Import the right libraries
import pandas as pd
import dgl
import numpy as np
import networkx as nx
import torch
from dgl.data.utils import save_graphs

Using backend: pytorch


In [2]:
# Load the three preprocessed tables (companies, investors, and the
# investor -> company relations) from CSV files in the working directory.
# The comma separator is pandas' default, so it is not spelled out.
processed_company_data = pd.read_csv('processed_company_data.csv')
processed_investor_data = pd.read_csv('processed_investor_data.csv')
processed_relation_data = pd.read_csv('processed_relation_data.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Get the relations for the graph

In [3]:
# Resolve every (InvestorID, CompanyID) relation to the row-index labels that are
# used as node IDs when the graph is built.
#
# The ID -> index lookups are built once up front, so each relation is resolved
# with constant-time dictionary accesses instead of scanning both DataFrames
# for every single relation.

def _first_index_by_id(frame, id_column):
    """Map each value of ``frame[id_column]`` to the index label of its first occurrence.

    Parameters:
        frame: DataFrame holding the ID column.
        id_column: name of the column whose values become the dictionary keys.

    Returns:
        dict mapping ID value -> index label of the first row carrying that ID.
    """
    index_by_id = {}
    for index_label, identifier in zip(frame.index, frame[id_column]):
        # setdefault keeps the first match, mirroring `.index.tolist()[0]`.
        index_by_id.setdefault(identifier, index_label)
    return index_by_id


investor_index_by_id = _first_index_by_id(processed_investor_data, 'InvestorID')
company_index_by_id = _first_index_by_id(processed_company_data, 'CompanyID')

tuples_relations = list(zip(processed_relation_data.InvestorID, processed_relation_data.CompanyID))

different_relation = []    # investor -> company edges (target found in the company table)
same_relation = []         # investor -> investor edges (target only found in the investor table)
unresolved_relations = []  # raw ID pairs for which no node could be found

for source_key, target_key in tuples_relations:
    investor_id = investor_index_by_id.get(source_key)
    if investor_id is None:
        unresolved_relations.append((source_key, target_key))
        continue

    company_id = company_index_by_id.get(target_key)
    if company_id is not None:
        different_relation.append((investor_id, company_id))
        continue

    # The target is not in the company dataset, so it is looked up as an
    # investor/company ID instead.
    company_id = investor_index_by_id.get(target_key)
    if company_id is None:
        unresolved_relations.append((source_key, target_key))
        continue
    same_relation.append((investor_id, company_id))

# An unknown ID used to fail with a bare "list index out of range"; keep the same
# exception type but report which relations could not be resolved.
if unresolved_relations:
    raise IndexError(
        f"{len(unresolved_relations)} relation(s) reference an ID that is missing from "
        f"the investor/company tables, e.g. {unresolved_relations[:5]}"
    )

In [4]:
# Swap source and target of every edge so the graph also gets the
# reverse-direction ("invested_by") edge lists.
transposed_different_relation = [(target, source) for source, target in different_relation]
transposed_same_relation = [(target, source) for source, target in same_relation]

# Create the graph

In [5]:
# Every edge list handed to the graph is a list of (source, target) tuples.
# Regular relations look like [(investor, company), (investor, company), ...].
# When the target of an investment is itself an investor, both ends belong to
# the 'investor/company' node type:
# [(investor/company, investor/company), ...].
# Keys are (source node type, edge type, destination node type).
# NOTE(review): no explicit node counts are passed, so DGL presumably infers
# them from the IDs that appear in the edge lists -- confirm rows without any
# relation are not needed as nodes.
edge_lists_by_relation = {
    ('investor/company', 'different_invests_in', 'company'): different_relation,
    ('investor/company', 'same_invests_in', 'investor/company'): same_relation,
    ('company', 'different_invested_by', 'investor/company'): transposed_different_relation,
    ('investor/company', 'same_invested_by', 'investor/company'): transposed_same_relation,
}

g = dgl.heterograph(edge_lists_by_relation)

print(g)

Graph(num_nodes={'company': 9779, 'investor/company': 7883},
      num_edges={('company', 'different_invested_by', 'investor/company'): 50087, ('investor/company', 'different_invests_in', 'company'): 50087, ('investor/company', 'same_invested_by', 'investor/company'): 5102, ('investor/company', 'same_invests_in', 'investor/company'): 5102},
      metagraph=[('company', 'investor/company', 'different_invested_by'), ('investor/company', 'company', 'different_invests_in'), ('investor/company', 'investor/company', 'same_invested_by'), ('investor/company', 'investor/company', 'same_invests_in')])


In [6]:
# Summary statistics of the heterograph: its node/edge types, the node count
# per type, the edge count of the two forward relations, and the total edge count.
graph_statistics = [
    ('Node types:', g.ntypes),
    ('Edge types:', g.etypes),
    ('Canonical edge types:', g.canonical_etypes),
    ('The amount of companies in our graph:', g.number_of_nodes('company')),
    ('The amount of investor/company nodes in our graph:', g.number_of_nodes('investor/company')),
    ('The amount of different relations:',
     g.number_of_edges(('investor/company', 'different_invests_in', 'company'))),
    ('The amount of same relations:',
     g.number_of_edges(('investor/company', 'same_invests_in', 'investor/company'))),
    ('Total number of edges', g.number_of_edges()),
]

for label, value in graph_statistics:
    print(label, value)

Node types: ['company', 'investor/company']
Edge types: ['different_invested_by', 'different_invests_in', 'same_invested_by', 'same_invests_in']
Canonical edge types: [('company', 'different_invested_by', 'investor/company'), ('investor/company', 'different_invests_in', 'company'), ('investor/company', 'same_invested_by', 'investor/company'), ('investor/company', 'same_invests_in', 'investor/company')]
The amount of companies in our graph: 9779
The amount of investor/company nodes in our graph: 7883
The amount of different relations: 50087
The amount of same relations: 5102
Total number of edges 110378


In [7]:
# Save the graph (currently disabled -- uncomment the line below to write `g` to the file "dgl_graph")
# save_graphs("dgl_graph", g)