In [None]:
import json
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from torch_geometric.data import Data


In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Paths of the two JSON exports that are read in the following cells.
companies_file_path = os.getenv('COMPANIES_JSON_FILE_PATH')
links_file_path = os.getenv('LINKS_JSON_FILE_PATH')

# Fail fast: os.getenv returns None for an unset variable, which would otherwise
# only surface later as a less obvious error when the path is used.
_missing_env = [
    env_name
    for env_name, env_value in (
        ('COMPANIES_JSON_FILE_PATH', companies_file_path),
        ('LINKS_JSON_FILE_PATH', links_file_path),
    )
    if not env_value
]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

In [None]:
# Companies.json -> https://query.data.world/s/k7ib3ovtm7w5aseritn4sdv6bqrhwe?dws=00000
# Read the file as newline-delimited JSON (lines=True: one record per line).
companies_json = pd.read_json(companies_file_path, lines=True)

In [None]:
# Links.json -> https://query.data.world/s/x2qsj7pr75w7ouieifaf3whaiw4cbb?dws=00000
# Read the file as newline-delimited JSON (lines=True: one record per line).
links_json = pd.read_json(links_file_path, lines=True)

In [None]:
# Preview the first rows of the raw companies data.
companies_json.head()

In [None]:
# Preview the first rows of the raw links data.
links_json.head()

In [None]:
# pd.read_json already returns DataFrames, so take explicit copies for the working
# frames. pd.DataFrame(df) does not copy the data, which (depending on the pandas
# version) lets later in-place edits leak back into the raw frames.
companies_df = companies_json.copy()
links_df = links_json.copy()

In [None]:
# Re-loading the .env file keeps this cell runnable on its own.
load_dotenv()
sp500_csv_file_path = os.getenv('SP500_CSV_FILE_PATH')

# Fail fast with a clear message: os.getenv returns None when the variable is unset.
if not sp500_csv_file_path:
    raise EnvironmentError("Missing required environment variable: SP500_CSV_FILE_PATH")

# S&P 500 constituents; later cells read the 'Name', 'Symbol' and 'Sector' columns.
sp500_df = pd.read_csv(sp500_csv_file_path)

In [None]:
# Preview the first rows of the S&P 500 constituents table.
sp500_df.head()

In [None]:
# FILTER: keep only the companies (and the links between them) whose name appears
# in the S&P 500 list. Names are compared case-insensitively.

sp500_names_lower = sp500_df['Name'].str.lower()

# .copy() makes the result an independent frame, so the column assignments done in
# later cells neither raise SettingWithCopyWarning nor write to a temporary.
sp500_companies = companies_df[
    companies_df['name'].str.lower().isin(sp500_names_lower)
].copy()

# Lower-case the matched names once instead of once per mask.
matched_names_lower = sp500_companies['name'].str.lower()

# A link is kept only when BOTH of its endpoints are matched companies.
sp500_links = links_df[
    links_df['home_name'].str.lower().isin(matched_names_lower)
    & links_df['link_name'].str.lower().isin(matched_names_lower)
].copy()

In [None]:
# Number of companies.json rows whose name matched an S&P 500 name.
len(sp500_companies)

In [None]:
# Preview the matched companies.
sp500_companies.head()

In [None]:
# Number of link rows whose two endpoints are both matched companies.
len(sp500_links)

In [None]:
# Normalise both name columns to lower case (in place) so the comparison below is
# case-insensitive.
companies_df['name'] = companies_df['name'].str.lower()
sp500_df['Name'] = sp500_df['Name'].str.lower()

# S&P500 names that have no counterpart in companies.json
sp500_has_match = sp500_df['Name'].isin(companies_df['name'])
missing_names_sp500 = sp500_df.loc[~sp500_has_match, 'Name']

# companies.json names that have no counterpart in the S&P500 dataset
company_has_match = companies_df['name'].isin(sp500_df['Name'])
missing_names_companies = companies_df.loc[~company_has_match, 'name']

# Show both result sets for inspection
print("Missing names in S&P500 dataset:", missing_names_sp500)
print("Missing names in companies.json:", missing_names_companies)

In [None]:
# Sanity check: show the Python type of the filtered companies object.
type(sp500_companies)

In [None]:
# Create a directed graph
graph = nx.DiGraph()

# Add nodes (companies) to the graph with features.
# Nodes are keyed by lower-cased company name because the links table identifies its
# endpoints by name ('home_name' / 'link_name'). Keying nodes by the '_id' value
# instead made every edge create two new attribute-less nodes, leaving the
# attributed company nodes without any edges.
for _, company in sp500_companies.iterrows():
    node_id = str(company["name"]).lower()
    features = {
        "id": str(company["_id"]["$oid"]),  # original document id, kept as an attribute
        "name": company["name"],
        "domain": company["domain"],
    }
    graph.add_node(node_id, **features)

# Add edges (links) to the graph, using the same lower-cased name keys as the nodes.
for _, link in sp500_links.iterrows():
    home_id = str(link["home_name"]).lower()
    link_id = str(link["link_name"]).lower()
    graph.add_edge(home_id, link_id, type=link["type"])

In [None]:
# Visualize the graph
# spring_layout starts from random positions; a fixed seed makes the figure
# reproducible across kernel restarts.
pos = nx.spring_layout(graph, seed=42)
nx.draw(
    graph,
    pos,
    with_labels=False,
    font_size=8,
    node_size=2,
    node_color="skyblue",
    font_color="black",
    font_weight="bold",
    edge_color="gray",
    arrowsize=10,
    connectionstyle="arc3,rad=0.1",
)
plt.title("Company Network")
plt.show()

In [None]:
# Inspect the first 50 matched companies.
sp500_companies.head(50)

In [None]:
# Inspect the first 50 S&P 500 rows.
sp500_df.head(50)

In [None]:
# Lower-case both join keys so the merge is case-insensitive.
sp500_df['Name'] = sp500_df['Name'].str.lower()

# sp500_companies was produced by boolean filtering, so assigning a column on it in
# place can raise SettingWithCopyWarning. Rebinding the name to the frame returned
# by .assign avoids that while still leaving lower-cased names for later cells.
sp500_companies = sp500_companies.assign(name=sp500_companies['name'].str.lower())

# Left join: every matched company is kept and gains the S&P 500 columns.
# The right-hand key column 'Name' duplicates 'name', so it is dropped.
sp500_merged = sp500_companies.merge(
    sp500_df, left_on='name', right_on='Name', how='left'
).drop(columns='Name')


In [None]:
# Inspect the first 50 rows of the merged companies / S&P 500 table.
sp500_merged.head(50)

In [None]:
# Row count after the merge.
len(sp500_merged)

In [None]:
# Preview the filtered links.
sp500_links.head()

In [None]:
companies = []
links = []
i = 0
j = 0

G = nx.Graph()
for _, company in sp500_merged.iterrows():
    companies.append(company['name'])
    i += 1
    node_id = company['name']
    features = {'name': company['name'], 'symbol': company['Symbol'], 'sector': company['Sector']}
    G.add_node(node_id, **features)

for _, link in sp500_links.iterrows():
    links.append(link['home_name'])
    j += 1
    home_id = link["home_name"]
    link_id = link["link_name"]
    G.add_edge(home_id, link_id, type=link["type"])

print(len(companies))
print(len(links))
print(i, j)
print(G.number_of_nodes())

pos = nx.spiral_layout(G)
nx.draw(G, pos, with_labels=True, font_size=5, node_size=2, node_color="skyblue", font_color="black", font_weight="bold", edge_color="gray")
plt.title("Company Network")
plt.show()

In [None]:
# Keep a single row per company name.
sp500_merged = sp500_merged.drop_duplicates(subset=['name'])
# Drop repeats of the same (home, link, type) triple; a pair of companies can still
# appear more than once if the rows differ in 'type'.
sp500_links = sp500_links.drop_duplicates(subset=['home_name', 'link_name', 'type'])

In [None]:
G = nx.Graph()

# Nodes: one per merged company row, keyed by name.
companies = sp500_merged['name'].tolist()
node_records = zip(sp500_merged['name'], sp500_merged['Symbol'], sp500_merged['Sector'])
G.add_nodes_from(
    (name, {'name': name, 'symbol': symbol, 'sector': sector})
    for name, symbol, sector in node_records
)
i = len(companies)

# Edges: one per link row, with the link type stored under 'type'.
links = sp500_links['home_name'].tolist()
edge_records = zip(sp500_links["home_name"], sp500_links["link_name"], sp500_links["type"])
G.add_edges_from(
    (home, target, {'type': kind})
    for home, target, kind in edge_records
)
j = len(links)

# Diagnostics: rows processed versus nodes actually present in the graph.
print(len(companies))
print(len(links))
print(i, j)
print(G.number_of_nodes())

pos = nx.spiral_layout(G)
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=5,
    node_size=2,
    node_color="skyblue",
    font_color="black",
    font_weight="bold",
    edge_color="gray",
)
plt.title("Company Network")
plt.show()

In [None]:
# Start a fresh undirected graph that contains only the company nodes
# (no attributes and no edges yet), keyed by company name.
# The previous version also appended every name to a `companies` list left over
# from an earlier cell; that list is never read afterwards, grew on every re-run,
# and made this cell fail when run without the earlier one.
G = nx.Graph()
G.add_nodes_from(sp500_merged['name'])

In [None]:
# random_layout places nodes at random; a fixed seed makes the figure reproducible.
pos = nx.random_layout(G, seed=42)
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=5,
    node_size=2,
    node_color="skyblue",
    font_color="black",
    font_weight="bold",
    edge_color="gray",
)
plt.title("Company Network")
plt.show()

In [None]:
# Node count before any edges are added.
G.number_of_nodes()

In [None]:
# Add one edge per link row to the existing graph; the link type is stored as the
# 'type' edge attribute (a later row for the same pair overwrites it).
G.add_edges_from(
    (home, target, {"type": kind})
    for home, target, kind in zip(
        sp500_links["home_name"], sp500_links["link_name"], sp500_links["type"]
    )
)

In [None]:
plt.figure(figsize=(10, 10))

# random_layout places nodes at random; a fixed seed makes the figure reproducible.
pos = nx.random_layout(G, seed=42)
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=5,
    node_size=20,
    node_color="skyblue",
    font_color="black",
    font_weight="bold",
    edge_color="gray",
    width=0.1,
)
plt.title("Company Network")
plt.show()

In [None]:
plt.figure(figsize=(12, 12))

# spring_layout starts from random positions; a fixed seed makes the figure
# reproducible. k sets the preferred distance between nodes.
pos = nx.spring_layout(G, k=4, seed=42)
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=5,
    node_size=100,
    node_color="skyblue",
    font_color="black",
    font_weight="bold",
    edge_color="gray",
    width=0.1,
)
plt.title("Company Network")

plt.show()



In [None]:
# Current number of link rows.
len(sp500_links)

In [None]:
# Lower-case both endpoint columns so they can be compared with the lower-cased
# company names. sp500_links is derived from a filtered frame, so assigning columns
# on it in place can raise SettingWithCopyWarning; rebinding to the frame returned
# by .assign avoids that.
sp500_links = sp500_links.assign(
    home_name=sp500_links['home_name'].str.lower(),
    link_name=sp500_links['link_name'].str.lower(),
)

In [None]:
# Names that a link endpoint is allowed to refer to.
company_names = sp500_companies['name']

# Keep a link only when both of its endpoints are known company names.
home_is_known = sp500_links['home_name'].isin(company_names)
target_is_known = sp500_links['link_name'].isin(company_names)
filtered_links = sp500_links.loc[home_is_known & target_is_known]

print(len(filtered_links))

In [None]:
# Rebuild the undirected graph from the merged companies and the filtered links.
# The previous version also appended each name to a leftover `companies` list that
# is never read; that made the cell depend on an earlier cell and grow the list on
# every re-run, so it has been removed.
G = nx.Graph()

# Nodes: keyed by company name, carrying name / symbol / sector attributes.
for _, company in sp500_merged.iterrows():
    node_id = company['name']
    features = {'name': company['name'], 'symbol': company['Symbol'], 'sector': company['Sector']}
    G.add_node(node_id, **features)

# Edges: one per filtered link row; a later row for the same pair overwrites 'type'.
for _, link in filtered_links.iterrows():
    home_id = link["home_name"]
    link_id = link["link_name"]
    G.add_edge(home_id, link_id, type=link["type"])


In [None]:
# Size of the graph built from the filtered links: node count, then edge count.
print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
plt.figure(figsize=(10, 10))

# random_layout places nodes at random; a fixed seed makes the figure reproducible.
pos = nx.random_layout(G, seed=42)
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=5,
    node_size=20,
    node_color="skyblue",
    font_color="black",
    font_weight="bold",
    edge_color="gray",
    width=0.1,
)
plt.title("Company Network")
plt.show()

In [None]:
# Flag every row whose (home_name, link_name) pair already occurred in an earlier row,
# regardless of 'type', then count the flagged rows.
duplicates = filtered_links.duplicated(subset=['home_name', 'link_name'])
print(duplicates.sum())

There are 1660 pairs that are connected by different types of links. What would the solution be?
1. Create a list of types for each edge and add it as an attribute.

In [None]:
# Display the merged, de-duplicated companies table.
sp500_merged

In [None]:
# Build the undirected company graph where each edge carries a 'types' list holding
# every distinct link type seen between its two companies.
G = nx.Graph()

# Nodes: keyed by company name, carrying name / symbol / sector attributes.
for _, company in sp500_merged.iterrows():
    node_id = company['name']
    features = {'name': company['name'], 'symbol': company['Symbol'], 'sector': company['Sector']}
    G.add_node(node_id, **features)

for _, link in filtered_links.iterrows():
    home_id = link["home_name"]
    link_id = link["link_name"]
    relationship_type = link["type"]

    if G.has_edge(home_id, link_id):
        # Existing edge: record the type only if it is new. The two checks must be
        # nested; when combined with `and`, an already-recorded type fell through
        # to the `else` branch and add_edge replaced the accumulated list with a
        # single-element list. That case occurs because the graph is undirected,
        # so (a, b, t) and (b, a, t) rows address the same edge.
        edge_types = G[home_id][link_id]['types']
        if relationship_type not in edge_types:
            edge_types.append(relationship_type)
    else:
        # First time this pair is seen: create the edge with a one-element list.
        G.add_edge(home_id, link_id, types=[relationship_type])


In [None]:
# Size of the graph with per-edge type lists: node count, then edge count.
print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
plt.figure(figsize=(10, 10))

# random_layout places nodes at random; a fixed seed makes the figure reproducible.
pos = nx.random_layout(G, seed=42)
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=5,
    node_size=20,
    node_color="skyblue",
    font_color="black",
    font_weight="bold",
    edge_color="gray",
    width=0.1,
)
plt.title("Company Network")
plt.show()

In [None]:
# Dump every node together with its attribute dict
print("Nodes:")
for name, node_attrs in G.nodes(data=True):
    print(f"Node {name}: {node_attrs}")

# Dump every edge (as an endpoint pair) together with its attribute dict
print("\nEdges:")
for source, target, edge_attrs in G.edges(data=True):
    print(f"Edge {(source, target)}: {edge_attrs}")

In [None]:
# numpy (np) is imported in the notebook's top import cell rather than mid-notebook.

# Fix the node order once so that row/column k of both matrices refers to the same
# node, instead of relying on two separate calls iterating in the same order.
node_order = list(G.nodes)

# Diagonal matrix of node degrees.
degree_matrix = np.diag([G.degree(node) for node in node_order])
# Adjacency matrix in the same node order.
adjacency_matrix = nx.to_numpy_array(G, nodelist=node_order)

# Element-wise sum of the two matrices (degree on the diagonal, adjacency elsewhere).
combined_matrix = degree_matrix + adjacency_matrix

print("Degree Matrix:")
print(degree_matrix)

print("\nAdjacency Matrix:")
print(adjacency_matrix)

print("\nCombined Matrix:")
print(combined_matrix)

In [None]:
# Render each matrix as a heat map in its own figure. matplotlib.pyplot is already
# imported as plt in the notebook's import cell, so it is not re-imported here, and
# the two identical plotting sequences are collapsed into one loop.
for matrix_title, matrix in (
    ('Degree Matrix', degree_matrix),
    ('Adjacency Matrix', adjacency_matrix),
):
    plt.figure(figsize=(8, 8))
    plt.imshow(matrix, cmap='viridis', interpolation='none')
    plt.title(matrix_title)
    plt.colorbar()
    plt.show()