In [None]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv
import os

In [None]:
# Read the dataset locations from the environment (optionally via a .env file).
load_dotenv()
companies_file_path = os.getenv('COMPANIES_JSON_FILE_PATH')
links_file_path = os.getenv('LINKS_JSON_FILE_PATH')

# os.getenv returns None for an unset variable; stop here with a clear message
# instead of letting the loaders below fail on a None path.
if not companies_file_path or not links_file_path:
    raise EnvironmentError(
        "COMPANIES_JSON_FILE_PATH and LINKS_JSON_FILE_PATH must both be set "
        "(in the environment or in a .env file)."
    )

In [None]:
# Source of Companies.json:
# https://query.data.world/s/k7ib3ovtm7w5aseritn4sdv6bqrhwe?dws=00000
# The file is read as JSON Lines (one record per line).
companies_json = pd.read_json(path_or_buf=companies_file_path, lines=True)

In [None]:
# Source of Links.json:
# https://query.data.world/s/x2qsj7pr75w7ouieifaf3whaiw4cbb?dws=00000
# The file is read as JSON Lines (one record per line).
links_json = pd.read_json(path_or_buf=links_file_path, lines=True)

In [None]:
# Preview the first five company records.
companies_json.head(n=5)

In [None]:
# Preview the first five link records.
links_json.head(n=5)

In [None]:
# Working DataFrame handles for the two loaded datasets.
companies_df, links_df = pd.DataFrame(data=companies_json), pd.DataFrame(data=links_json)

In [None]:
# Read the S&P 500 constituents CSV whose location comes from the environment.
load_dotenv()
sp500_csv_file_path = os.getenv('SP500_CSV_FILE_PATH')

# os.getenv returns None for an unset variable; stop here with a clear message
# instead of handing None to pd.read_csv.
if not sp500_csv_file_path:
    raise EnvironmentError(
        "SP500_CSV_FILE_PATH must be set (in the environment or in a .env file)."
    )

sp500_df = pd.read_csv(sp500_csv_file_path)

In [None]:
# Preview the first five S&P 500 rows.
sp500_df.head(n=5)

In [None]:
# FILTER
# Keep only companies whose name matches an S&P 500 name (case-insensitive),
# then keep only links whose two endpoints are both among those companies.

sp500_names_lower = sp500_df['Name'].str.lower()
is_sp500_company = companies_df['name'].str.lower().isin(sp500_names_lower)
sp500_companies = companies_df[is_sp500_company]

matched_names_lower = sp500_companies['name'].str.lower()
home_matches = links_df['home_name'].str.lower().isin(matched_names_lower)
link_matches = links_df['link_name'].str.lower().isin(matched_names_lower)
sp500_links = links_df[home_matches & link_matches]

In [None]:
# Number of companies matched to the S&P 500 list.
sp500_companies.shape[0]

In [None]:
# Preview the first five matched companies.
sp500_companies.head(n=5)

In [None]:
# Number of links whose endpoints are both matched companies.
sp500_links.shape[0]

In [None]:
# Lower-case both name columns so the join below is case-insensitive.
# `assign` builds new frames instead of writing a column into `sp500_companies`
# (a boolean-mask slice of `companies_df`), which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
sp500_df = sp500_df.assign(Name=sp500_df['Name'].str.lower())
sp500_companies = sp500_companies.assign(name=sp500_companies['name'].str.lower())

# Left join keeps every matched company and attaches its S&P 500 columns;
# the redundant right-hand key column is dropped afterwards.
sp500_merged = (
    sp500_companies
    .merge(sp500_df, left_on='name', right_on='Name', how='left')
    .drop(columns='Name')
)


In [None]:
# Inspect the first fifty merged rows.
sp500_merged.head(n=50)

In [None]:
# Row count after the merge.
sp500_merged.shape[0]

In [None]:
# Preview the first five filtered links.
sp500_links.head(n=5)

In [None]:
# Drop Duplicates
# Companies are unique by name; links are unique by (home, link, type).
# The first occurrence of each key is the one that is kept.

link_key_columns = ['home_name', 'link_name', 'type']
sp500_merged = sp500_merged.drop_duplicates(subset=['name'], keep='first')
sp500_links = sp500_links.drop_duplicates(subset=link_key_columns, keep='first')

In [None]:
# Lower-case both endpoint names so they compare equal to the lower-cased
# company names. `assign` returns a new frame rather than writing columns into
# a frame derived from a slice of `links_df`, which avoids pandas'
# SettingWithCopyWarning.
sp500_links = sp500_links.assign(
    home_name=sp500_links['home_name'].str.lower(),
    link_name=sp500_links['link_name'].str.lower(),
)

In [None]:
# Keep only the links whose two endpoints both appear among the company names.
company_names = sp500_companies['name']

home_is_known = sp500_links['home_name'].isin(company_names)
link_is_known = sp500_links['link_name'].isin(company_names)
filtered_links = sp500_links[home_is_known & link_is_known]

print(len(filtered_links))

In [None]:
# Count rows that repeat an earlier (home_name, link_name) pair, whatever their type.
pair_columns = ['home_name', 'link_name']
duplicates = filtered_links.duplicated(subset=pair_columns)
print(duplicates.sum())

There are 1660 pairs that are connected by links of different types. How should they be handled?
1. Collect the link types of each pair into a list and store that list as an attribute of the pair's edge.

In [None]:
# Show the merged company table after duplicate names were dropped.
sp500_merged

In [None]:
# Preview the first five links that survived filtering.
filtered_links.head(n=5)

In [None]:
# Sanity check: print the ticker symbols at both ends of every link.
# The name -> symbol mapping is built once instead of scanning `sp500_merged`
# with a boolean mask twice per link. Keeping the first row per name matches
# the previous `.values[0]` behaviour.
_unique_companies = sp500_merged.drop_duplicates(subset=['name'])
symbol_by_name = dict(zip(_unique_companies['name'], _unique_companies['Symbol']))

for home_name, link_name in zip(filtered_links['home_name'], filtered_links['link_name']):
    home_symbol = symbol_by_name[home_name]
    link_symbol = symbol_by_name[link_name]

    print("Home_SYMBOL: ", home_symbol)
    print("Link_SYMBOL: ", link_symbol)

In [None]:
# Build the company graph. nx.Graph is undirected and holds at most one edge
# per node pair, so every link between the same two companies (in either
# direction) ends up on the same edge.
G = nx.Graph()

# One node per company, keyed by ticker symbol.
for company_name, symbol, sector in zip(
    sp500_merged['name'], sp500_merged['Symbol'], sp500_merged['Sector']
):
    G.add_node(symbol, name=company_name, symbol=symbol, sector=sector)

# Name -> symbol mapping built once, replacing two full boolean-mask scans of
# `sp500_merged` per link. The first row per name wins, as `.values[0]` did.
_unique_companies = sp500_merged.drop_duplicates(subset=['name'])
symbol_by_name = dict(zip(_unique_companies['name'], _unique_companies['Symbol']))

for home_name, link_name, relationship_type in zip(
    filtered_links['home_name'], filtered_links['link_name'], filtered_links['type']
):
    home_id = symbol_by_name[home_name]
    link_id = symbol_by_name[link_name]

    if G.has_edge(home_id, link_id):
        # Every edge is created with a 'types' list below, so it always exists
        # here; record each relationship type once, in first-seen order.
        edge_types = G[home_id][link_id]['types']
        if relationship_type not in edge_types:
            edge_types.append(relationship_type)
    else:
        G.add_edge(home_id, link_id, types=[relationship_type])

# Convert each 'types' list to a single comma-separated string.
for _, _, edge_attrs in G.edges(data=True):
    edge_attrs['types'] = ', '.join(edge_attrs['types'])

In [None]:
# Node count on the first line, edge count on the second.
print(G.number_of_nodes(), G.number_of_edges(), sep="\n")

In [None]:
# Draw the network with randomly placed nodes.
plt.figure(figsize=(15, 15))

# A fixed seed makes the random positions (and so the figure) reproducible
# across kernel restarts.
pos = nx.random_layout(G, seed=42)
nx.draw_networkx(G, pos, with_labels=True, font_size=5, node_size=20, node_color="skyblue", font_color="black", font_weight="bold", edge_color="gray", width=0.1)
plt.title("Company Network")
plt.show()

In [None]:
# Dump every node with its attribute dict.
print("Nodes:")
for node_id, node_attrs in G.nodes(data=True):
    print(f"Node {node_id}: {node_attrs}")

# Dump every edge as its (u, v) endpoint tuple followed by its attribute dict.
print("\nEdges:")
for u, v, edge_attrs in G.edges(data=True):
    print(f"Edge {(u, v)}: {edge_attrs}")

In [None]:
import numpy as np

# Diagonal matrix of node degrees, listed in G's node iteration order.
node_degrees = [degree for _node, degree in G.degree()]
degree_matrix = np.diag(node_degrees)

# Dense adjacency matrix of G.
adjacency_matrix = nx.to_numpy_array(G)

# Element-wise sum of the two matrices.
combined_matrix = degree_matrix + adjacency_matrix

for heading, matrix in (
    ("Degree Matrix:", degree_matrix),
    ("\nAdjacency Matrix:", adjacency_matrix),
    ("\nCombined Matrix:", combined_matrix),
):
    print(heading)
    print(matrix)

In [None]:
import matplotlib.pyplot as plt

# Render each matrix as its own 8x8 heat map with a colour bar.
for heatmap_title, heatmap_data in (
    ('Degree Matrix', degree_matrix),
    ('Adjacency Matrix', adjacency_matrix),
):
    plt.figure(figsize=(8, 8))
    plt.imshow(heatmap_data, cmap='viridis', interpolation='none')
    plt.title(heatmap_title)
    plt.colorbar()
    plt.show()

In [None]:
# Export the graph as GraphML. The output directory is created first so the
# write does not fail on a checkout where ../Graphs does not exist yet.
graphml_path = "../Graphs/relato_graph.graphml"
os.makedirs(os.path.dirname(graphml_path), exist_ok=True)
nx.write_graphml_lxml(G, graphml_path)

In [None]:
# Draw the network with a force-directed (spring) layout.
plt.figure(figsize=(15, 15))

# spring_layout starts from random positions; a fixed seed makes the figure
# reproducible across kernel restarts.
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx(G, pos, with_labels=True, font_size=5, node_size=20, node_color="skyblue", font_color="black", font_weight="bold", edge_color="gray", width=0.1)
plt.title("Company Network")
plt.show()