In [12]:
pip install networkx



In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from operator import itemgetter

**Importing nodes and edges lists**
  
  Here we are using the full graph 




In [33]:
nodes_df = pd.read_csv('words_with_ratings5.csv', low_memory=False)
edges_df = pd.read_csv('Lemmas_Edgeslist.csv', low_memory=False)

**Creating NetworkX Graph object**

In [34]:
#Creating a NetworkX Graph
G = nx.Graph()

In [35]:
G = nx.from_pandas_edgelist(edges_df, 'source', 'target', ['weight'])	

nodes = pd.read_csv('words_with_ratings5.csv')
data = nodes.set_index('word').to_dict('index').items()

G.add_nodes_from(data)
print(G.nodes(data=True))
print(G.edges(data=True))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Printing some Graph information

In [36]:
#Checking if the edges and nodes were added successfully
print(nx.info(G)) # Print information about the Graph

Name: 
Type: Graph
Number of nodes: 39364
Number of edges: 2827611
Average degree: 143.6648


###Some Network Statistics below can be done using Gephi while others are more flexible to try in Python

####**PART ONE:** Metrics Related to the network structure (Density, Transitivity)



**DENSITY**

In [37]:
#Overall Density of the network
density = nx.density(G)
print("Network density:", density)

Network density: 0.0036497426681987677


This **Density** value just means that our network is weakly connected, this is predictable since we are dealing with a huge network.

**TRANSITIVITY**

In [None]:
#Overall Density of the network
transitivity = nx.transitivity(G)
print("Network transitivity:", transitivity)

**Transitivity** is just a way to measure triadic closure, teh value we get tells us how the nodes are interconnected. This suggest that we should look more into the importance of each node(degrees, centrality, page rank etc..) using the metrics in PART 2. 

####**PART TWO:** Metrics Related to the importance of each node
In this section we add new attributes to each node depending on its centrality, degree...

**DEGREES** : Create **degrees dictionary**, mapping each word to their respective degrees. 

In [24]:
degree_dict = dict(G.degree(G.nodes()))
nx.set_node_attributes(G, degree_dict, 'degree')
#It is important to set degree as an attribute of node alongside the other attributes 

In [27]:
#Sorting in desceding order (this will be important if we want to use nodes removal)
sorted_degree = sorted(degree_dict.items(), key=itemgetter(1), reverse=True)

In [29]:
print("Top 20 nodes by degree:")
for d in sorted_degree[:20]:
    print(d)

Top 20 nodes by degree:
('fare', 10428)
('anno', 10109)
('politico', 9649)
('partire', 9622)
('italia', 9614)
('sempre', 9538)
('italiano', 9406)
('quando', 8998)
('potere', 8830)
('parlare', 8672)
('dire', 8096)
('lavorare', 8077)
('salvini', 8039)
('governare', 7989)
('persona', 7841)
('bene', 7483)
('votare', 7481)
('andare', 7253)
('paese', 7236)
('mettere', 7224)


We see what hubs we have: 'fare' ; 'anno' ; 'politico'...

**Betweeness Centrality Dictionary**

  

It is quite informative to see what is the most central node in the network. This measure is more relevant because it doesn’t care about the number of edges any one node or set of nodes has. Betweenness centrality looks at all the shortest paths that pass through a particular node.

TAKES LOTS OF TIME TO RUN, calculating every possible shortest path in the network is exhaustive.

In [None]:
#Inspecting the centrality of each node --> important if we want to implement nodes removal
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
nx.set_node_attributes(G, betweenness_dict, 'betweenness')

In [None]:
sorted_betweenness = sorted(betweenness_dict.items(), key=itemgetter(1), reverse=True)
print("Top 20 nodes by betweenness centrality:")
for b in sorted_betweenness[:20]:
    print(b)

Well, it is very expected that nodes with high degrees would have high centrality but it would be more interesting to see if there are any noes with low degrees and high centrality. Below we append both attributes to investigate this hypothesis.

In [None]:
#First get the top 20 nodes by betweenness as a list
top_betweenness = sorted_betweenness[:20]

#Then find and print their degree
for tb in top_betweenness: # Loop through top_betweenness
    degree = degree_dict[tb[0]] # Use degree_dict to access a node's degree,
    print("Name:", tb[0], "| Betweenness Centrality:", tb[1], "| Degree:", degree)

###PART 3: Nodes Removal 
In this section we try to remove a set of nodes one by one to see how it affects the robustness of the network ( ofc we can use the reduced networks to compare which one disconnectes faster. 

Here we can consider just the Hate comments subgraph and start removing the nodes with highest degrees first, then we do the same for non hate speech comments. 

In [39]:
#Select Subgraph based on Negative Emotions for example
hate_comments_words = [n for n,v in G.nodes(data=True) if v.get("hate",None) != 0]  
#print (hate_comments_words) #Americano is a positive emotion lol
print( len(hate_comments_words))

25022


In [40]:
nhate_comments_words = [n for n,v in G.nodes(data=True) if v.get("hate",None) == 0]  
#print (nhate_comments_words) #Americano is a positive emotion lol
print( len(nhate_comments_words))

14342


In [None]:
#Creating Subgraphs: 1- Hate comments 2- Non hate comments
hateG = G.subgraph(hate_comments_words)
nhateG = G.subgraph(nhate_comments_words)

In [None]:
#Here we select the nodes of hate comments subgraph and sort them by degree
from operator import itemgetter
degree_dict = dict(hateG.degree(hateG.nodes()))
nx.set_node_attributes(hateG, degree_dict, 'degree')
sorted_degree_hateG= sorted(degree_dict.items(), key=itemgetter(1), reverse=True)
selected_nodes = list(sorted_degree_hateG)[:20] #20 nodes to be removed just for simplicity you can choose any number
#TOP 20 nodes to remove
nodes_removed = []
for i,v in enumerate(selected_nodes): 
  nodes_removed.append(selected_nodes[i][0])

In [None]:
#Node Removal, for robustness, removing 1 by 1 starting by nodes with highest degrees
G1 = hateG.copy()
list_transitivity = [] #We can check density too
#list_degree = []
#diam_list =[]
nodes_rem = []
for i, node in enumerate(nodes_removed):
  G1.remove_node(node)
  #diam_list.append(nx.diameter(G1.to_undirected()) ) #returns an error 
  list_transitivity.append(nx.transitivity(G1)) #we can use it to measure robustness
  nodes_rem.append(i+1)

In [None]:
plt.plot(nodes_rem, list_transitivity, 'r-')
#plt.plot(nodes_removed, diam_list , 'b-')
plt.ylabel("transitivity")
plt.xlabel("Removed nodes")
plt.show()

**The above code can be repeated on another subset of data**