In [1]:
import pandas as pd
import numpy as np

import networkx as nx
import itertools
from itertools import combinations

import statsmodels
import statsmodels.api as sm
import scipy.stats as stats

import matplotlib.pyplot as plt

In [2]:
###Import the clean tokens csv file
clean_words = pd.read_csv('all_tokens.csv')
df = pd.DataFrame(data = clean_words)
df.head()

Unnamed: 0.1,Unnamed: 0,text_nlp
0,0,dire gente guerra portata casa rimpatriati
1,1,marcello perfavore
2,2,patria patrioti difesa proprie radici soccombe...
3,3,musulmani comandare casa
4,4,odio dipende odio comandato libro


In [3]:
##rename the columns
df.columns = ["id","words"]
df.head()

Unnamed: 0,id,words
0,0,dire gente guerra portata casa rimpatriati
1,1,marcello perfavore
2,2,patria patrioti difesa proprie radici soccombe...
3,3,musulmani comandare casa
4,4,odio dipende odio comandato libro


In [4]:
##check for null values
df.isnull().values.any()

False

In [5]:
df.shape

(75775, 2)

In [6]:
##create a copy of the dataframe
df_words = df.copy()
df_words.shape

(75775, 2)

In [7]:
###tokenization using NLTK
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

tokenizer = RegexpTokenizer(r'\w+')

In [8]:
##tokenize the comments
df_words['words'] = df['words'].apply(lambda s: tokenizer.tokenize(s))
df_words.head(20)

Unnamed: 0,id,words
0,0,"[dire, gente, guerra, portata, casa, rimpatriati]"
1,1,"[marcello, perfavore]"
2,2,"[patria, patrioti, difesa, proprie, radici, so..."
3,3,"[musulmani, comandare, casa]"
4,4,"[odio, dipende, odio, comandato, libro]"
5,5,"[direi, dovremmo, diventare, razzisti, peggio]"
6,6,"[meditate, sinistronzi, meditate]"
7,7,"[tolleranza, rispetta, cultura, legge, barcone..."
8,8,[sottoscrivo]
9,9,"[papa, francesco, giudichi, fatto, sicuro, deb..."


In [9]:
## create a dictionary of unique words

unique_words = {} ##58384
words_per_comment = {}

for index, row in df_words.iterrows():  ##iterating over the df
    if(len(row['words'])>0):  ##this gives the length of each row
        words_per_comment[row["id"]]= [clean_txt for clean_txt in row['words']] ###iterating the column text_clean. There are 78174 rows
        for clean_txt in row['words']:
            unique_words.setdefault(clean_txt, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
            unique_words[clean_txt] += 1




In [10]:
print(len(unique_words))

57536


In [11]:
print(unique_words)

{'dire': 1568, 'gente': 1320, 'guerra': 497, 'portata': 45, 'casa': 1434, 'rimpatriati': 16, 'marcello': 146, 'perfavore': 12, 'patria': 205, 'patrioti': 11, 'difesa': 245, 'proprie': 92, 'radici': 58, 'soccombe': 1, 'roma': 595, 'islamico': 45, 'accoltella': 2, 'uomo': 467, 'crocifisso': 34, 'collo': 24, 'accaduto': 39, 'stazione': 24, 'termini': 76, 'aggressore': 2, 'fermato': 30, 'poliziotto': 14, 'notato': 35, 'vittima': 58, 'tentato': 21, 'sgozzarlo': 1, 'aumentare': 93, 'controlli': 52, 'attenzione': 157, 'prevenire': 12, 'violenza': 195, 'cittadini': 570, 'innocenti': 60, 'parte': 1476, 'islamici': 102, 'imperativo': 4, 'corrado': 23, 'armeri': 2, 'fdi': 75, 'musulmani': 108, 'comandare': 28, 'odio': 407, 'dipende': 46, 'comandato': 3, 'libro': 177, 'direi': 189, 'dovremmo': 94, 'diventare': 110, 'razzisti': 162, 'peggio': 455, 'meditate': 27, 'sinistronzi': 21, 'tolleranza': 59, 'rispetta': 46, 'cultura': 254, 'legge': 701, 'barcone': 32, 'via': 614, 'sottoscrivo': 17, 'papa': 

In [12]:
##create a matrix of words
clean_df = pd.DataFrame(0, index=unique_words, columns=unique_words)

for key in words_per_comment:
    for pair in itertools.product(words_per_comment[key],words_per_comment[key]):
        if pair[0]!=pair[1] and not(clean_df.at[pair[0],pair[1]]):
            clean_df.at[pair[0],pair[1]] += 1
            clean_df.at[pair[1],pair[0]] += 1

In [13]:
display(clean_df) ##this is the full matrix of words

Unnamed: 0,dire,gente,guerra,portata,casa,rimpatriati,marcello,perfavore,patria,patrioti,...,patteggiamenti,quellivche,ndrine,earth,rip,aspettandoprometeo,ivreich,satanassi,pezzettino,consigliatissima
dire,0,1,1,1,1,1,1,0,1,1,...,0,1,1,0,0,0,0,0,0,0
gente,1,0,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
guerra,1,1,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
portata,1,1,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
casa,1,1,1,1,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
aspettandoprometeo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
ivreich,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,0,0
satanassi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pezzettino,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
##use this as a basis to get edges and their weights

words_projection = {} ##create dictionary 
##itertools.product() which computes the cartesian product of input iterables.

for key in words_per_comment:
    for pair in itertools.product(words_per_comment[key],words_per_comment[key]):
        if pair[0]!=pair[1] and not(pair[::-1] in words_projection):
            words_projection.setdefault(pair,0)
            words_projection[pair] += 1

words_projection

{('dire', 'gente'): 60,
 ('dire', 'guerra'): 23,
 ('dire', 'portata'): 3,
 ('dire', 'casa'): 79,
 ('dire', 'rimpatriati'): 3,
 ('gente', 'guerra'): 8,
 ('gente', 'portata'): 4,
 ('gente', 'casa'): 63,
 ('gente', 'rimpatriati'): 1,
 ('guerra', 'portata'): 3,
 ('guerra', 'casa'): 31,
 ('guerra', 'rimpatriati'): 1,
 ('portata', 'casa'): 6,
 ('portata', 'rimpatriati'): 1,
 ('casa', 'rimpatriati'): 3,
 ('marcello', 'perfavore'): 1,
 ('patria', 'patrioti'): 2,
 ('patria', 'difesa'): 6,
 ('patria', 'proprie'): 5,
 ('patria', 'radici'): 1,
 ('patria', 'soccombe'): 1,
 ('patria', 'roma'): 5,
 ('patria', 'islamico'): 3,
 ('patria', 'accoltella'): 1,
 ('patria', 'uomo'): 17,
 ('patria', 'crocifisso'): 1,
 ('patria', 'collo'): 1,
 ('patria', 'accaduto'): 1,
 ('patria', 'stazione'): 1,
 ('patria', 'termini'): 5,
 ('patria', 'aggressore'): 1,
 ('patria', 'fermato'): 1,
 ('patria', 'poliziotto'): 1,
 ('patria', 'notato'): 1,
 ('patria', 'vittima'): 2,
 ('patria', 'tentato'): 1,
 ('patria', 'sgozzarlo

In [15]:
##obtain weighted edges list
##edge lists =36million
#WEIGHTED
#to get weighted graph we need a list of 3-element tuplels (u,v,w) where u and v are nodes 
## and w is a number representing weight

words_weighted = []
for edge in words_projection:
    words_weighted.append((edge[0],edge[1],words_projection[edge]))

G = nx.Graph()
G.add_weighted_edges_from(words_weighted)

print(G.edges)
print(G.nodes)

nx.write_weighted_edgelist(G, "New_weighted_edges.csv",delimiter=",") ##save the edges list as csv



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
##load the edges list
edges_list = pd.read_csv('New_weighted_edges.csv')
df_edges = pd.DataFrame(data = edges_list)
df_edges.head()

Unnamed: 0,dire,gente,60
0,dire,guerra,23
1,dire,portata,3
2,dire,casa,79
3,dire,rimpatriati,3
4,dire,senatore,11


In [17]:
df_edges.loc[-1] = ["dire", "gente",60]  # adding a row
df_edges.index = df_edges.index + 1  # shifting index
df_edges = df_edges.sort_index() 

In [18]:
df_edges.head()

Unnamed: 0,dire,gente,60
0,dire,gente,60
1,dire,guerra,23
2,dire,portata,3
3,dire,casa,79
4,dire,rimpatriati,3


In [19]:
df_edges.columns = ["source", "target", "weight"]
df_edges.head()

Unnamed: 0,source,target,weight
0,dire,gente,60
1,dire,guerra,23
2,dire,portata,3
3,dire,casa,79
4,dire,rimpatriati,3


In [20]:
df_edges.shape

(3692282, 3)

In [21]:
###use this as your list of edges
df_edges.to_csv("New_Edges.csv", index = False)

In [22]:
##lets create a node list
###nodes list = 57419
nl_df = pd.DataFrame.from_dict(unique_words,orient="index")
nl_df.reset_index(inplace=True)
nl_df[0] = nl_df['index']
nl_df.rename(columns={"index":"Id", 0:"Label"},inplace=True)


nl_df.to_csv("New_Nodes_list.csv",index=False)