# Edgelist experiments 

Import modules

In [1]:
# System tools
import os

# Data analysis
import pandas as pd
from collections import Counter
from itertools import combinations 
from tqdm import tqdm

# NLP
import spacy
nlp = spacy.load("en_core_web_sm")

# Network analysis tools
import networkx as nx
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,20)

# Regular expressions 
import re

2022-05-27 19:26:20.485862: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-27 19:26:20.485895: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Load data 

In [2]:
input_file = os.path.join("..", "..", "..", "CDS-LANG", "tabular_examples", "fake_or_real_news.csv")
data = pd.read_csv(input_file)
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


Extract individuals

In [3]:
real_text = data[data["label"] == "REAL"]["text"]

In [4]:
# get list of lists of entities
person_list = []

# iterate over every text
for doc in tqdm(nlp.pipe(real_text, batch_size=500)):
    # temp list
    tmp_list = []
    # get named entities for each text
    for entity in doc.ents:
        # if it is a PERSON
        if entity.label_ == "PERSON":
            # append to temporary list
            tmp_list.append(entity.text)
    # add to output list
    person_list.append(tmp_list)

3171it [04:00, 13.19it/s]


In [5]:
person_list[1]

['Hillary Clinton',
 'Donald Trump',
 'Ted Cruz',
 'Bernie Sanders',
 'John Kasich',
 'Clinton',
 'Clinton',
 'Cruz',
 'Cruz',
 'Cruz',
 'Hillary',
 'Hillary',
 'Clinton']

# Creating egdelist 

In [6]:
# create output edgelist
edgelist = []

# go over each list or "document" one at a time
for sublist in person_list[:10]:
    # get pairings in this doc
    edges = list(combinations(sublist, 2))
    # for every possible edge
    for edge in edges:
        # if the two values are the same
        if edge[0]==edge[1]:
            # do nothing
            pass
        # otherwise append to output
        else:
            edgelist.append(edge)

In [7]:
edgelist[:10]

[('John F. Kerry', 'Francois Hollande'),
 ('John F. Kerry', 'Benjamin Netanyahu'),
 ('John F. Kerry', 'Jane Hartley'),
 ('John F. Kerry', 'Victoria Nuland'),
 ('John F. Kerry', 'Eric H. Holder Jr.'),
 ('John F. Kerry', 'Narendra Modi'),
 ('John F. Kerry', 'Kerry'),
 ('Francois Hollande', 'Benjamin Netanyahu'),
 ('Francois Hollande', 'Jane Hartley'),
 ('Francois Hollande', 'Victoria Nuland')]

Count occurances

In [8]:
# create a weighted edgelist
weighted_edges = []

# use counter on edgelist
for key, value in Counter(edgelist).items():
    nodeA = key[0]
    nodeB = key[1]
    weight = value
    # append to output
    weighted_edges.append((nodeA, nodeB, weight))

In [9]:
weighted_edges[:10]

[('John F. Kerry', 'Francois Hollande', 1),
 ('John F. Kerry', 'Benjamin Netanyahu', 1),
 ('John F. Kerry', 'Jane Hartley', 1),
 ('John F. Kerry', 'Victoria Nuland', 1),
 ('John F. Kerry', 'Eric H. Holder Jr.', 1),
 ('John F. Kerry', 'Narendra Modi', 1),
 ('John F. Kerry', 'Kerry', 1),
 ('Francois Hollande', 'Benjamin Netanyahu', 1),
 ('Francois Hollande', 'Jane Hartley', 1),
 ('Francois Hollande', 'Victoria Nuland', 1)]

In [10]:
edges_df = pd.DataFrame(weighted_edges, columns=["nodeA", "nodeB", "weight"])

In [11]:
edges_df.head()

Unnamed: 0,nodeA,nodeB,weight
0,John F. Kerry,Francois Hollande,1
1,John F. Kerry,Benjamin Netanyahu,1
2,John F. Kerry,Jane Hartley,1
3,John F. Kerry,Victoria Nuland,1
4,John F. Kerry,Eric H. Holder Jr.,1


In [12]:
len(edges_df)

1821