In [1]:
import numpy as np
import networkx as nx
import pickle
import pandas as pd 
import glob
import json

import graph_tool.all as gt
from utilities import build_CI_rank

In [2]:
# --- Inputs ---
# Folder that is globbed (raw_data + '/*') for the per-category edge-list CSV files.
raw_data = 'PATH TO RAW DATA'
# Users-info location; not referenced by the cells visible in this notebook.
users_info = 'PATH TO USERS INFO'

# --- Outputs ---
# Prefix that is concatenated verbatim with each saved graph's file name.
path_to_save = 'PATH TO RETWEETS NETWORK'
# Switch checked before the gpickle graph files are written.
save = False

In [None]:
# The raw_data_final folder contains the raw data with all the tweet information
# The retweet_networks_final folder contains the initial edge lists from the nhb paper
# The edge_list_expanded folder contains the original edge lists modified for the temporal study (not needed here)
# The ensamble/validated folder contains the validated graphs

In [3]:
 # Display every path matched inside the raw-data folder; the recorded output
 # lists one '*_retweet_edges.csv' file per category.
 glob.glob(raw_data+'/*')

['right_retweet_edges.csv',
 'right_leaning_retweet_edges.csv',
 'left_extreme_retweet_edges.csv',
 'center_retweet_edges.csv',
 'fake_retweet_edges.csv',
 'right_extreme_retweet_edges.csv',
 'left_retweet_edges.csv',
 'left_leaning_retweet_edges.csv']

# General case

Build the general network from the edge lists of all the categories.

In [4]:
# Read every file found in the raw-data folder and merge them into a single edge table.
# The frames are collected first and concatenated ONCE: growing a DataFrame with
# pd.concat inside the loop re-copies (and re-deduplicates) all accumulated rows on
# every iteration, and seeding the concat with an empty object-dtype frame triggers
# pandas' FutureWarning about empty entries in dtype resolution.
category_frames = []
for file_path in glob.glob(raw_data+'/*'):

    print(file_path)

    # Read the edge list from the current file
    category_frames.append(pd.read_csv(file_path))

if category_frames:
    # Keep only the first occurrence of each 'id', so a row that appears in
    # several category files is counted a single time
    merged_edge_list = pd.concat(category_frames, ignore_index=True).drop_duplicates(subset='id')
else:
    # No input file matched: fall back to an empty table with the expected columns
    merged_edge_list = pd.DataFrame(columns=['infl_id', 'auth_id', 'id'])


right_retweet_edges.csv
right_leaning_retweet_edges.csv
left_retweet_edges.csv
left_leaning_retweet_edges.csv


In [6]:
# Distinct users: every value that occurs in the 'infl_id' or in the 'auth_id' column
unique_users = len(
    set(merged_edge_list['infl_id'].to_list()) | set(merged_edge_list['auth_id'].to_list())
)
# Distinct edges: number of different 'id' values (missing values, if any, are counted too)
unique_edges = merged_edge_list['id'].nunique(dropna=False)

In [7]:
# Report the two totals computed in the previous cell, one per line
for _label, _total in (('Unique users ', unique_users), ('Unique edges ', unique_edges)):
    print(_label, _total)

Unique users  2963210
Unique edges  48501421


In [8]:
# Collapse the table to one row per (infl_id, auth_id) pair; counting the non-null
# 'id' values of each pair gives the number of rows that pair contributed
edgelist = merged_edge_list.groupby(['infl_id','auth_id'],as_index=False)['id'].count()
# Rename the count column from 'id' to 'weight'
edgelist = edgelist.rename(columns={'id': 'weight'})
# Create the weighted DiGraph with edges pointing infl_id -> auth_id
G = nx.from_pandas_edgelist(edgelist, 'infl_id', 'auth_id', edge_attr='weight', create_using=nx.DiGraph())
# Save the graph as a gpickle file
if save:
    # nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
    # pickle.dump, so the file written here has the same format
    with open(path_to_save+'Full_only_lr.gpickle', 'wb') as gpickle_file:
        pickle.dump(G, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [9]:
# Sanity checks on the full graph: it must carry weights, and the summed weights
# can be compared with the number of unique 'id' values printed earlier
print('Is G weighted: ', nx.is_weighted(G))
print('N° nodes: ', G.number_of_nodes())
print('N° edges: ', G.number_of_edges())
print('Weights: ',sum(w for _, _, w in G.edges(data='weight')))

Is G weighted:  True
N° nodes:  2963210
N° edges:  27608480
Weights:  48501421


In [12]:
 # Convert the NetworkX graph G; judging by the recorded output of the following
 # cells, the result is a graph-tool Graph with an 'id' vertex property and a
 # 'weight' edge property.
 # NOTE(review): nx2gt is neither defined nor imported in the cells shown here
 # (only build_CI_rank is imported from utilities) — confirm where it comes from,
 # otherwise this cell fails on Restart & Run All.
 G_ = nx2gt(G)

converting ...


In [13]:
# Display the property maps attached to the converted graph; the recorded output
# shows a string-valued vertex map 'id' and a double-valued edge map 'weight'.
G_.properties

{('v',
  'id'): <VertexPropertyMap object with value type 'string', for Graph 0x7f11aa1e6890, at 0x7f11adfd9210>,
 ('e',
  'weight'): <EdgePropertyMap object with value type 'double', for Graph 0x7f11aa1e6890, at 0x7f0d95b6dfd0>}

In [14]:
 # Write the converted graph only when saving is enabled: this honours the same
 # `save` switch that guards the gpickle writes, so a run with save = False never
 # touches the output location.
 if save:
     G_.save(path_to_save + 'Full_only_lr.gt')

# Left vs. Right

Constructing the networks for the left and right affiliations involves
categorizing sources into distinct groups. In the case of the "Right" network,
we include only those sources classified as right-leaning or falling under the right category. Conversely,
for the "Left" network, we consider sources labeled as left-leaning or affiliated with the left category.

In [None]:
# File-name stems (file name minus the '_edges.csv' suffix) assigned to each side
right = [
    'right_retweet',
    'right_leaning_retweet',
]
left = [
    'left_retweet',
    'left_leaning_retweet',
]

In [None]:
# Sort the raw files into the right / left groups, then build each merged table with
# a single concat. Concatenating inside the loop would re-copy and re-deduplicate
# the accumulated rows on every iteration.
frames_right = []
frames_left = []
for file_path in glob.glob(raw_data+'/*'):
    # Category name = file name without the '_edges.csv' suffix
    net_name = file_path.split('/')[-1].split('_edges.csv')[0]
    if net_name in right:
        target_frames = frames_right
    elif net_name in left:
        target_frames = frames_left
    else:
        # Files of any other category are not part of either network
        continue
    # Read the edge list from the current file
    target_frames.append(pd.read_csv(file_path))

# Merge unique values based on 'id' (first occurrence wins); when no file matched a
# side, fall back to an empty table with the expected columns
if frames_right:
    merged_edge_list_right = pd.concat(frames_right, ignore_index=True).drop_duplicates(subset='id')
else:
    merged_edge_list_right = pd.DataFrame(columns=['infl_id', 'auth_id', 'id'])

if frames_left:
    merged_edge_list_left = pd.concat(frames_left, ignore_index=True).drop_duplicates(subset='id')
else:
    merged_edge_list_left = pd.DataFrame(columns=['infl_id', 'auth_id', 'id'])


In [None]:
# Right side: distinct values over 'infl_id' and 'auth_id', and distinct 'id' values
unique_users_right = len(
    set(merged_edge_list_right['infl_id'].to_list()) | set(merged_edge_list_right['auth_id'].to_list())
)
unique_edges_right = merged_edge_list_right['id'].nunique(dropna=False)

# Left side: same two totals
unique_users_left = len(
    set(merged_edge_list_left['infl_id'].to_list()) | set(merged_edge_list_left['auth_id'].to_list())
)
unique_edges_left = merged_edge_list_left['id'].nunique(dropna=False)

# Right totals first, a blank separator, then the left totals
print('Unique users right', unique_users_right)
print('Unique edges right', unique_edges_right)
print('\n')
print('Unique users left', unique_users_left)
print('Unique edges left', unique_edges_left)

In [None]:
# --- Left network ---
# One row per (infl_id, auth_id) pair; the count of non-null 'id' values is the edge weight
edgelist_left = merged_edge_list_left.groupby(['infl_id','auth_id'],as_index=False)['id'].count()
# Rename the count column from 'id' to 'weight'
edgelist_left = edgelist_left.rename(columns={'id': 'weight'})
# Create the weighted DiGraph with edges pointing infl_id -> auth_id
G_left = nx.from_pandas_edgelist(edgelist_left, 'infl_id', 'auth_id', edge_attr='weight', create_using=nx.DiGraph())
# Save the graph as a gpickle file
if save:
    # nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
    # pickle.dump, so the file written here has the same format
    with open(path_to_save+'Left.gpickle', 'wb') as gpickle_file:
        pickle.dump(G_left, gpickle_file, pickle.HIGHEST_PROTOCOL)


# --- Right network ---
# One row per (infl_id, auth_id) pair; the count of non-null 'id' values is the edge weight
edgelist_right = merged_edge_list_right.groupby(['infl_id','auth_id'],as_index=False)['id'].count()
# Rename the count column from 'id' to 'weight'
edgelist_right = edgelist_right.rename(columns={'id': 'weight'})
# Create the weighted DiGraph with edges pointing infl_id -> auth_id
G_right = nx.from_pandas_edgelist(edgelist_right, 'infl_id', 'auth_id', edge_attr='weight', create_using=nx.DiGraph())
# Save the graph as a gpickle file
if save:
    # Same pickle-based replacement for the removed nx.write_gpickle
    with open(path_to_save+'Right.gpickle', 'wb') as gpickle_file:
        pickle.dump(G_right, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Some checks
# A dedicated loop variable is used on purpose: looping with `G` would rebind the
# notebook-level full graph to G_right, so re-running an earlier cell that reads `G`
# would silently operate on the wrong network.
for side_label, side_graph in [('Left', G_left), ('Right', G_right)]:
    # Label each report so the two blocks of numbers can be told apart
    print('Graph: ', side_label)
    print('Is G weighted: ', nx.is_weighted(side_graph))
    print('N° nodes: ', len(side_graph.nodes()))
    print('N° edges: ', len(side_graph.edges()))
    print('Weights: ',sum(weight['weight'] for _, _, weight in side_graph.edges(data=True)))
    print('\n')

# Check after links validation 

In [None]:
# Pickle files holding the validated graphs, ordered full / left / right
validated_networks = [
    'Grafo_full.pkl',
    'Grafo_left.pkl',
    'Grafo_right.pkl',
]

In [None]:
# Print the node and edge counts of every validated graph next to its label.
# `names` runs parallel to `validated_networks`, so the two are paired with zip
# instead of a hand-maintained counter.
names = ['Full','Left','Right']
for name, V in zip(names, validated_networks):
    # NOTE(security): pickle.load can execute arbitrary code stored in the file;
    # only load validated-network pickles that come from a trusted source.
    with open(f'VALIDATED NETWORKS PATH/{V}', "rb") as f:
        V_ = pickle.load(f)

    print(f'N° of nodes in validated graph {name}...', len(V_.nodes()))
    print(f'N° of edges in validated graph {name}...', len(V_.edges()))
    print('\n')