# Analyze Twitter Networks

So far we collected Tweet IDs, dehydrated them to get actual content, identified users, and colledted Botometer scores for those users. 

Let's put all these information together in this final step and see what we can learn from the network and the available metadata.

In [1]:
%matplotlib inline 
%load_ext autoreload
%autoreload 2

import json
import gzip
import numpy as np
import pandas as pd
import datetime
from dateutil.parser import parse

import matplotlib
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec

import networkx as nx

from config import DATA_PATH

In [2]:
def iterate_tweet_content():
    with gzip.open('../data/tweets_dehydrated.jsons.gz', 'rb') as fl:
        for line in fl:
            tweet = json.loads(line)
            yield tweet
            
for tweet in iterate_tweet_content():
    print(tweet)
    break

{'created_at': 'Sun May 10 20:28:06 +0000 2020', 'id': 1259580998586716162, 'id_str': '1259580998586716162', 'text': '#MilliHesaplarTakipte', 'truncated': False, 'entities': {'hashtags': [{'text': 'MilliHesaplarTakipte', 'indices': [0, 21]}], 'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 913841350130520065, 'id_str': '913841350130520065', 'name': '🇹🇷🇹🇷Engin Değirmenci🇹🇷🇹🇷', 'screen_name': 'huduluengin75', 'location': '', 'description': 'ERZİNCAN lı heyecanlı.', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 157, 'friends_count': 257, 'listed_count': 0, 'created_at': 'Fri Sep 29 19:02:11 +0000 2017', 'favourites_count': 6401, 'utc_offset': None, 'time_zone': None, 'ge

In [57]:
# Build retweet - mention - reply network

userProfile = dict()
infoNet = nx.DiGraph() # Keep stats of min&max dates, count etc.
for c,tweet in enumerate(iterate_tweet_content()):
    connections = list()
    tdate = parse(tweet['created_at'])
    uid = tweet['id_str']
    if uid not in userProfile:
        userProfile[uid] = tweet['user']
    
    if tweet['in_reply_to_user_id'] != None:
        connections.append((uid, tweet['in_reply_to_user_id'], 'reply'))
        
    if 'retweeted_status' in tweet:
        rid = tweet['retweeted_status']['user']['id_str']
        connections.append((rid, uid, 'retweet'))
        if rid not in userProfile:
            userProfile[rid] = tweet['retweeted_status']['user']
    
    for m in tweet['entities']['user_mentions']:
        connections.append((uid, m['id_str'], 'mention'))
    
    for conn in connections:
        if not infoNet.has_edge(conn[0], conn[1]):
            infoNet.add_edge(conn[0], conn[1], reply=0, retweet=0, mention=0,
                             datemin=tdate, datemax=tdate, count=0)
        
        infoNet[conn[0]][conn[1]][conn[2]] += 1
        infoNet[conn[0]][conn[1]]['datemin'] = min(infoNet[conn[0]][conn[1]]['datemin'], tdate)
        infoNet[conn[0]][conn[1]]['datemax'] = max(infoNet[conn[0]][conn[1]]['datemax'], tdate)
        infoNet[conn[0]][conn[1]]['count'] += 1
        
        
    if (c % 1000000) == 0:
        print(nx.info(infoNet))
    
print(nx.info(infoNet))

Name: 
Type: DiGraph
Number of nodes: 0
Number of edges: 0

Name: 
Type: DiGraph
Number of nodes: 203995
Number of edges: 390375
Average in degree:   1.9136
Average out degree:   1.9136


In [58]:
for n in infoNet.nodes():
    if n not in userProfile:
        continue
    infoNet.nodes[n]['nfriends'] = userProfile[n]['friends_count']
    infoNet.nodes[n]['nfollowers'] = userProfile[n]['followers_count']
    infoNet.nodes[n]['nstatuses'] = userProfile[n]['statuses_count']
    infoNet.nodes[n]['nfavorites'] = userProfile[n]['favourites_count']
    
    infoNet.nodes[n]['screen_name'] = userProfile[n]['screen_name']
    infoNet.nodes[n]['user_name'] = userProfile[n]['name']
    infoNet.nodes[n]['creation_year'] = userProfile[n]['created_at'].split()[-1]
    

In [59]:
# Read Botometer Scores

BOTOMETER_SCORE_FILE = '../data/botometer_scores.jsons'
BOTOMETER_ERROR_FILE = '../data/botometer_scores.errors'

## Collect already existing scores from previous runs
botometerScores = dict()
try:
    with open(BOTOMETER_SCORE_FILE, 'r') as fl:
        for line in fl:
            try:
                temp = json.loads(line)
                botometerScores[temp['user']['id_str']] = temp
            except:
                pass
except Exception as e:
    print(e)
print('Scores for {} accounts already collected'.format(len(botometerScores)))


## Collect IDs of the accounts that are either deleted or suspended
removedAccounts = set()
try:
    with open(BOTOMETER_ERROR_FILE, 'r') as fl:
        for line in fl:
            removedAccounts.add(line.strip())
except Exception as e:
    print(e)
print('{} accounts unaccessible'.format(len(removedAccounts)))


### Add Botometer Scores to network

for n in infoNet.nodes():
    if n in botometerScores:
        infoNet.nodes[n]['botometer_eng'] = np.ceil(botometerScores[n]['scores']['english']*5)
        infoNet.nodes[n]['botometer_uni'] = np.ceil(botometerScores[n]['scores']['universal']*5)
    
    if n in removedAccounts:
        infoNet.nodes[n]['botometer_eng'] = -1
        infoNet.nodes[n]['botometer_uni'] = -1
        


Scores for 64066 accounts already collected
1171 accounts unaccessible


In [61]:
for u,v,d in infoNet.edges.data():
    infoNet[u][v]['datemin'] = infoNet[u][v]['datemin'].strftime("%Y-%m-%d %H:%M:%S")
    infoNet[u][v]['datemax'] = infoNet[u][v]['datemax'].strftime("%Y-%m-%d %H:%M:%S")

degreeSeq = {n:d for n,d in infoNet.degree()}
for n in sorted(degreeSeq, key=degreeSeq.get, reverse=True)[:100]:
    if n in userProfile:
        infoNet.nodes[n]['viz_label'] = userProfile[n]['screen_name']
    
#nx.write_gexf(infoNet, '../data/demo_infonet-alldata.gexf')
#nx.write_gexf(infoNet, '{}/demo_infonet-alldata.gexf'.format(DATA_PATH))

In [62]:

toRemove = list()
for n,d in infoNet.degree():
    if d < 5:
        toRemove.append(n)

for n in toRemove:
    infoNet.remove_node(n)
    
print(nx.info(infoNet))
nx.write_gexf(infoNet, '../data/demo_infonet-filtered.gexf')

'''
toRemove = set(infoNet.nodes()) - (removedAccounts | set(botometerScores.keys()))
print(len(toRemove))

for n in toRemove:
    infoNet.remove_node(n)

print(nx.info(infoNet))
nx.write_gexf(infoNet, '../data/demo_infonet-subnet.gexf')
'''

Name: 
Type: DiGraph
Number of nodes: 15710
Number of edges: 45150
Average in degree:   2.8740
Average out degree:   2.8740


"\ntoRemove = set(infoNet.nodes()) - (removedAccounts | set(botometerScores.keys()))\nprint(len(toRemove))\n\nfor n in toRemove:\n    infoNet.remove_node(n)\n\nprint(nx.info(infoNet))\nnx.write_gexf(infoNet, '../data/demo_infonet-subnet.gexf')\n"

In [None]:
# Hashtag similarity network

# Analyze network data