In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import tweepy
import os
import datetime, time
import re

In [2]:
# Input/output directories. Each can be overridden through an environment
# variable so the notebook can run on another machine without editing this
# cell; the defaults are the original hard-coded locations.
# home_dir: where the search_<query>.json result files are read from.
home_dir = os.environ.get(
    "TWITTER_RESULTS_DIR",
    "/Users/christopherallison/.virtualenvs/py_twi/results")
# save_dir: where the generated .gexf graph file is written.
save_dir = os.environ.get(
    "TWITTER_GRAPH_DIR",
    "/Users/christopherallison/Documents/Coding/Gephi/twitter")

# Enter your search queries here (one list entry per query term)
search_queries = ["rstats", "python"]

### Functions for data prep

In [3]:
# Convert CamelCase text to snake_case

# Matches any character followed by a capitalised word (e.g. the "lC" + "ase"
# in "CamelCase"); used to split before the last capital of an acronym run.
first_cap_re = re.compile('(.)([A-Z][a-z]+)')
# Matches a lower-case letter or digit directly followed by a capital.
all_cap_re = re.compile('([a-z0-9])([A-Z])')

def convert(name):
    """Return ``name`` rewritten from CamelCase to lower-case snake_case.

    Two substitution passes insert underscores at word boundaries before the
    result is lower-cased, e.g. ``"HTTPResponse"`` -> ``"http_response"``.
    Text that contains no capital letters is returned unchanged.
    """
    # Pass 1: put an underscore in front of every capitalised word.
    split_words = re.sub(first_cap_re, r'\1_\2', name)
    # Pass 2: split any remaining lower/digit -> upper transitions.
    split_all = re.sub(all_cap_re, r'\1_\2', split_words)
    return split_all.lower()

In [4]:
# Remove empty mentions created through the dataframe list comprehension,
# e.g. ['twitter_user_A', None]

def clean_mentions(scr, men):
    """Pair each screen name with every user that it mentions.

    Parameters
    ----------
    scr : iterable
        Screen names, one per tweet.
    men : iterable
        For each tweet, a collection of mentioned users. Entries that are
        falsy (``None`` or an empty collection) produce no pairs.

    Returns
    -------
    list of [screen_name, mentioned_user] lists, in input order. ``scr`` and
    ``men`` are walked in parallel, so extra items in the longer one are
    ignored.
    """
    # Iterate the mentions directly instead of indexing them, so any iterable
    # collection (list, tuple, set, ...) is accepted.
    return [
        [screen_name, mentioned]
        for screen_name, mentions in zip(scr, men)
        if mentions
        for mentioned in mentions
    ]
        

In [5]:
# Remove edges for reply_to_tweet where no reply exists
# TODO: add code to assign a weight to duplicate edges - multiple mentions of, or replies to, the same person
# Each entry will then be a list of length 3 - node, target, weight

def clean_edges(edge_list):
    """Return the edges from ``edge_list`` that have a real target.

    Parameters
    ----------
    edge_list : iterable of sequences
        Each edge is indexable with the source at position 0 and the target
        at position 1.

    Returns
    -------
    list
        The original edge objects, in order, excluding every edge whose
        target is ``None`` or a float NaN (the marker pandas uses for a
        missing value).
    """
    kept = []

    for edge in edge_list:
        target = edge[1]
        # Identity test: ``== None`` can be hijacked by a custom __eq__.
        if target is None:
            continue
        # NaN is the only float that is not equal to itself.
        if isinstance(target, float) and target != target:
            continue
        kept.append(edge)

    return kept


In [6]:
# Check for duplicates in edges and compute number of repeats as edge weight

def sort_and_weight(edges):
    """Collapse repeated edges and count how often each one occurs.

    Parameters
    ----------
    edges : iterable of sequences
        Each edge holds the source at index 0 and the target at index 1.

    Returns
    -------
    dict
        Maps ``str(edge)`` to ``{'node': source, 'target': target,
        'weight': occurrences}``. Because the key is the string form of the
        edge, a list and a tuple with the same endpoints are counted
        separately.
    """
    weighted = {}

    for edge in edges:
        key = str(edge)
        if key in weighted:
            # Seen before: one more repeat of this edge.
            weighted[key]['weight'] += 1
        else:
            weighted[key] = {
                'node': edge[0],
                'target': edge[1],
                'weight': 1,
            }

    return weighted

In [20]:
def twitter_data_to_graph(search_query, graph=None):
    """Build a tweet graph from saved search results and export it as GEXF.

    For every query term the file ``search_<term>.json`` is read from
    ``home_dir``. Each tweet becomes a node keyed by its ``id_str`` and
    carrying the author's screen name, date, hashtags, retweet count and
    text. A weighted edge is added from a tweet to the tweet it replies to
    (``reply_to_tweet``) and to the tweet it quotes
    (``quoted_status_id_str``); the weight is the number of times that exact
    edge occurs within one file. No edges are created for @-mentions.

    After all files are processed the graph is written to ``save_dir`` as
    ``<queries>_tweets_graph_<timestamp>.gexf``.

    Parameters
    ----------
    search_query : str or iterable of str
        Query term(s) whose result files should be loaded. A single string
        is treated as one term rather than iterated character by character.
    graph : networkx graph, optional
        Graph to add nodes and edges to. Defaults to the module-level ``N``,
        which must then exist before this function is called.

    Returns
    -------
    None. The graph is modified in place, a node/edge count is printed after
    each file, and the GEXF file is written as a side effect.
    """
    if graph is None:
        graph = N

    if isinstance(search_query, str):
        search_query = [search_query]
    else:
        # Materialise once: the terms are needed again for the file name.
        search_query = list(search_query)

    for sq in search_query:

        json_path = os.path.join(home_dir, 'search_{}.json'.format(sq))

        # Load the JSON file; the context manager closes the handle.
        with open(json_path) as json_file:
            raw = pd.read_json(json_file)

        # Swap rows & columns so that each row describes one tweet.
        # The index left over from loading is not used below, so it is not
        # rewritten here.
        tweets = raw.transpose()

        # Pull the lower-cased hashtag text out of each tweet's hashtag dicts
        hashtags = [[tag['text'].lower() for tag in tags]
                    for tags in tweets.hashtags]

        # Set up lists for node & edge creation
        id_str = list(tweets.id_str)
        screen_names = list(tweets.user_screen_name)
        dates = list(tweets.date)
        retweets = list(tweets.retweet_count)
        texts = list(tweets.text)
        replies = list(tweets.reply_to_tweet)
        quote_id_str = list(tweets.quoted_status_id_str)

        # Create one node for each tweet with embedded data.
        # Attributes are passed as keyword arguments: the ``attr_dict``
        # keyword only exists in networkx 1.x, keywords work in 1.x and 2.x.
        for i, n, d, h, r, t in zip(id_str, screen_names, dates,
                                    hashtags, retweets, texts):
            graph.add_node(i,
                           name=n,
                           date=d,
                           hashtags=str(h),  # stored as its string form
                           retweets=r,
                           text=t)

        # Create edge lists: tweet -> replied-to tweet, tweet -> quoted tweet
        edges_r = [[n, r] for n, r in zip(id_str, replies)]
        edges_q = [[n, q] for n, q in zip(id_str, quote_id_str)]

        # Combine the edges and drop those without a target
        edges_all = clean_edges(edges_r + edges_q)

        # Edge weight is the number of times the same edge is repeated
        edge_dict = sort_and_weight(edges_all)

        # Add the weighted reply/quote edges
        for edge in edge_dict.values():
            graph.add_edge(edge['node'],
                           edge['target'],
                           weight=edge['weight'])

        # Insert Data analysis here
        print("Nodes: {}, Edges: {}".format(len(graph.nodes()), len(graph.edges())))

    # Write the graph in gexf for Gephi. The name is built from the terms
    # that were actually processed, and the timestamp avoids spaces and
    # colons so the file name is valid on every platform.
    file_name = "{}_tweets_graph_{}.gexf".format(
        convert("_".join(search_query)),
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

    nx.write_gexf(graph, os.path.join(save_dir, file_name))
    

In [21]:
# Set up Graph
# Module-level directed graph that twitter_data_to_graph() adds nodes and
# edges to. Re-run this cell before calling it again to start from an empty
# graph; otherwise new results accumulate on top of the existing ones.
N = nx.DiGraph()

In [22]:
# Load the saved results for every configured search query into the graph
# and write the combined graph to a .gexf file.
twitter_data_to_graph(search_queries)

Nodes: 1028, Edges: 35
Nodes: 2027, Edges: 52


NameError: name 'df2' is not defined

Unnamed: 0,1990-01-09 01:51:10.291136512,1990-01-09 01:52:34.508730368,1990-01-09 01:58:04.805947393,1990-01-09 01:58:19.490238464,1990-01-09 01:58:32.035368960,1990-01-09 01:59:04.868261889,1990-01-09 01:59:16.494966784,1990-01-09 01:59:37.286156288,1990-01-09 01:59:55.560714240,1990-01-09 02:01:07.341893632,...,1990-01-11 04:21:36.238456833,1990-01-11 04:24:03.085381632,1990-01-11 04:25:32.205785088,1990-01-11 04:30:20.459479042,1990-01-11 04:35:39.792666624,1990-01-11 04:37:35.236800512,1990-01-11 04:44:58.457325568,1990-01-11 04:48:40.050802689,1990-01-11 05:10:47.904546816,1990-01-11 05:12:22.838276096
coordinates,,,,,,,,,,,...,,"{'type': 'Point', 'coordinates': [-117.9533807...",,,,,,,,
date,2015-08-13 15:28:42,2015-08-13 15:29:03,2015-08-13 15:30:21,2015-08-13 15:30:25,2015-08-13 15:30:28,2015-08-13 15:30:36,2015-08-13 15:30:38,2015-08-13 15:30:43,2015-08-13 15:30:48,2015-08-13 15:31:05,...,2015-08-14 03:31:13,2015-08-14 03:31:48,2015-08-14 03:32:09,2015-08-14 03:33:18,2015-08-14 03:34:34,2015-08-14 03:35:02,2015-08-14 03:36:47,2015-08-14 03:37:40,2015-08-14 03:42:57,2015-08-14 03:43:19
entities,"{'symbols': [], 'urls': [{'display_url': 'micr...","{'symbols': [], 'urls': [], 'media': [{'indice...","{'symbols': [], 'urls': [{'display_url': 'abiz...","{'symbols': [], 'urls': [{'display_url': 'pypi...","{'symbols': [], 'urls': [{'display_url': 'buff...","{'symbols': [], 'urls': [{'display_url': 'bit....","{'symbols': [], 'urls': [{'display_url': 'buff...","{'symbols': [], 'urls': [{'display_url': 'buff...","{'symbols': [], 'urls': [{'display_url': 'neuv...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",...,"{'symbols': [], 'urls': [{'display_url': 'stac...","{'symbols': [], 'urls': [{'display_url': 'inst...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'symbols': [], 'urls': [{'display_url': 'neuv...","{'symbols': [], 'urls': [{'display_url': 'daan...","{'symbols': [], 'urls': [{'display_url': 'gith...","{'symbols': [], 'urls': [{'display_url': 'daan...","{'symbols': [], 'urls': [{'display_url': 'buff...","{'symbols': [], 'urls': [{'display_url': 'isaa...","{'symbols': [], 'urls': [], 'hashtags': [{'tex..."
favorite_count,3,0,0,0,0,0,0,2,0,0,...,0,0,0,1,0,0,0,0,0,0
hashtags,"[{'text': 'python', 'indices': [30, 37]}, {'te...","[{'text': 'python', 'indices': [100, 107]}]","[{'text': 'Python', 'indices': [0, 7]}, {'text...","[{'text': 'Euchner', 'indices': [57, 65]}, {'t...","[{'text': 'python', 'indices': [90, 97]}, {'te...","[{'text': 'python', 'indices': [68, 75]}, {'te...","[{'text': 'programming', 'indices': [95, 107]}...","[{'text': 'webdev', 'indices': [35, 42]}, {'te...","[{'text': 'Python', 'indices': [14, 21]}, {'te...","[{'text': 'python', 'indices': [64, 71]}, {'te...",...,"[{'text': 'Python', 'indices': [104, 111]}]","[{'text': 'amazing', 'indices': [28, 36]}, {'t...","[{'text': 'C', 'indices': [14, 16]}, {'text': ...","[{'text': 'Programmatore', 'indices': [33, 47]...","[{'text': 'python', 'indices': [53, 60]}, {'te...","[{'text': 'python', 'indices': [92, 99]}]","[{'text': 'python', 'indices': [71, 78]}, {'te...","[{'text': 'Python', 'indices': [21, 28]}, {'te...","[{'text': 'python', 'indices': [58, 65]}, {'te...","[{'text': 'Python', 'indices': [23, 30]}]"


Unnamed: 0,coordinates,date,entities,favorite_count,hashtags,id_str,lang,quoted_status,reply_to,reply_to_tweet,retweet_count,text,urls,user,user_mentions,user_screen_name
1990-01-09 01:51:10.291136512,,2015-08-13 15:28:42,"{'symbols': [], 'urls': [{'display_url': 'micr...",3,"[{'text': 'python', 'indices': [30, 37]}, {'te...",631849870291136512,en,False,,,0,Microsoft teaching how to use #python and #Dja...,[{'display_url': 'microsoftvirtualacademy.com/...,"{'contributors_enabled': False, 'notifications...",[],pollitosabroson
1990-01-09 01:52:34.508730368,,2015-08-13 15:29:03,"{'symbols': [], 'urls': [], 'media': [{'indice...",0,"[{'text': 'python', 'indices': [100, 107]}]",631849954508730368,en,False,,,14,RT @MuseumsInspire: Moving robots in the 'Pi P...,[],"{'contributors_enabled': False, 'notifications...","[{'indices': [3, 18], 'id': 3195219910, 'scree...",ryanteck
1990-01-09 01:58:04.805947393,,2015-08-13 15:30:21,"{'symbols': [], 'urls': [{'display_url': 'abiz...",0,"[{'text': 'Python', 'indices': [0, 7]}, {'text...",631850284805947393,en,False,,,0,#Python-gdal create geotiff from array with co...,"[{'display_url': 'abizy.com/p/view.html?ur…', ...","{'contributors_enabled': False, 'notifications...",[],Naijaknowhow
1990-01-09 01:58:19.490238464,,2015-08-13 15:30:25,"{'symbols': [], 'urls': [{'display_url': 'pypi...",0,"[{'text': 'Euchner', 'indices': [57, 65]}, {'t...",631850299490238464,de,False,,,1,RT @dajool_com: Hat jemand ein Electronic-Key-...,[{'display_url': 'pypi.python.org/pypi/eks/0.1...,"{'contributors_enabled': False, 'notifications...","[{'indices': [3, 14], 'id': 119441666, 'screen...",lgtr
1990-01-09 01:58:32.035368960,,2015-08-13 15:30:28,"{'symbols': [], 'urls': [{'display_url': 'buff...",0,"[{'text': 'python', 'indices': [90, 97]}, {'te...",631850312035368960,de,False,,,0,Python Library for Amadeus Travel Innovation S...,"[{'display_url': 'buff.ly/1P8q30m', 'indices':...","{'contributors_enabled': False, 'notifications...",[],PyBaltimore


Just noticed that the dates in the index are incorrect - they are the tweet IDs, which were accidentally converted to timestamps when the file was loaded.

## Scripts for setting up graph of all tweets

[('631734304524005376',
  {'date': '2015-08-13 07:49:29',
   'hashtags': "['Python', 'optimization', 'performance', 'funny', 'Twitter']",
   'name': 'slacknux',
   'retweets': 0,
   'text': 'Good talk https://t.co/iSZ7GS3BxH #Python #optimization #performance #funny #Twitter'}),
 ('631565819781218304',
  {'date': '2015-08-12 20:40:00',
   'hashtags': "['Python', 'edtech']",
   'name': 'Fuliski',
   'retweets': 3,
   'text': 'RT @Don_Watkins: Using #Python &amp; Raspberry Pi to communicate w/ Lego Mindstorms EV3 | https://t.co/v5LfotVS2o #edtech @rickweinberg @philsh…'}),
 ('631569161991376896',
  {'date': '2015-08-12 20:53:16',
   'hashtags': "['IPython', 'Python']",
   'name': 'ppblt',
   'retweets': 24,
   'text': 'RT @randal_olson: Big changes w/ the latest #IPython 4.0 release.\n\n"ipython notebook" no longer works?\n\nhttp://t.co/j0MAe0w7fy #Python http…'}),
 ('631567373972058112',
  {'date': '2015-08-12 20:46:10',
   'hashtags': "['python']",
   'name': 'artwisanggeni',
   'retwe

[['kenwalger', 'kennethlove'],
 ['groundwalkergmb', 'jeanhuguesroy'],
 ['uogbuji', 'arh'],
 ['j4pe_', 'eric_pommereau'],
 ['Don_Watkins', 'rickweinberg'],
 ['HarriSrivastav', 'algoritmic'],
 ['HarriSrivastav', 'lonriesberg'],
 ['ishamyyl', 'climagic'],
 ['HarriSrivastav', 'artwisanggeni'],
 ['HarriSrivastav', 'Proximity_ltd'],
 ['HarriSrivastav', 'MarketACourse'],
 ['HarriSrivastav', 'BestSQL'],
 ['HarriSrivastav', 'ElaMoscicka'],
 ['HarriSrivastav', 'artwisanggeni'],
 ['HarriSrivastav', 'nodenow'],
 ['HarriSrivastav', '_testanic'],
 ['_organicit', 'pcgeek86'],
 ['melindathrasher', 'djangogirls'],
 ['ryanteck', 'MuseumsInspire'],
 ['ryanteck', 'computermuseum'],
 ['ryanteck', 'Raspberry_Pi'],
 ['lgtr', 'dajool_com'],
 ['hayza11', 'googletricks19'],
 ['kenwalger', 'kennethlove'],
 ['isurunix', 'nodenow'],
 ['MikeHerman', 'pycoders'],
 ['MikeHerman', 'Podcast__init__'],
 ['MikeHerman', 'jessicamckellar'],
 ['joebalalaika', 'RealPython'],
 ['mr_brejoc', 'lgtr'],
 ['Patty_1982', 'MuseumsIn

[('funKidsSTEM', 'atcsapp'),
 ('BobBodily', 'PlaceILive'),
 ('BobBodily', 'LoralynTaylor'),
 ('BobBodily', 'sepirdata'),
 ('pdr2002', 'rae_simon'),
 ('robbyki', 'Hancock_JohnD'),
 ('Jordanmlima', 'randal_olson'),
 ('emilyhwilson', 'fredmcconnell'),
 ('emilyhwilson', 'OrbitalMechs'),
 ('cmarschner', 'randal_olson'),
 ('BitterMoa', 'moaimx'),
 ('BitterMoa', 'GIRE_mx'),
 ('BitterMoa', 'Data4mx'),
 ('Niels_Bremen', 'flowingdata'),
 ('Niels_Bremen', 'randal_olson'),
 ('GoClickSpree', 'gregmancel'),
 ('GoClickSpree', 'DataScienceCtrl'),
 ('jknowles', 'HollyLynnMck'),
 ('jknowles', 'Biff_Bruise'),
 ('jknowles', 'EamonCaddigan'),
 ('jknowles', 'sweetspot'),
 ('matlabulous', 'NatGeo'),
 ('matlabulous', 'visualisingdata'),
 ('matlabulous', 'NadiehBremer'),
 ('santoshosu', 'DecisionViz'),
 ('santoshosu', 'alteryx'),
 ('BioMickWatson', 'randal_olson'),
 ('MAGISTR_OM', 'SkytreeHQ'),
 ('MAGISTR_OM', 'KirkDBorne'),
 ('GerryWieder', 'vjo'),
 ('luzumuyok', 'Ronald_vanLoon'),
 ('yayayarndiva', 'bayrepor

2628