In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import tweepy
import json
import os
import datetime, time
import re
import gensim

In [2]:
# Directory that holds the "<query>_stream.json" capture files
home_dir = "/Users/christopherallison/.virtualenvs/py_twi/streaming_results"
# Directory the exported .gexf graph file is written to
save_dir = "/Users/christopherallison/Documents/Coding/Gephi/twitter_stream"

# Enter your search queries here, one list entry per query
search_queries = ["harper", "trudeau"]

### Functions for data prep

In [3]:
# Any character followed by a capitalised word, e.g. the "PResponse" in "HTTPResponse"
first_cap_re = re.compile('(.)([A-Z][a-z]+)')
# A lower-case letter or digit directly followed by an upper-case letter, e.g. "eC"
all_cap_re = re.compile('([a-z0-9])([A-Z])')


def convert(name):
    """Convert a CamelCase string to snake_case.

    An underscore is inserted at every word boundary matched by
    ``first_cap_re`` and then ``all_cap_re``; the result is lower-cased.
    Text with no such boundaries is only lower-cased.
    """
    snake = name
    for boundary in (first_cap_re, all_cap_re):
        snake = boundary.sub(r'\1_\2', snake)
    return snake.lower()

In [4]:
def find_user(tweet_id, tweet_dict):
    """Return the screen name of the author of ``tweet_id``.

    ``tweet_dict`` maps tweet IDs to tweet records. ``None`` is returned when
    the lookup raises ``KeyError``, i.e. the tweet is not in ``tweet_dict``
    (out of scope) or its record has no ``'user_screen_name'`` key.
    """
    try:
        return tweet_dict[tweet_id]['user_screen_name']
    except KeyError:
        # User is out of scope
        return None

In [61]:
def _load_tweets(stream_path):
    """Read a line-delimited JSON tweet file into a dict keyed by ``id_str``."""
    tweets = {}

    with open(stream_path, 'r') as f:
        for line in f:
            # Skip blank lines instead of crashing in json.loads
            if not line.strip():
                continue

            result = json.loads(line)
            id_str = result['id_str']

            tweets[id_str] = {
                'id_str': id_str,
                'date': str(result['created_at']),
                'text': result['text'],
                'retweet_count': result['retweet_count'],
                'favorite_count': result['favorite_count'],
                'reply_to': result['in_reply_to_screen_name'],
                'coordinates': result['coordinates'],
                'reply_to_tweet': result['in_reply_to_status_id'],
                'user_screen_name': result['user']['screen_name'],
                'quoted_status': result['is_quote_status'],
                'lang': result['lang'],
                'entities': result['entities'],
                'urls': result['entities']['urls'],
                'hashtags': result['entities']['hashtags'],
                'user_mentions': result['entities']['user_mentions'],
                'user': result['user'],
                # Optional key: absent on tweets that do not quote another tweet
                'quoted_status_id_str': result.get('quoted_status_id_str'),
                }

    return tweets


def _join_lowercased(entity_items, key):
    """Lower-case ``item[key]`` for every entity and join with spaces ("" if none)."""
    return " ".join(item[key].lower() for item in (entity_items or ()))


def twitter_data_to_graph(search_query, graph=None):
    """Build a tweet graph from saved stream files and export it as GEXF.

    For each query ``sq`` the file ``<home_dir>/<sq>_stream.json`` is read as
    one JSON tweet per line. Every tweet becomes a node keyed by its
    ``id_str`` with ``label`` (author screen name), ``text``, ``hashtags``,
    ``date`` and ``coordinates`` attributes. A directed, weighted edge is
    added from a tweet to the tweet it replies to or, if it is not a reply,
    to the tweet it quotes. Node/edge counts are printed after each query and
    the graph is finally written to ``save_dir`` as a ``.gexf`` file.

    Parameters
    ----------
    search_query : iterable of str
        Query names; each selects one ``<query>_stream.json`` input file and
        all of them together form the output file name.
    graph : networkx graph, optional
        Graph to populate. When omitted, the module-level graph ``N`` is
        used (it must already exist) and is modified in place.

    Raises
    ------
    OSError
        If an input file cannot be opened.
    KeyError
        If a tweet record lacks one of the fields read by this function.
    """
    if graph is None:
        # Fall back to the shared notebook-level graph
        graph = N

    # Materialise once: the queries are needed for the loop and the file name
    queries = list(search_query)

    for sq in queries:

        # Open source file for this query
        tweets = _load_tweets(os.path.join(home_dir, '{}_stream.json'.format(sq)))

        for tweet in tweets.values():

            # Convert hashtags and user-mentions to space-separated strings
            tweet['hashtags'] = _join_lowercased(tweet['hashtags'], 'text')
            tweet['user_mentions'] = _join_lowercased(tweet['user_mentions'],
                                                      'screen_name')

            # Get coordinates if they exist; otherwise store an empty string
            if tweet['coordinates']:
                print(tweet['coordinates'])
            else:
                tweet['coordinates'] = ""

            graph.add_node(str(tweet['id_str']), label=tweet['user_screen_name'],
                           text=tweet['text'], hashtags=tweet['hashtags'],
                           date=tweet['date'], coordinates=tweet['coordinates'])

        # Collect edges as (source, target) -> weight
        edge_weights = {}

        for tweet in tweets.values():
            # A reply target takes precedence over a quoted tweet
            target = tweet['reply_to_tweet'] or tweet['quoted_status_id_str']

            if target:
                # The reply ID is numeric in the source data; node keys are strings
                edge = (tweet['id_str'], str(target))
                edge_weights[edge] = edge_weights.get(edge, 0) + 1

        # Add edges for replies to tweets and quoted tweets, once per query
        # (a target that was not captured becomes an attribute-less node)
        for (source, target), weight in edge_weights.items():
            graph.add_edge(source, target, weight=weight)

        # Insert Data analysis here
        print("Nodes: {}, Edges: {}".format(len(graph.nodes()), len(graph.edges())))

    # Write graph in gexf for Gephi; name it after the queries actually processed
    file_name = "{}_tweets_graph_{}.gexf".format(
        convert("_".join(queries)),
        datetime.datetime.now())

    nx.write_gexf(graph, os.path.join(save_dir, file_name))
    

In [62]:
# Set up the directed graph that twitter_data_to_graph() fills with nodes and edges.
# Re-run this cell before calling the function again to start from an empty graph.
N = nx.DiGraph()

In [63]:
# Build the graph for every configured query (prints node/edge counts per query)
# and write the result to save_dir as a .gexf file
twitter_data_to_graph(search_queries)

Nodes: 226, Edges: 33
Nodes: 265, Edges: 39


### That's it!  Now open Gephi and go play with your graph!