This notebook analyzes the co-occurrence networks of terms.

In [1]:
%cd twitteranalysis
import pandas as pd
import numpy as np
import networkx as nx

#Plotting 
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
%config InlineBackend.figure_format = 'svg'

import environment
from DataTools import DataRepositories as DR
from DataTools import DataConnections as DC
from DataTools.WordORM import Word
from DataTools.TweetORM import Users as User
from DataTools.TweetORM import Tweet
from SearchTools.WordMaps import get_adjacent_word_counts, get_adjacent_words, get_user_ids_for_word

def make_term_ids_filepath(term, path=environment.LOG_FOLDER_PATH):
    """Return the path of the CSV file that holds the tweet ids for ``term``.

    The file is expected at ``<path>/temp_output/tweet-ids/<term>-ids.csv``;
    ``path`` defaults to ``environment.LOG_FOLDER_PATH``.
    """
    template = "%s/temp_output/tweet-ids/%s-ids.csv"
    return template % (path, term)

# Input/output locations, resolved against the project's environment settings.
EXP_TERMS_FILEPATH = '%s/experimental-terms.xlsx' % environment.EXPERIMENTS_FOLDER
IDS_FILEPATH = "%s/temp_output/user-ids.xlsx" % environment.LOG_FOLDER_PATH


# Load in the terms to search for.
# `read_excel(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# 2.0; squeezing the parsed frame along the column axis is the documented
# replacement and likewise yields a Series when the sheet has a single column.
experimentalTerms = pd.read_excel(EXP_TERMS_FILEPATH, sheet_name='terms').squeeze("columns")
termMap = pd.read_excel(EXP_TERMS_FILEPATH, sheet_name='mapping')

# The terms are the column labels of the 'mapping' sheet.
terms = list(termMap.columns)

(bookmark:twitteranalysis) -> /Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis
/Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis


In [2]:
# Initialise the 'mysql' engine and wrap it in the project's DAO.
# NOTE(review): `e` and `dao` are not referenced again in the visible cells;
# presumably they are needed by the scripts loaded with `%run -i` — verify.
e = DC.initialize_engine('mysql')
dao = DC.DAO(e)

creating connection: mysql twitter_data


In [None]:
def create_term_network(terms):
    """
    Return every unique, unordered pair of terms as a list of 2-tuples.

    Pairs are emitted in the order the terms appear in ``terms``; a term is
    only paired with itself if it occurs more than once in the input.

    Args:
        terms: Iterable of terms. Any iterable is accepted (list, tuple,
            Series, generator), not only sliceable sequences.

    Returns:
        list of (term, term) tuples; empty when fewer than two terms are given.

    Example:
        terms = ['a', 'b', 'c']
        expect = [('a', 'b'), ('a', 'c'), ('b', 'c')]
        assert create_term_network(terms) == expect
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from itertools import combinations

    return list(combinations(terms, 2))
        

# Sanity check for create_term_network. The fixture uses its own names so it
# does not overwrite the notebook-level `terms` list loaded from the spreadsheet.
_example_terms = ['a', 'b', 'c']
_expected_pairs = [('a', 'b'), ('a', 'c'), ('b', 'c')]
assert create_term_network(_example_terms) == _expected_pairs

# Hashtag co-occurrence

Finding co-occurrence networks in the separately stored hashtags and tweetsXtags tables

## Determine whether a term has been used as a hashtag

In [None]:
def get_hashtag_id(term):
    """Build the SQL that looks up the tagID of the hashtag ``term``.

    The query string is only assembled here: it is neither executed nor
    returned, so the function currently always returns None.

    NOTE(review): ``term`` is interpolated into the SQL text without quoting
    or escaping — TODO switch to a bound parameter once this is wired up to
    the database.
    """
    template = """SELECT tagID FROM hashtags WHERE hashtag = %s"""
    query = template % term

def get_count_of_tweets_containing_hashtag(term):
    """Build the SQL that counts the distinct tweets tagged with ``term``.

    The query joins ``hashtags`` to ``tweetsXtags`` on tagID and selects the
    hashtag, its tagID and the distinct tweet count. The string is only
    assembled: it is neither executed nor returned, so the function
    currently always returns None.

    NOTE(review): ``term`` is interpolated into the SQL text without quoting
    or escaping — TODO switch to a bound parameter once this is wired up to
    the database.
    """
    template = """SELECT h.hashtag, h.tagID, count(DISTINCT x.tweetID) AS tweetCount 
    FROM hashtags h 
    INNER JOIN tweetsXtags x ON (h.tagID = x.tagID)
    WHERE h.hashtag = %s;"""
    query = template % term

def get_count_of_tweets_containing_two_hashtags(term1, term2):
    """Build the SQL that counts the distinct tweets carrying both hashtags.

    Two sub-selects each resolve one hashtag to the tweetIDs that use it;
    ``tweetsXtags`` is inner-joined against both, so only tweets present in
    both sets are counted. The string is only assembled: it is neither
    executed nor returned, so the function currently always returns None.

    NOTE(review): ``term1`` and ``term2`` are interpolated into the SQL text
    without quoting or escaping — TODO switch to bound parameters once this
    is wired up to the database.
    """
    template = """
        SELECT count(DISTINCT x.tweetID) 
        FROM tweetsXtags x 
        INNER JOIN (
            SELECT x1.tweetID AS id 
                FROM tweetsXtags x1 
                INNER JOIN (
                    SELECT tagID AS id 
                    FROM hashtags 
                    WHERE hashtag = %s
                    ) AS t1 
                    ON (x1.tagID = t1.id)
            ) AS t3
            ON x.tweetID = t3.id
        INNER JOIN (
          SELECT x2.tweetID AS id 
                FROM tweetsXtags x2 
                INNER JOIN (
                    SELECT tagID AS id 
                    FROM hashtags 
                    WHERE hashtag = %s
                ) AS t1 
                ON (x2.tagID = t1.id)
        ) AS t4
        ON x.tweetID = t4.id """
    query = template % (term1, term2)

In [None]:
# Build the pairwise network for the experimental terms.
# `create_node_network` is not defined at this point in the notebook (a fresh
# "Restart & Run All" raises NameError); the pair-building helper defined
# above that takes a list of terms is `create_term_network`.
create_term_network(experimentalTerms.tolist())

# Tweet word_map co-occurrence

Finding co-occurrence networks in the tweet data stored in word_maps

In [None]:

def load_tweet_id_data(term):
    """Read the stored tweet-id CSV for ``term`` into a DataFrame.

    The file is located with ``make_term_ids_filepath`` and is read with the
    column names 'idx' and 'id'.
    """
    csv_path = make_term_ids_filepath(term)
    return pd.read_csv(csv_path, names=['idx', 'id'])

def get_cooccurences(term1, term2):
    """Count the tweet-id rows of ``term1`` that also appear for ``term2``.

    Returns:
        tuple: (term1, term2, count), where count is the number of rows in
        term1's id file whose 'id' value occurs in term2's id file.
    """
    first_ids = load_tweet_id_data(term1)
    second_ids = load_tweet_id_data(term2)
    # Keep only the rows of the first term whose id is also listed for the second.
    shared_rows = first_ids[first_ids.id.isin(second_ids.id)]
    return (term1, term2, len(shared_rows))


In [None]:
# The column labels of the mapping sheet are the terms; pair each with every other.
terms = list(termMap.columns)
network = create_term_network(terms)

In [None]:
# One (term1, term2, count) tuple per pair of terms in the network.
edges = [get_cooccurences(*pair) for pair in network]

In [None]:
edges

### Make networkx graph object

In [None]:
# Folder for exported graphs and the default GEXF file used by the helpers below.
GRAPHS_FOLDER = "%s/temp_output/graphs" % environment.LOG_FOLDER_PATH
GRAPH_FILEPATH = "%s/experimental-terms-in-tweets.gexf" % GRAPHS_FOLDER

def write_graph_to_file(graph, filepath=GRAPH_FILEPATH):
    """Write ``graph`` to ``filepath`` in GEXF format (default: GRAPH_FILEPATH)."""
    nx.write_gexf(graph, filepath)

def load_graph_from_file(filepath=GRAPH_FILEPATH):
    """Load a graph from the GEXF file at ``filepath`` (default: GRAPH_FILEPATH).

    ``nx.read_gexf`` accepts a path (or an open file) and opens the path
    itself, so the former ``open_file(0, 'r')`` decorator is not needed. That
    decorator pre-opened the file in text mode and handed this function a
    file object rather than the path its parameter name promises.

    Returns:
        The graph built by ``nx.read_gexf``.
    """
    return nx.read_gexf(filepath)

### Make graph file for gephi

In [None]:
# This produces a network which is small enough that Gephi doesn't die.
# It does not get the degree of nodes right, since the number of co-occurrences
# is stored as the weight of a single edge, but it seems to be okay for
# visualizations.
# NOTE(review): pairs whose count is 0 still get an edge (with weight 0) here —
# confirm that is intended.
G = nx.Graph()   # or DiGraph, MultiGraph, MultiDiGraph, etc
G.add_weighted_edges_from(edges)
write_graph_to_file(G, '%s/exp-terms-in-tweets-for-gephi.gexf' % GRAPHS_FOLDER)

### Make full graph file 

The output of this will be very large

In [None]:
# Expand every co-occurrence count into that many parallel edges between the
# two terms. G.size() then reports the total number of edges.
G = nx.MultiGraph()   # or DiGraph, MultiGraph, MultiDiGraph, etc
for n1, n2, degree in edges:
    G.add_edges_from([(n1, n2)] * degree)
G.size()

In [None]:
# Write the full multigraph to the default GRAPH_FILEPATH, which is the file
# load_graph_from_file() reads back below. The previous bare name
# 'experimental-terms-in-tweets' wrote into the working directory instead,
# with no graphs folder and no .gexf extension.
write_graph_to_file(G)

In [None]:
# `with_labels` is the boolean switch for drawing node labels; `labels`
# expects a dict mapping nodes to label text, so passing True there fails.
nx.draw_networkx(G, with_labels=True)

### Load from gexf file

In [None]:
# Read the graph back from the default GRAPH_FILEPATH.
g = load_graph_from_file()

## Measures

In [None]:
def convert_nx_output_to_dataframe(output, measure_label):
    """
    Convert the per-node result of a networkx algorithm into a dataframe.

    Args:
        output: Mapping of node (term) -> value, as returned by e.g.
            ``nx.degree_centrality``. Any iterable of (node, value) pairs
            that ``dict()`` accepts works as well.
        measure_label: Name of the column that will hold the values.

    Returns:
        pd.DataFrame indexed by 'term' with the single column
        ``measure_label``. An empty ``output`` yields an empty frame with the
        same index name and column instead of raising KeyError.

    Example:
        dc = nx.degree_centrality(G)
        dc = convert_nx_output_to_dataframe(dc, 'degree_centrality')
    """
    values_by_term = dict(output)
    # Building from explicit columns keeps 'term' present even with no rows,
    # so set_index() cannot fail on an empty result.
    frame = pd.DataFrame({
        'term': list(values_by_term.keys()),
        measure_label: list(values_by_term.values()),
    })
    return frame.set_index('term')

### Degree of nodes

In [None]:
# Degree of every node in the loaded graph, as a one-column frame indexed by term.
d = nx.degree(g)
degree = convert_nx_output_to_dataframe(dict(d), 'degree')
degree

In [None]:
fig, axes = plt.subplots(figsize=(7, 4))
degree.plot(kind='barh', ax=axes)
# Title and axis labels let the figure stand alone when the notebook is skimmed.
axes.set(title='Degree of each term', xlabel='Degree', ylabel='Term')
fig.tight_layout()

### Degree centrality

In [None]:
# Degree centrality of every node in the loaded graph, as a frame indexed by term.
degree_centrality = convert_nx_output_to_dataframe(nx.degree_centrality(g), 'degree_centrality')

In [None]:
fig, axes = plt.subplots(figsize=(7, 4))
degree_centrality.plot(kind='barh', ax=axes)
# Title and axis labels let the figure stand alone when the notebook is skimmed.
axes.set(title='Degree centrality of each term', xlabel='Degree centrality', ylabel='Term')
fig.tight_layout()

In [None]:
# Clustering coefficient per node.
# NOTE(review): this uses `G` (the in-memory graph built above), whereas the
# degree measures use `g` (loaded from file) — confirm which graph is intended.
nx.clustering(G)

In [None]:
%run -i GraphEditingTools.py
%run -i GraphTools.py



In [None]:
nx.to_dict_of_lists(G)

In [None]:
# NOTE(review): `create_node_network` is not defined in any visible cell of this
# notebook; presumably it comes from the scripts loaded with `%run -i` above — verify.
create_node_network('migraine', 'crps')

In [None]:
b = load_tweet_id_data('migraine')
len(b)

In [None]:
b[:3]

In [None]:
c = load_tweet_id_data('crps')
len(c)

In [None]:
len(d)

In [None]:
c[:5]

In [None]:
j = c[c.id.isin(b.id)]
len(j)

In [None]:
j[:3]

In [None]:
 pos=nx.spring_layout(G)

In [None]:
experimentalTerms

In [None]:
experimentalTerms

In [None]:
# NOTE(review): in the visible cells `d` was last assigned from nx.degree(g), so
# this looks up a node keyed by the integer 1, while the nodes appear to be
# term strings — confirm this scratch cell is still needed.
d[1]

# Temporal distribution of tweets with stored hashtags