This includes tools for examining which words occur together anywhere in a tweet or user profile without regard to their relative positions.

Tools for investigating the relative positions of words are in the Word Searching notebook.

In [1]:
%cd twitteranalysis
import sqlite3
import environment
import pandas as pd
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
%config InlineBackend.figure_format = 'svg'

#Set default to svg

#let pandas dataframe listings go long
pd.options.display.max_rows = 999


EXP_TERMS_FILEPATH = '%s/experimental-terms.xlsx' % environment.EXPERIMENTS_FOLDER
IDS_FILEPATH = "%s/temp_output/tweet-ids.csv" % environment.LOG_FOLDER_PATH

def make_term_ids_filepath(term, path=environment.LOG_FOLDER_PATH):
    """Build the path of the per-term ids CSV file.

    Args:
        term: term interpolated into the file name.
        path: base folder; defaults to environment.LOG_FOLDER_PATH.

    Returns:
        The string "<path>/temp_output/tweet-ids/<term>-ids.csv".
    """
    pieces = (path, term)
    return "%s/temp_output/tweet-ids/%s-ids.csv" % pieces


# Load in the terms to search for.
# `squeeze=True` was deprecated in pandas 1.4 and removed in 2.0; squeezing
# after the read gives the same result (a one-column sheet collapses to a Series).
experimentalTerms = pd.read_excel(EXP_TERMS_FILEPATH, sheet_name='terms').squeeze("columns")
termMap = pd.read_excel(EXP_TERMS_FILEPATH, sheet_name='mapping')

# The term names are the column labels of the mapping sheet
terms = list(termMap.columns)

(bookmark:twitteranalysis) -> /Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis
/Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis


In [2]:
CO_OCCURRING_TERMS_FOLDER = "%s/co-occurring-terms" % environment.LOG_FOLDER_PATH

def make_filepath(word):
    """Return the pickle path used for the co-occurrence results of `word`.

    The file sits in CO_OCCURRING_TERMS_FOLDER and is named
    "words-in-tweets-using-<word>.pickle".
    """
    path_parts = (CO_OCCURRING_TERMS_FOLDER, word)
    return "%s/words-in-tweets-using-%s.pickle" % path_parts


# Create test data

In [None]:
TEST_DB = "%s/test-data.db" % environment.LOG_FOLDER_PATH

tweet1 = 'a b c d e f. a b c d.'
tweet2 = 'e f g h. i j k l.'
tweet3 = 'a e. a. i.'

In [None]:
# Expected per-word counts for the test tweets, keyed by word.
# NOTE(review): this notebook does not show which query these counts are
# meant to validate -- confirm before relying on them.
expect = dict(
    a=2, b=1, c=1, d=1,
    e=2, f=1, g=1, h=1,
    i=2, j=1, k=1, l=1,
)

## All words for tweets

In [None]:
 # query = """
# SELECT a.word, count(a.word)
# FROM word_map a
# WHERE a.tweet_id = (
# SELECT tweet_id
# FROM word_map b
# WHERE b.word = ? 
# )
# GROUP BY a.word

# """


# Process and save co-occurrence frequencies


NB: this uses the stopword-free word lists.

In [3]:
def get_all_words_in_tweet(tweetId, db):
    """
    Returns all the words used in the tweet

    Args:
        tweetId: id of the tweet; matched against word_map.tweet_id.
        db: path of the sqlite database file to query.

    Returns:
        A list with one tuple per matching word_map row (all columns),
        or an empty list when the tweet has no rows.

    Example:
        words = get_all_words_in_tweet(331546674315014144, db=environment.TWEET_DB_NO_STOP)
        words = [x[2] for x in words]
    Result:
        words = ['thought', 'crying', 
        'like', 'crazy',
        'im', 'tired', 
        'pain','inevitability', 
        'rely', 'life',
        'spoonie']
    """
    query = "SELECT * FROM word_map WHERE tweet_id = ?"
    conn = sqlite3.connect(db)
    try:
        return conn.execute(query, (tweetId,)).fetchall()
    finally:
        # `with conn:` only commits/rolls back the transaction; it does not
        # close the connection, so close it explicitly to avoid leaking handles.
        conn.close()


In [4]:


def get_cooccuring_word_frequencies(word, save_path=None, db=environment.TWEET_DB_NO_STOP):
    """Returns a dataframe of the frequencies with which words
    appear in tweets containing the provided word

    word_map is joined to itself on tweet_id, so for every word the result
    holds the number of distinct tweets that contain both that word and
    `word`. The searched word itself is one of the returned rows.

    todo Must be verified that this is correct!!!!!

    Args:
        word: the word whose co-occurring words are counted.
        save_path: if given, the resulting dataframe is also pickled to this path.
        db: path of the sqlite database file to query.

    Returns:
        A DataFrame with columns 'word' and 'freq'. When no tweet contains
        `word` the frame is empty but still has both columns.
    """

    query = """
    SELECT a.word, count(distinct a.tweet_id)
    FROM word_map a
    JOIN word_map b
    ON a.tweet_id = b.tweet_id
    WHERE b.word = ?
    GROUP BY a.word
    """

    conn = sqlite3.connect(db)
    try:
        rows = conn.execute(query, (word,)).fetchall()
    finally:
        # `with conn:` would not close the connection; close it explicitly
        conn.close()

    # Naming the columns explicitly keeps 'word'/'freq' present even when
    # the query returns no rows, so callers can always set_index('word').
    result = pd.DataFrame(rows, columns=['word', 'freq'])

    if save_path is not None:
        result.to_pickle(save_path)

    return result




Terms saved:
    arthritis
    hurt
    pain
    toothache
    
    'crps',
 'migraine',
 'fibromyalgia',
 'spoonie',
 'vulvodynia',
 'endometriosis',
 'neuropathy',
 'arthritis',
 'rhem_arthritis',
 'shingles',
 'backpain',
 'headache'
    
Execution times:

    wall time: 2min 29s


In [None]:
%%time
term = 'pain'

# Returns a dataframe with columns 'word' and 'freq'
# NB, this also saves the result to the pickle file given by make_filepath(term)
result = get_cooccuring_word_frequencies(term, make_filepath(term))
print(len(result))

In [5]:
# runtime for all but 2 terms: 5m 22s
# Terms left out of this batch run; defined once instead of on every iteration
skip = ['arthritis', 'rhem_arthritis']
for term in terms:
    if term in skip:
        continue
    # Called for its side effect: the frequencies are pickled to make_filepath(term)
    get_cooccuring_word_frequencies(term, make_filepath(term))


# Explore co-occurrences

## Terms occurring in a tweet containing the term 'pain'

In [None]:
# variables for this section
term = 'pain'
cutoff = 10000

In [None]:
# Read the pickled co-occurrence counts for `term`, indexed by word
data = pd.read_pickle(make_filepath(term)).set_index('word')

# keep only the words whose frequency exceeds the cutoff
above_cutoff = data['freq'] > cutoff
data = data[above_cutoff]
print("%s terms appear with '%s' in more than %s tweets" % (len(data), term, cutoff))

In [None]:
# top words which occur somewhere in a tweet where the word
# 'pain' occurs.
top = data.sort_values('freq').tail(100)
top

In [None]:
# Distribution of the frequencies among the top co-occurring terms
fig, axes = plt.subplots(figsize=(7, 5))
sns.violinplot(top.freq, ax=axes)
axes.set_title('Frequency in the top 100 terms co-occuring with %s' % term)
fig.tight_layout()

In [None]:
d = pd.read_pickle(make_filepath(term))
d.set_index('word', inplace=True)
len(d)

In [None]:
d.sort_values('freq').head(100)

In [None]:
# Violin plot of the unfiltered co-occurrence frame `d`.
# NOTE(review): `d` is passed positionally; how seaborn interprets a positional
# DataFrame differs between seaborn versions -- confirm against the installed one.
fig, axes = plt.subplots(figsize=(8,8))
sns.violinplot(d, ax=axes)
fig.tight_layout()

# Term co-occurrence frequencies

How often do the most common co-occurring terms for each of our experimental terms occur?


In [None]:
# Load all term data

In [55]:

def term_generator(skip=()):
    """Yield each entry of the module-level `terms` list that is not in `skip`.

    Args:
        skip: container of terms to leave out. Defaults to an empty tuple
            (immutable, so the default is never shared mutable state);
            None is treated as "skip nothing".

    Yields:
        The terms, in the order they appear in `terms`.
    """
    excluded = () if skip is None else skip
    for term in terms:
        if term not in excluded:
            yield term
def rank_gen():
    """Yield the consecutive integers 0, 1, 2, ... without end."""
    # Function-scope import: the notebook's import cell does not bring in itertools
    from itertools import count
    # itertools.count replaces the hand-rolled while/increment counter
    yield from count()
         
def add_rank_column(frame, term):
    """Add a "<term>-rank" column holding each row's 0-based position.

    The rank is purely positional, so sort the frame before calling if the
    rank should reflect an ordering.

    Args:
        frame: DataFrame to annotate; the column is added to it in place.
        term: name used to build the column label "<term>-rank".

    Returns:
        The same `frame` object, with the rank column added.
    """
    # Assign the positions directly instead of a row-wise apply over a
    # stateful generator: that relied on pandas calling the function exactly
    # once per row, in order, and did not yield a usable column for an empty frame.
    frame["%s-rank" % term] = range(len(frame))
    return frame

In [56]:
r = rank_gen()

In [57]:
next(r)

0

In [58]:
# NOTE(review): `frames` is only assigned in a later cell (the cutoff loop), so this
# cell fails on a fresh Restart & Run All. add_rank_column also adds the
# 'taco-rank' column to frames[0] itself, not to a copy.
f = add_rank_column(frames[0], 'taco')

In [59]:
f[:5]

Unnamed: 0_level_0,crps,%s-rank,taco-rank
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mariemcgowin,10513,,0
fibro,10563,,1
pls,10566,,2
spoonie,11309,,3
help,11448,,4


In [65]:
# Only words co-occurring in more than `cutoff` tweets are kept for each term
cutoff = 10000
skip = ['rhem_arthritis']

tgen = term_generator(skip)
frames = []
rankframes = []  # not filled in this cell; kept in case a later cell expects it
# A plain for-loop replaces the manual next()/StopIteration handling, which
# wrapped the whole body and would have silently ended the loop on any
# StopIteration raised while processing a term.
for term in tgen:
    # Load the pickled counts, index by word, most frequent first
    d = (
        pd.read_pickle(make_filepath(term))
        .set_index('word')
        .sort_values('freq', ascending=False)
    )
    # Rank is added before culling, so it reflects the position in the full list
    d = add_rank_column(d, term)
    # Cull below the cutoff and label the frequency column with the term;
    # non-inplace calls avoid mutating a filtered slice (SettingWithCopyWarning)
    d = d[d.freq > cutoff].rename({'freq': term}, axis=1)
    frames.append(d)
    print("%s : %s " % (term, len(d)))


crps : 24 
migraine : 40 
fibromyalgia : 38 
spoonie : 52 
vulvodynia : 0 
endometriosis : 17 
neuropathy : 4 
arthritis : 61 
shingles : 0 
backpain : 5 
headache : 10 


In [66]:
data = pd.concat(frames, axis=1)

In [29]:
# NOTE(review): scratch cell. inplace=True mutates the frame stored in `frames`.
# The cutoff loop already renames 'freq' to the term name, so after that loop has
# run there is presumably no 'freq' column left to rename -- the saved output
# appears to come from an earlier session; confirm or delete this cell.
frames[0].rename({'freq': 'j'}, axis=1,inplace=True)
frames[0][:5]

Unnamed: 0_level_0,j
word,Unnamed: 1_level_1
...,17938
amp,23165
awareness,18576
bnightscrps,11947
celebrity,13408


In [34]:
len(data)

130

In [68]:
# Write the combined co-occurrence table to an Excel workbook under the log folder.
# NOTE(review): nothing in this cell creates `folder` -- presumably it must
# already exist for the write to succeed; verify.
folder = "%s/temp_output/co-occurrence" % environment.LOG_FOLDER_PATH
data.to_excel("%s/cooccurrences.xlsx" % folder)

In [None]:
x = data.sort_values('word')[7:16]
[y[1] for y in x.values]