This notebook explores the tweet data on its own, independently of the user data.

In [1]:
%cd twitteranalysis
import pandas as pd
#Plotting 
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

# Set seaborn's global plot style to "whitegrid".
sns.set(style="whitegrid")

import environment
from DataTools import DataRepositories as DR
from DataTools import DataConnections as DC
from DataTools.WordORM import Word
from DataTools.TweetORM import Users as User
from DataTools.TweetORM import Tweet
from SearchTools.WordMaps import get_adjacent_word_counts, get_adjacent_words, get_tweet_ids_for_word

# Excel workbook with the experimental terms, located in the experiments folder
# configured in the project's `environment` module.
EXP_TERMS_FILEPATH = '%s/experimental-terms.xlsx' % environment.EXPERIMENTS_FOLDER
# CSV path for tweet ids under the log folder's temp_output directory.
IDS_FILEPATH = "%s/temp_output/tweet-ids.csv" % environment.LOG_FOLDER_PATH

def make_term_ids_filepath(term, path=environment.LOG_FOLDER_PATH):
    """Build the path of the csv file that stores the tweet ids for a term.

    term: used as the prefix of the file name
    path: root folder; defaults to the configured log folder
    Returns: "<path>/temp_output/tweet-ids/<term>-ids.csv"
    """
    template = "%s/temp_output/tweet-ids/%s-ids.csv"
    return template % (path, term)
    

(bookmark:twitteranalysis) -> /Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis
/Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis


In [2]:
# Find users whose profile contains an experimental term


def get_rows_for_terms(wordFrame, experimentalTerms):
    """Return the rows of wordFrame whose `term` value is one of experimentalTerms."""
    is_experimental = wordFrame.term.isin(experimentalTerms)
    return wordFrame[is_experimental]


def find_mapping(term, termMap):
    """Return the label of the first termMap column with a cell that contains term.

    Columns are checked in order, and matching is delegated to pandas'
    `Series.str.contains`. Returns False when no column matches.
    """
    for column in termMap.columns:
        hits = termMap[column].str.contains(term)
        if hits.any():
            return column
    return False


# Load the terms to search for. The `squeeze=True` argument of `read_excel` was
# deprecated in pandas 1.4 and removed in 2.0, so the single-column sheet is
# squeezed into a Series explicitly instead.
experimentalTerms = pd.read_excel(EXP_TERMS_FILEPATH, sheet_name='terms').squeeze("columns")
# Mapping sheet: the cells below use each column label as a term and the cells
# of that column as its subterms.
termMap = pd.read_excel(EXP_TERMS_FILEPATH, sheet_name='mapping')



In [3]:
def term_map_generator(termMap):
    """Yield a (term, subterm) tuple for every cell of termMap.

    Columns are visited in order; `term` is the column label and `subterm`
    is each value stored in that column.
    """
    for term in termMap.columns:
        column = termMap[term]
        for subterm in column:
            yield term, subterm

In [None]:
# Create the (term, subterm) generator so it can be inspected by hand
tmp = term_map_generator(termMap)

In [None]:
# Pull the next (term, subterm) pair from the generator
next(tmp)

# How many tweets contain each term?

In [None]:
%time

# iterate through each of the experimental terms, 
# getting tweet ids for each.

# ids = []
for t in termMap.T.index:
    tweets = []
    for subterm in termMap[t]:
        tweets += [x[0] for x in get_tweet_ids_for_word(subterm)]
    tweets = list(set(tweets))
    tweets = pd.Series(tweets, name=t)

    print("%s : %s" % (t, len(tweets)))
    tweets.to_csv(make_term_ids_filepath(t)) 

    
# ids = pd.DataFrame(ids)

# # Save results
# ids.T.to_csv(IDS_FILEPATH) 

# len(ids)

In [None]:
# Ids of the tweets found for the single word 'migraine'
# (the first field of each row returned by the search).
matches = get_tweet_ids_for_word('migraine')
tweets = [match[0] for match in matches]
len(tweets)

In [None]:
# Save the 'migraine' ids in the same layout the per-term loop writes and
# tweet_ids_for_term_generator reads: one unique id per row, in a Series named
# after the term. (Transposing a one-column frame wrote every id into a single
# csv row, which the reader cannot iterate over.)
ids = pd.Series(list(set(tweets)), name='migraine')

ids.to_csv(make_term_ids_filepath('migraine'))


# How many unique users have at least one captured tweet that uses each term?

In [13]:
# Build the 'mysql' engine through the project's DataConnections helpers and wrap
# it in a DAO; the query helper defined below reads the module-level `dao.session`.
e = DC.initialize_engine('mysql')
dao = DC.DAO(e)

creating connection: mysql twitter_data


In [12]:
def get_user_id_and_timestamp_for_tweet(tweetId):
    """Given a tweet id, this queries the mysql db and returns
    a tuple of the user id and timestamp.

    At most one row is fetched (`first()`), instead of materialising every
    matching Tweet with `all()` and then indexing the list.

    Returns: (userID, created_at)
    Raises: IndexError if no tweet with that id is stored. This is the same
        exception type the earlier `result[0]` lookup raised, now with a message.
    """
    # get the tweet object
    tweet = dao.session.query(Tweet).filter(Tweet.tweetID == tweetId).first()
    if tweet is None:
        raise IndexError("No tweet found with tweetID %r" % (tweetId,))
    return (tweet.userID, tweet.created_at)




def tweet_ids_for_term_generator(term):
    """Generator for iterating over the ids of tweets which contain the term.
    It loads the ids from the relevant file. It does not search the db.

    The file is expected to hold two columns: the saved index first and the
    tweet id second (the layout produced by saving a Series with `to_csv`).
    """
    fp = make_term_ids_filepath(term)
    print(fp)
    # load the tweet ids for the term
    # `squeeze=True` is not passed: pandas 2.0 removed it, and it never applied
    # here because the rows were read from a two-column DataFrame.
    ids = pd.read_csv(fp)
    # Walk the second column directly rather than using iterrows() with a
    # positional `row[1]` lookup, which is slow and deprecated for labelled rows.
    for tweetId in ids.iloc[:, 1]:
        yield tweetId

In [26]:

def fp_maker(term, path=environment.LOG_FOLDER_PATH):
    """Build the path of the csv file that stores tweet ids with their user ids for a term.

    term: used as the prefix of the file name
    path: root folder; defaults to the configured log folder
    Returns: "<path>/temp_output/tweet-ids-w-user-ids/<term>-ids.csv"
    """
    template = "%s/temp_output/tweet-ids-w-user-ids/%s-ids.csv"
    return template % (path, term)

# Terms left out of this pass.
# NOTE(review): 'crps' was hard-coded as skipped, presumably because its output
# file had already been produced in an earlier run -- confirm.
SKIP_TERMS = ('crps',)

passes = 0  # total number of tweet ids looked up across all terms
for term in termMap.columns:
    print(term)
    if term in SKIP_TERMS:
        continue
    records = []
    # Iterate the generator directly instead of calling next() inside
    # try/except StopIteration, so a StopIteration escaping from the db lookup
    # can never be mistaken for the end of the ids.
    for tweetId in tweet_ids_for_term_generator(term):
        passes += 1
        userId, timestamp = get_user_id_and_timestamp_for_tweet(tweetId)
        records.append({'tweetId': tweetId, 'userId': userId, 'timestamp': timestamp})
    # One csv per term: tweet id, user id and timestamp of every tweet looked up
    results = pd.DataFrame(records)
    results.to_csv(fp_maker(term))
    print('%s : %s results ' % (term, len(results)))
print(passes)

crps
migraine
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/migraine-ids.csv
migraine : 549395 results 
fibromyalgia
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/fibromyalgia-ids.csv
fibromyalgia : 662498 results 
spoonie
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/spoonie-ids.csv
spoonie : 563467 results 
vulvodynia
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/vulvodynia-ids.csv
vulvodynia : 5611 results 
endometriosis
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/endometriosis-ids.csv
endometriosis : 298428 results 
neuropathy
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/neuropathy-ids.csv
neuropathy : 62210 results 
arthritis
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/arthritis-ids.csv
arthritis : 573767 results 
rhem_arthritis
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/rhem_arthritis-ids.csv
rhem_arthritis : 27342

In [28]:
# Determine the number of unique users for each term from the per-term files of
# tweet ids with user ids. `nunique()` ignores missing user ids, whereas a set
# built from a list of values counts every NaN as a separate user.
userCounts = {
    term: pd.read_csv(fp_maker(term)).userId.nunique()
    for term in termMap.columns
}

userCounts

{'crps': 52477,
 'migraine': 158936,
 'fibromyalgia': 99718,
 'spoonie': 61915,
 'vulvodynia': 1445,
 'endometriosis': 56379,
 'neuropathy': 13674,
 'arthritis': 134146,
 'rhem_arthritis': 8986,
 'shingles': 1202,
 'backpain': 12423,
 'headache': 31892}

User counts -- number of unique users with at least one tweet using each term
{
'crps': 52477,
 'migraine': 158936,
 'fibromyalgia': 99718,
 'spoonie': 61915,
 'vulvodynia': 1445,
 'endometriosis': 56379,
 'neuropathy': 13674,
 'arthritis': 134146,
 'rhem_arthritis': 8986,
 'shingles': 1202,
 'backpain': 12423,
 'headache': 31892
 }

Results 

crps
migraine
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/migraine-ids.csv
migraine : 549395 results 
fibromyalgia
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/fibromyalgia-ids.csv
fibromyalgia : 662498 results 
spoonie
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/spoonie-ids.csv
spoonie : 563467 results 
vulvodynia
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/vulvodynia-ids.csv
vulvodynia : 5611 results 
endometriosis
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/endometriosis-ids.csv
endometriosis : 298428 results 
neuropathy
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/neuropathy-ids.csv
neuropathy : 62210 results 
arthritis
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/arthritis-ids.csv
arthritis : 573767 results 
rhem_arthritis
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/rhem_arthritis-ids.csv
rhem_arthritis : 27342 results 
shingles
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/shingles-ids.csv
shingles : 2085 results 
backpain
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/backpain-ids.csv
backpain : 50053 results 
headache
/Users/adam/Desktop/TwitterDataAnalysisLogs/temp_output/tweet-ids/headache-ids.csv
headache : 152961 results 
2947817


In [18]:
# Load the saved tweet id / user id rows for 'crps' and count them
t = pd.read_csv(fp_maker('crps'))
len(t)

310138

In [23]:
# 7 minutes 42 seconds, expressed in seconds.
# NOTE(review): presumably the measured run time of the 'crps' lookup -- confirm.
seconds = (7* 60) + 42

In [24]:
# 310138 is the row count of the 'crps' file (the len(t) result shown earlier).
# Dividing it by the seconds gives rows per second, despite the variable's name.
perRecord = 310138 / seconds

In [25]:
# Display the computed rate
perRecord

671.2943722943722