In [None]:
#### This notebook is dedicated to determining which users are pain patients


### ToDo

* Isolate people who have only tweeted about migraine vs. CRPS (etc.). Look for systematic differences
* Do analysis on all words in texts
* Try doing the analysis on twitter users rather than tweets
* Combine terms like 'fibro' and 'fibromyalgia'
* Store lists of things to ignore/combine/categorize etc in spreadsheet and import?

### Notes
SQL query for counting the number of tweets in which each hashtag is used:
SELECT h.hashtag, count(t.tweetID) AS totTweets FROM hashtags h INNER JOIN tweetsXtags t ON h.tagID = t.tagID GROUP BY hashtag ORDER BY totTweets DESC ;

#### This cell was imported from twitter_data_analysis 11/9/16
That was not the last time this notebook was updated 

# Setup
## Base imports

In [None]:
#standard lib
import os
import string

#other people's property
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
from pandas import DataFrame, Series
import sqlalchemy

#Plotting 
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [None]:
#Base Classes stuff
from FileSystemTools import *
from UtilityDecorators import *
from UtilityFunctions import *


# Root folder under which the Desktop and Dropbox folders used by this notebook live
BASE = getSystemRoot()
# Locations of the intermediate results of the profile word count.
# All of them are built from BASE, so that the notebook does not depend
# on one particular user's home folder.
USER_PROFILE_TERM_FREQ_FILEPATH = '%s/Desktop/TEMPORARY/user-profile-word-counts.csv' % BASE
MIN_100_NO_STOPS = '%s/Desktop/TEMPORARY/user-profile-word-counts-min-100-ex-stops.csv' % BASE
MAP_PICKLE_PATH = '%s/Desktop/TEMPORARY/user-id-map' % BASE

In [None]:
# Global variables

# Frequency cutoff for the term list: only terms which occur
# more than MIN_FREQ times are kept; all other terms are excluded
MIN_FREQ = 100

## Create cursor for user data

In [None]:
%cd twitteranalysis
# %run -i DataTools/Cursors
from DataTools.Cursors import UserCursor
u = UserCursor(language='en')

In [None]:
j = u.next()

In [None]:
j

# Word counts in user descriptions

In [None]:
%cd texttools
from TextProcessors.Filters import *
from TextProcessors.Modifiers import *
from TextProcessors.Processors import *

# Filters applied to each token
# (presumably each one rejects the kind of token it is named for — TODO confirm in TextProcessors.Filters)
filters = [UsernameFilter(), PunctuationFilter(), URLFilter(), NumeralFilter()]

# Modifiers applied to each token
# (presumably each one rewrites the token — TODO confirm in TextProcessors.Modifiers)
modifiers = [WierdBPrefixConverter(), CaseConverter()]

In [None]:
%cd texttools
%run -i StatisticalTools/Counters
counter = WordCounter()
counter.add_filters(filters)
counter.add_modifiers(modifiers)

# Integrating with older tools

In [None]:
# Statistical tools approach is better, but
# older version separates work better...

%cd twitteranalysis
%run -i ProcessingTools/QueueTools.py
%run -i ProcessingTools/ProcessingControllers.py
%run -i ProcessingTools/Listeners.py

# First set up the object which will handle applying
# filters and modifiers to each word
word_processor = SingleWordProcessor()
word_processor.add_filters(filters)
word_processor.add_modifiers(modifiers)

# Set up the machinery for saving the 
# processed results
queueHandler = SaveQueueHandler()
listener = SaveListener()
queueHandler.register_listener(listener)

# Finally create the command and control
control = UserProcessingController(queueHandler)
control.load_word_processor(word_processor)

In [24]:
%cd twitteranalysis

# Make a fake user
%run -i TestingTools/Factories.py
%run -i TestingTools/DummyCursors.py
dummyCursor =DummyUserCursor()
# user = UserFactory()

(bookmark:twitteranalysis) -> /Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis
/Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis


In [27]:
u = dummyCursor.next()
u.description

'Staff future great position catch deep call. Leg itself professional front memory long. Call financial behavior song site.'

In [None]:
import asyncio


async def foo():
    """Print a message, give up control to the event loop once, then print a second message."""
    print('Running in foo')
    # Awaiting sleep(0) suspends this coroutine so that other scheduled tasks get a turn
    await asyncio.sleep(0)
    print('Explicit context switch to foo again')


async def bar():
    """Print a message, give up control to the event loop once, then print a second message."""
    print('Explicit context to bar')
    # Awaiting sleep(0) suspends this coroutine so that other scheduled tasks get a turn
    await asyncio.sleep(0)
    print('Implicit context switch back to bar')


# Schedule both coroutines as tasks on the event loop and run the loop until both have finished
ioloop = asyncio.get_event_loop()
tasks = [ioloop.create_task(foo()), ioloop.create_task(bar())]
wait_tasks = asyncio.wait(tasks)
ioloop.run_until_complete(wait_tasks)
# NOTE(review): this closes the loop returned by get_event_loop(). If that is the loop
# the notebook kernel itself runs on, later cells may stop working — confirm
ioloop.close()

In [None]:
Us

In [None]:
user = u.next()
user.description

In [None]:
control.process(user)

In [None]:
queueHandler.queue.queue

In [None]:
# This model works faster
us = []
for i in range(0, 20):
    us.append(u.next())

[ control.process(u) for u in us]

In [None]:
# Next task: getting the queueHandler to do its job
# and save to sql

### Process the user descriptions for word counts

In [None]:
# Don't know why but this is incredibly slow
# restricted number of users for testing
# max_users = 2

# for i in range(0, max_users):
#     user = u.next()
#     upc.process(user)
# #     counter.process(user.description)


In [None]:
sq.queue.queue

In [None]:

def process_word_freq_in_user_profiles(counter, userCursor):
    """Count the words in the description of every user the cursor returns.

    Args:
        counter: Object with a ``process(text)`` method and a ``counts``
            attribute. Every user description is passed to ``process``;
            ``counts`` is read once all users have been handled.
        userCursor: Object whose ``next()`` method returns the next user
            (an object with a ``description`` attribute) and raises
            StopIteration when there are no users left.

    Returns:
        Series built from ``counter.counts``
    """
    user_count = 0

    while True:
        # Only the cursor call is guarded. A StopIteration escaping from
        # counter.process() is an error inside the counter and must not be
        # mistaken for the cursor running out of users, which would end the
        # run early and report partial counts as if they were complete.
        try:
            user = userCursor.next()
        except StopIteration:
            break

        # Note that we're not going to add the id to the map yet
        counter.process(user.description)
        user_count += 1

    print("%s users processed; %s words identified" % (user_count, len(counter.counts)))
    return Series(counter.counts)

# word_freq_in_user_profiles = process_word_freq_in_user_profiles(counter, u)
# word_freq_in_user_profiles.to_csv(USER_PROFILE_TERM_FREQ_FILEPATH)

# took 6m31s to process 
# 1162362 users processed; 846948 words identified


### Report word frequencies in user profiles

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
word_freq_in_user_profiles.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
ax.set_title("Term frequency in user profiles")
fig.tight_layout()

## Truncate the found word list

In [None]:
# Remove terms 
words = word_freq_in_user_profiles.loc[lambda x: x > MIN_FREQ]
print("%s words out of %s appear more often than %sx" % (len(words), len(word_freq_in_user_profiles), MIN_FREQ))

In [None]:
out = '/Users/adam/Desktop/TEMPORARY/user-profile-word-counts-over-100.csv'
# words.to_csv(out)

In [None]:
fig, ax = plt.subplots()
words.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
ax.set_title("Term frequency in user profiles (min: %s)" % MIN_FREQ)
fig.tight_layout()

## Remove stopwords 

In [None]:
# Stopword sets keyed by language, filled on first use.
# Asking nltk for the word list on every call builds a fresh list each time,
# and membership in a list is a linear scan; remove_stopwords calls
# is_stopword once per term, so the list is fetched once and kept as a set.
_STOPWORDS_BY_LANGUAGE = {}


def is_stopword(text, language='english'):
    """Check whether the supplied text is a stopword.

    Args:
        text: The term to look up. It is compared exactly as given.
        language: Name of the nltk stopword list to check against

    Returns:
        True if text is in the stopword list, otherwise False
    """
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(nltk.corpus.stopwords.words(language))
    try:
        return text in _STOPWORDS_BY_LANGUAGE[language]
    except TypeError:
        # An unhashable value cannot be looked up in a set; it cannot be
        # a stopword either, which is what the list lookup used to report
        return False

def remove_stopwords(series):
    """Build a new Series from the entries of series whose key is not a stopword.

    The number of entries left out is printed.
    """
    kept = {term: series[term] for term in series.keys() if not is_stopword(term)}
    n_removed = len(series) - len(kept)
    print("Removed %s stopwords from the words list" % n_removed)
    return Series(kept)

words = remove_stopwords(words)
#words.to_csv(MIN_100_NO_STOPS)

### Word frequencies in user profiles

In [None]:
words = pd.read_csv(MIN_100_NO_STOPS, header=None, names=['term', 'freq'])
len(words)

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
words.freq.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
ax.set_title("User profile term freq (min: %s ; stopwords removed)" % MIN_FREQ)
fig.tight_layout()

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
sns.distplot(words.freq, ax=ax)
ax.set_title("KDE of Term frequency in user profiles")

In [None]:
words.max()

In [None]:
w = words.sort_values(by='freq', ascending=False)
w[:10]

In [None]:
j = pd.read_csv(MIN_100_NO_STOPS)

In [None]:
len(j)

In [None]:
ww = words.copy(deep=True)[:10]
w5 = ww.loc[lambda x : x.freq > 1000]
w5

In [None]:
j[:5]

In [None]:
# Remove terms 
def filter_freq_list_by_min(wordFrame, minFreq):
    """Select the rows of wordFrame whose freq is greater than minFreq.

    Prints how many rows made the cut. The selection is returned as a
    separate frame; wordFrame is not modified.
    """
    above_min = wordFrame.freq > minFreq
    frequent = wordFrame.loc[above_min]
    print("%s words out of %s appear more often than %sx" % (len(frequent), len(wordFrame), minFreq))
    return frequent


In [None]:
w4 = filter_freq_list_by_min(words, 10000)
len(w4)

In [None]:
w4

In [None]:
%cd texttools
%run -i StatisticalTools/Counters
selectedCounter = SelectedWordCounter()
selectedCounter.set_words_to_count(list(counter.map.keys()))

In [None]:
selectedCounter.map

In [None]:
# restricted number of users for testing
max_users = 10



for i in range(0, max_users):
    user = u.next()
    selectedCounter.process(user.description, user.userID)

    
# todo save data

In [None]:
selectedCounter.counts

In [None]:
usersWithTermInProfile =[]

In [None]:

def get_rows_for_terms(wordFrame, experimentalTerms):
    """Return the rows of wordFrame whose term is one of the experimentalTerms."""
    is_experimental = wordFrame.term.isin(experimentalTerms)
    return wordFrame[is_experimental]

EXP_TERMS_FILEPATH = '%s/Dropbox/PainNarrativesLab/Data/experimental-terms.xlsx' % BASE
# read_excel's `squeeze` keyword was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the columns of the parsed frame gives the same result: a Series
# when the sheet has a single column, otherwise the frame unchanged.
experimentalTerms = pd.read_excel(EXP_TERMS_FILEPATH).squeeze("columns")

In [None]:
get_rows_for_terms(words, experimentalTerms)

# Logger

In [None]:
from FileSystemTools import *
from UtilityDecorators import *
from UtilityFunctions import *


BASE = getSystemRoot()
LOG_FOLDER_PATH = "%s/Desktop/TwitterDataAnalysisLogs" %BASE
DEFAULT_LOG_FILE_NAME = 'twitter_log.txt'
DEFAULT_LOG_FILE_PATH = "%s/%s" % (LOG_FOLDER_PATH, DEFAULT_LOG_FILE_NAME)


In [None]:
%cd tweetloggers
%run -i FileLoggers

In [None]:
%bookmark 

In [None]:
f = FileWritingLogger(name='best')

In [None]:
f.log('j')

In [None]:
fwl = FileWritingLogger()

In [None]:
fwl.log('taco')

In [None]:
fwl2 = FileWritingLogger('new')

In [None]:
f.log_error('tacoooooo')

In [None]:
sol = StdOutLogger()

In [None]:
sol.log('taco')

# Categorize users by condition

In [None]:
# Determine number of valid users identified and number of tweets each

In [None]:
stripNonAlphaNum('!jip!')

In [None]:

wordTokenizer = WordTokenizer()
word_processor = SingleWordProcessor()


word_processor.add_to_filters( TextProcessors.Filters.UsernameFilter( ) )
word_processor.add_to_filters( TextProcessors.Filters.PunctuationFilter( ) )
word_processor.add_to_filters( TextProcessors.Filters.URLFilter( ) )
word_processor.add_to_filters( TextProcessors.Filters.NumeralFilter( ) )
word_processor.add_to_modifiers( TextProcessors.Modifiers.WierdBPrefixConverter() )
# processor.add_to_modifiers( TextProcessors.Modifiers.UnicodeConverter() )
word_processor.add_to_modifiers( TextProcessors.Modifiers.CaseConverter( ) )


# Create queue and listeners for processed tokens
Queue = SaveQueueHandler()
Queue.register_listener(SaveListener())

# Load cursor for tweet ids
cursor = DataTools.Cursors.TweetCursor()

StringProcessingWorker.initialize(cursor, Queue, word_processor)
threads = []

# for _ in range(1):
worker = StringProcessingWorker()
# worker.do_it()



In [None]:
%cd textanalysis
%run -i ProcessingTools/SearchTools.py


In [None]:
s = Searcher()
spoonie_results = s.search('Spoonie')

In [None]:
# This cell was imported from twitter_data_analysis 11/9/16 
%cd textanalysis
%run -i ProcessingTools/SearchTools.py

# \section{Word frequency}
# freq = pd.read_excel("%s/freq_wordlist.xlsx" % DATAFOLDER)
# len(freq)

# fig, ax = plt.subplots()
# freq.freq.hist(ax=ax, bins=100, bottom=0.1)
# ax.set_yscale('log')
# ax.set_title("Term frequency in user profiles")

# trimmed_freq = freq[freq.freq > 1000]
# len(trimmed_freq)

# fig, ax = plt.subplots()
# trimmed_freq.freq.hist(ax=ax, bins=100, bottom=0.1)
# ax.set_yscale('log')
# ax.set_title("Term frequency in user profiles")

from beautifulsoup4.BeautifulSoup import BeautifulStoneSoup

def formatter(text):
    """Replace the HTML/XML character entities in text with the characters they stand for.

    Args:
        text: String which may contain entities such as ``&amp;`` or ``&#39;``

    Returns:
        The text with its entities converted. If text is not a string,
        an error line is printed and '' is returned.
    """
    # The earlier implementation called the Python 2 builtin `unicode` inside
    # a bare `except`. Under Python 3 that call raises NameError, which was
    # swallowed, so every input came back as ''. The standard library does
    # the entity conversion without a third-party parser.
    # NOTE(review): unlike a round-trip through a soup parser, this leaves
    # any markup in text untouched — confirm that is acceptable
    import html

    if not isinstance(text, str):
        print ("Error %s" % (text,))
        return ''
    return html.unescape(text)


# Run every term through formatter
trimmed_freq['word'] = trimmed_freq['word'].map(formatter)

# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
# The ten rows with the highest freq (the result is not assigned)
trimmed_freq.sort_values(by='freq', ascending=False)[:10]

# Call condition_searcher once per fibromyalgia alias, feeding the value it
# returns back in as the second argument of the next call
# NOTE(review): unlike `crps` below, `fibro` is not initialized anywhere in this
# cell before it is first passed to condition_searcher — confirm where it is defined
fibro_aliases = ['Fibromyalgia', 'Fibro', 'fibro*']
for t in fibro_aliases:
    fibro = condition_searcher(t, fibro)

# Same procedure for the CRPS aliases, starting from a fresh Condition object
crps = Condition('crps')
crps_aliases = ['crps', 'RSD', 'c.r.p.s.', 'r.s.d.', 'complex regional pain syndrome', 'reflex sympathetic dystrophy']
for t in crps_aliases:
    crps = condition_searcher(t, crps)

import numpy
d = numpy.genfromtxt('/Users/adam/Desktop/freq_wordlist.txt')

import asciitable
h = DataFrame(asciitable.read('/Users/adam/Desktop/freq_wordlist.txt'))

dt = pd.read_table('/Users/adam/Desktop/freq_wordlist.csv', delim_whitespace=True, encoding='utf-8')

dt = dt.applymap(lambda x: x['word'].encode('ascii', 'replace'))

s = Searcher()
crps_results = s.search('((crps) | (RSD) | (r.s.d.) | (c.r.p.s.) | (complex regional pain syndrome) | (chronic regional pain syndrome) | (reflex sympathetic dystrophy))')
s = Searcher()
migraine_results = s.search('((migraine) | (Migraineur) | (migr*))')
s = Searcher()
fibro_results = s.search('((Fibromyalgia) | (Fibro) | (fibro*) | (fm) | (fms))')
s = Searcher()
spoonie_results = s.search('Spoonie')
s = Searcher()
vulvodynia_results = s.search('Vulvodynia | Vulvadynia')
s = Searcher()
endo_results = s.search('endometriosis | endo')
s = Searcher()
neuropathy_results = s.search('neuropathy')
s = Searcher()
arthritis_results = s.search('((arthritis) | (*arthritis) | (oa) | (ra))' )
s = Searcher()
neuralgia_results = s.search('(neuralgia) | (*neuralgia)')
s = Searcher()
shingles_results = s.search('((shingles) | (post-herpetic neuralgia) | (PHN))')
s = Searcher()
backpain_results = s.search('(back pain | backpain)')
s = Searcher()
headache = s.search('headache')

migraine_users = ug.get_from_list('migraine', migraine_results['userids'])
crps_users = ug.get_from_list('crps', crps_results['userids'])
fibro_users = ug.get_from_list('fibromyalgia', fibro_results['userids'])
spoonie_users = ug.get_from_list('spoonie', spoonie_results['userids'])
vulvodynia_users = ug.get_from_list('vulvodynia', vulvodynia_results['userids'])
endo_users = ug.get_from_list('endometriosis', endo_results['userids'])
neuropathy_users = ug.get_from_list('neuropathy', neuropathy_results['userids'])
arthritis_users = ug.get_from_list('arthritis', arthritis_results['userids'])
neuralgia_users = ug.get_from_list('neuralgia', neuralgia_results['userids'])
shingles_users = ug.get_from_list('shingles', shingles_results['userids'])
backpain_users = ug.get_from_list('backpain', backpain_results['userids'])

migraine_users.to_csv('%s/migraine.csv' % DATAFOLDER)
crps_users.to_csv('%s/crps.csv' % DATAFOLDER)
fibro_users.to_csv('%s/fibromyalgia.csv' % DATAFOLDER)
spoonie_users.to_csv('%s/spoonie.csv' % DATAFOLDER)
vulvodynia_users.to_csv('%s/vulvodynia.csv' % DATAFOLDER)
endo_users.to_csv('%s/endo.csv' % DATAFOLDER)
neuropathy_users.to_csv('%s/neuropathy.csv' % DATAFOLDER)
arthritis_users.to_csv('%s/arthritis.csv' % DATAFOLDER)
neuralgia_users.to_csv('%s/neuralgia.csv' % DATAFOLDER)
shingles_users.to_csv('%s/shingles.csv' % DATAFOLDER)
backpain_users.to_csv('%s/backpain.csv' % DATAFOLDER)

In [None]:
j='jjj'
print("%s" % j)

In [None]:
TextTools.TextFilters.remove_numerals("7")

In [None]:
"""
Used to be in FILEFOLDER = '%s/Desktop/user_categories' % BASE
Moved into Data folder in pain narratives lab
"""
def file_getter(filefolder=None):
    """
    Finds the Excel files stored in the folder or in any of its subfolders

    Args:
        filefolder: Path of the folder to search. When omitted, the
            notebook-level FILEFOLDER is used; it is looked up at call
            time, so it only has to exist once the function is called.

    Returns:
        List of the paths of the .xlsx files found
    """
    if filefolder is None:
        filefolder = FILEFOLDER

    datafiles = []
    for dirpath, _dirnames, filenames in os.walk(filefolder):
        for name in filenames:
            # Only spreadsheets; this also skips entries such as .DS_Store
            if name.endswith('.xlsx'):
                print (name)
                # Join with the directory currently being walked, not with
                # the top folder, so files in subfolders get their real path
                datafiles.append(os.path.join(dirpath, name))
    return datafiles


def cat_importer(filenames):
    """
    Import spreadsheets with categorizations of users,
    combine them, and return the result

    A file which cannot be read is reported and skipped.

    Args:
        filenames: List of full path excel files

    Returns:
        Dataframe with the rows of every file which could be read;
        an empty Dataframe if none of them could
    """
    frames = []
    for f in filenames:
        try:
            frames.append(pd.read_excel(f))
        except Exception as err:
            # Exception rather than a bare except, so that interrupting the
            # kernel still stops the loop; the cause is reported with the file
            print ("error with %s: %s" % (f, err))

    if not frames:
        # pd.concat raises ValueError when given nothing to combine
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
files = file_getter()
compiled = cat_importer(files)
print (len(compiled))

In [None]:
compiled.columns

In [None]:
# Make frames with categorized users
compiled = compiled[compiled.relevant == 1]
patients = compiled[compiled.patient == 1]
clinicians = compiled[compiled.clinician == 1]

# Separate by condition
patients_by_condition = patients.groupby('term')
clinicians_by_condition = clinicians.groupby('term')

In [None]:
for g, n in patients_by_condition:
    print(g, len(n)) 

In [None]:
# Pool the tokens of every migraine patient's profile into one flat bag of words
migraine_bag = []
m = patients_by_condition.get_group('migraine')
for profile in m.profile:
    # extend, not append: appending stored one list of tokens per profile,
    # and a bag made of lists cannot be counted by nltk.FreqDist because
    # lists are not hashable
    migraine_bag.extend(word_tokenize(profile))
# Number of tokens collected
print(len(migraine_bag))

In [None]:

bagmaker = TextTools.WordBagMaker()
bagmaker.add_to_ignorelist(ConstantsAndUtilities.Ignore.get_list())
bagmaker.add_to_ignorelist(list(string.punctuation))
bagmaker.add_to_ignorelist(nltk.corpus.stopwords.words('english'))
bagmaker.add_to_cleaners(TextTools.URLCleaner())
#bagmaker.add_to_cleaners(TextTools.NumeralCleaner())

In [None]:
'3'.isalpha()

In [None]:
nc = TextTools.NumeralCleaner()

In [None]:
nc.clean('cat')

In [None]:
m = patients_by_condition.get_group('migraine')
pw = [w for w in m.profile]

bagmaker.process(pw)
print(len(bagmaker.masterbag))

In [None]:
bagmaker.masterbag

In [None]:
wf = TextStats.WordFreq(bagmaker.masterbag)

In [None]:
wf.topN(20)

In [None]:
wf.plot(30)

In [None]:
mb = tuple(migraine_bag)
mfd = nltk.FreqDist(mb)

In [None]:
%cd twitteranalysis
import DataTools.Cursors

# Create tables in database
from DataTools.TweetORM import create_db_tables
# create_db_tables()

#%cd /Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis
#%bookmark twitteranalysis
%cd twitteranalysis
%run -i environment.py
%run -i ConstantsAndUtilities.py
%run -i TestingTools/DataAndFunctionsForTesting.py
%run -i "DataTools/DataStructures.py"
%run -i "DataTools/DataConnections.py"
%run -i "DataTools/WordORM.py"
%run -i "DataTools/DataRepositories.py" 

# Initialize the tools for filtering and modifying the individual tweet words
from TextProcessors.Filters import *
from TextProcessors.Modifiers import *
%cd texttools
%run -i TextProcessors/Processors
%run -i TextProcessors/Tokenizers

%run -i ProcessingTools/ProcessingControllers.py
%run -i ProcessingTools/Listeners.py
%run -i ProcessingTools/Workers.py
import DataTools.Cursors