In [1]:
import pandas as pd

import string
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist

In [9]:
# Display settings for this notebook: let text columns show up to 150
# characters so tweets are readable, and show 50 rows when a long
# frame/series repr is truncated.
pd.set_option('display.max_colwidth', 150)
pd.set_option('display.min_rows', 50)

In [3]:
# Read in the tweets that were collected by querying for 'anxiety'.
# The path is relative to the notebook's working directory.
df = pd.read_csv('csv_files/anxiety_tweets.csv')

In [5]:
# Discard the tweet metadata that this text analysis does not use.
# The drop happens in place, so `df` itself is modified.
metadata_columns = ['user', 'date', 'retweet', 'mention', 'hashtags', 'location']
df.drop(columns=metadata_columns, inplace=True)

In [13]:
# Sanity check: preview the first rows after dropping the metadata columns.
df.head()

Unnamed: 0,text
0,It's okay to not totally understand the specific struggles of queer Japanese / Japanese people... but you can be open to learning about it. Just b...
1,"10 things to know before returning to your hair salon Excitement, anxiety and a new normal at salons and barber shops as Long Island enters Phase ..."
2,"Well that information would have been useful 2 months ago, and avoided wasted anxiety every single time a person came even slightly close to enter..."
3,My anxiety hasn’t been the greatest lately.
4,i was mostly afraid because i have really bad anxiety and we've had misunderstandings in the past so i kept a lot to myself but recently it reache...


In [14]:
# Stop-word list used by the token filter. It is NLTK's English stop words
# followed by every character in string.punctuation, the single digits
# '0'-'9', the query term itself ('anxiety'), and 'rt' (presumably the
# retweet marker).
stop_words = (
    stopwords.words('english')
    + list(string.punctuation)
    + [str(digit) for digit in range(10)]
    + ['anxiety', 'rt']
)

In [16]:
#stop_words

In [17]:
def process_text(text):
    """Tokenize a tweet and return its lowercased, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw tweet text, passed straight to ``word_tokenize``.

    Returns
    -------
    list of str
        Lowercased tokens in their original order, excluding every token
        whose lowercase form is in the notebook-level ``stop_words`` list.
    """
    kept = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

In [18]:
# Replace each raw tweet string with its list of filtered tokens.
df['text'] = df['text'].apply(process_text)

### Checking words to see where preprocessing failed.

In [20]:
# Spot-check a handful of tokenized tweets (rows 15-19) for leftover noise.
df['text'][15:20]

15                                                                            [season, 13, reasons, stressing, tf, already, clay, really, think, batman, ’]
16    [fear, love, covid, horror, life, coronavirus, mentalhealth, art, depression, faith, hope, motivation, fearless, scary, dark, quotes, courage, cre...
17                                                                                                   [feel, like, 7pm, cheer, slowly, morphed, 7pm, scream]
18    [twitter, started, original, issues, twitter, wrong, place, get, help, unless, 's, really, intent, was.twitter, n't, professional, help, fact, thi...
19    [thread, beautiful, moments, protests, ’, attended, nyc, past, week, half, post, intended, glorify, protests, show, wish, spread, bit, positivity,...
Name: text, dtype: object

In [21]:
# Print the full token list for row 9973; its output shows how a link in the
# tweet ends up split into several tokens.
print(df['text'][9973])

['painting', 'batman', 'procreate', 'pocket', 'art', 'color', 'paint', 'iphone', 'finger', 'mark', 'stroke', 'brush', 'manhattan', 'new', 'york', 'https', '//www.instagram.com/p/b5lxqgylvx0/', 'igshid=j68k86tqc9l']


#### Results
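* Tokens appear to have been split effectively, with some exceptions (e.g. `was.twitter`).
* Hashtag and mention symbols have been removed, along with standalone ASCII punctuation. Hashtag words such as `mentalhealth` still appear as ordinary tokens, and the typographic apostrophe `’` is still present.
* Stop words are not present. However, URLs have been split into fragments (e.g. `https` and `//www.instagram.com/...`). To avoid this extra noise, I will remove URLs before tokenizing.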

## Iteration 2 

In [23]:
# Iteration 2: reload the raw tweets, because the 'text' column was
# overwritten with token lists during the first pass.
df = pd.read_csv('csv_files/anxiety_tweets.csv')

In [24]:
# Discard the tweet metadata that this text analysis does not use.
# The drop happens in place, so `df` itself is modified.
metadata_columns = ['user', 'date', 'retweet', 'mention', 'hashtags', 'location']
df.drop(columns=metadata_columns, inplace=True)

### Proposed Steps

* Remove URLs
* Remove stopwords
* Lowercase text
* Analyze text to see what was missed


In [25]:
# Preview the first two rows of the reloaded frame.
df.head(2)

Unnamed: 0,text
0,It's okay to not totally understand the specific struggles of queer Japanese / Japanese people... but you can be open to learning about it. Just b...
1,"10 things to know before returning to your hair salon Excitement, anxiety and a new normal at salons and barber shops as Long Island enters Phase ..."


In [27]:
#strip links before tokenizing so they are not split into noise tokens
def remove_urls(text):
    """Return ``text`` with every link deleted.

    A link is any run of non-whitespace characters that begins with
    'http'. Only the matched run is removed; the whitespace around it is
    left untouched.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Strip links from every raw tweet while the column still holds strings.
df['text'] = df['text'].apply(remove_urls)

In [28]:
# Stop-word list used by the token filter. It is NLTK's English stop words
# followed by every character in string.punctuation, the single digits
# '0'-'9', the query term itself ('anxiety'), and 'rt' (presumably the
# retweet marker).
stop_words = (
    stopwords.words('english')
    + list(string.punctuation)
    + [str(digit) for digit in range(10)]
    + ['anxiety', 'rt']
)

In [29]:
def process_text(text):
    """Tokenize a tweet and return its lowercased, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Tweet text, passed straight to ``word_tokenize``.

    Returns
    -------
    list of str
        Lowercased tokens in their original order, excluding every token
        whose lowercase form is in the notebook-level ``stop_words`` list.
    """
    kept = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

In [30]:
# Tokenize the URL-free tweets and drop stop words.
df['text'] =  df['text'].apply(process_text)

In [31]:
# Re-check row 9973, which showed URL fragments in the first iteration.
df['text'][9973]

['painting',
 'batman',
 'procreate',
 'pocket',
 'art',
 'color',
 'paint',
 'iphone',
 'finger',
 'mark',
 'stroke',
 'brush',
 'manhattan',
 'new',
 'york']

In [32]:
# Re-check row 15, which showed a number and a stray apostrophe token earlier.
df['text'][15]

['season',
 '13',
 'reasons',
 'stressing',
 'tf',
 'already',
 'clay',
 'really',
 'think',
 'batman',
 '’']

### Results

* While the URLs have been dealt with, tokens such as "n't", "'s", "was.twitter", "7pm", "13", "’", etc. still remain.


### Additional removal

In [33]:
# Take rows 15-19 as a small sample for trying out further filtering.
test = list(df['text'][15:20])

In [42]:
# Third sample tweet: contains the alphanumeric token '7pm'.
test[2]

['feel', 'like', '7pm', 'cheer', 'slowly', 'morphed', '7pm', 'scream']

In [44]:
# Try the filter on one tweet: keep only purely alphabetic tokens.
[token for token in test[2] if token.isalpha()]

['feel', 'like', 'cheer', 'slowly', 'morphed', 'scream']

In [45]:
def remove_nums(text_object):
    """Keep only the purely alphabetic tokens of a token list.

    Despite the name, this drops every token for which ``str.isalpha()``
    is false: numbers, alphanumeric mixes such as '7pm', and tokens that
    contain punctuation such as "n't".

    Parameters
    ----------
    text_object : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The alphabetic tokens, in their original order.
    """
    return [token for token in text_object if token.isalpha()]

In [46]:
# Drop every non-alphabetic token from each tweet's token list.
df['text'] = df['text'].apply(remove_nums)

In [47]:
# Preview the cleaned token lists.
df['text'].head()

0    [okay, totally, understand, specific, struggles, queer, japanese, japanese, people, open, learning, feel, personally, mean, exist, applies, racism...
1                                      [things, know, returning, hair, salon, excitement, new, normal, salons, barber, shops, long, island, enters, phase]
2    [well, information, would, useful, months, ago, avoided, wasted, every, single, time, person, came, even, slightly, close, entering, general, vici...
3                                                                                                                                       [greatest, lately]
4    [mostly, afraid, really, bad, misunderstandings, past, kept, lot, recently, reached, point, desperation, something, surprisingly, attentive, amp, ...
Name: text, dtype: object

### Lemmatize

In [48]:
# Single lemmatizer instance, created once and used by lemmatize_text below.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(df_text):
    """Lemmatize every token of one tweet.

    Each token is passed to the notebook-level ``lemmatizer`` without a
    part-of-speech argument, so the lemmatizer's default applies.

    Parameters
    ----------
    df_text : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    return [lemmatizer.lemmatize(token) for token in df_text]

In [50]:
# Replace each tweet's tokens with their lemmas.
df['text'] = df['text'].apply(lemmatize_text)

In [52]:
# Same sample rows (15-19) as before, now after lemmatization.
testers = list(df['text'][15:20])

In [53]:
# Copy of row 9973's tokens (the tweet whose link was split in iteration 1).
other_tester = list(df['text'][9973])

In [54]:
# Show row 9973's tokens after all cleaning steps.
print(other_tester)

['painting', 'batman', 'procreate', 'pocket', 'art', 'color', 'paint', 'iphone', 'finger', 'mark', 'stroke', 'brush', 'manhattan', 'new', 'york']


In [55]:
# Show the five sample tweets after all cleaning steps.
print(testers)

[['season', 'reason', 'stressing', 'tf', 'already', 'clay', 'really', 'think', 'batman'], ['fear', 'love', 'covid', 'horror', 'life', 'coronavirus', 'mentalhealth', 'art', 'depression', 'faith', 'hope', 'motivation', 'fearless', 'scary', 'dark', 'quote', 'courage', 'creepy', 'corona', 'peace', 'poetry', 'success', 'stress', 'selflove', 'inspiration', 'terror', 'halloween', 'pain', 'formylus'], ['feel', 'like', 'cheer', 'slowly', 'morphed', 'scream'], ['twitter', 'started', 'original', 'issue', 'twitter', 'wrong', 'place', 'get', 'help', 'unless', 'really', 'intent', 'professional', 'help', 'fact', 'thing', 'said', 'often', 'make', 'situation', 'worse'], ['thread', 'beautiful', 'moment', 'protest', 'attended', 'nyc', 'past', 'week', 'half', 'post', 'intended', 'glorify', 'protest', 'show', 'wish', 'spread', 'bit', 'positivity', 'time', 'many', 'feeling', 'intense', 'anger', 'heartbreak']]


### Examine Vocab and Frequency Distributions

In [56]:
# Corpus-level statistics over the cleaned token lists:
#   tweet_lengths - number of remaining tokens in each tweet
#   all_words     - every token of every tweet, flattened into one list
#   vocab         - the distinct tokens, sorted
tweet_lengths = list(map(len, df['text']))
all_words = [word for token_list in df['text'] for word in token_list]
vocab = sorted(set(all_words))

print('{} words total, with a vocabulary size of {}'.format(len(all_words), len(vocab)))
print('Max tweet length is {}'.format(max(tweet_lengths)))

115932 words total, with a vocabulary size of 14505
Max tweet length is 35


In [57]:
# Flatten all token lists into a single list of words.
# NOTE(review): this is the same flattening as `all_words` in the vocabulary cell.
flat_words = [item for sublist in df['text'] for item in sublist]

In [58]:
# Count how often each word occurs across the whole corpus.
word_freq = FreqDist(flat_words)

In [59]:
# The 30 most frequent words with their counts.
word_freq.most_common(30)

[('like', 1066),
 ('time', 888),
 ('get', 864),
 ('people', 785),
 ('day', 684),
 ('depression', 633),
 ('feel', 625),
 ('much', 619),
 ('give', 600),
 ('amp', 590),
 ('one', 531),
 ('help', 527),
 ('know', 515),
 ('attack', 507),
 ('really', 479),
 ('stress', 479),
 ('go', 466),
 ('going', 465),
 ('u', 456),
 ('today', 454),
 ('thing', 452),
 ('need', 452),
 ('social', 428),
 ('make', 419),
 ('right', 390),
 ('work', 370),
 ('think', 357),
 ('good', 355),
 ('way', 355),
 ('fear', 353)]

In [60]:
# The preprocessing steps combined into one function.

# Lemmatizer instance that preprocess() looks up at call time.
lemmatizer = WordNetLemmatizer()

def preprocess(df_text, min_length=4):
    """Clean one raw tweet into a list of lemmatized, alphabetic tokens.

    Steps, in order:

    1. delete links (any run of non-whitespace beginning with 'http');
    2. tokenize with ``word_tokenize``;
    3. lowercase, then drop stop words and tokens shorter than
       ``min_length``;
    4. lemmatize each remaining token with the notebook-level
       ``lemmatizer``;
    5. keep only purely alphabetic results.

    Parameters
    ----------
    df_text : str
        Raw text of a single tweet.
    min_length : int, optional
        Minimum number of characters a token needs to be kept. The
        default of 4 matches the original hard-coded ``len(token) > 3``.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    # Links must be removed before tokenizing: the tokenizer splits them and
    # leaves an alphabetic 'https' fragment that passes every later filter.
    without_urls = re.sub(r'http\S+', '', df_text)

    # Set membership avoids rescanning the stop-word list for every token.
    excluded = set(stop_words)

    kept = []
    for token in word_tokenize(without_urls):
        lowered = token.lower()
        if len(token) >= min_length and lowered not in excluded:
            kept.append(lowered)

    lemmas = (lemmatizer.lemmatize(word) for word in kept)
    return [lemma for lemma in lemmas if lemma.isalpha()]
    

## Conclusion

* The initial preprocessing steps are complete.
* I will likely add more stop words and iterate on the cleaning steps once modeling begins, but this is a good start.
