# Detecting Twitter moments that have a significant impact on public sentiment

## Collecting the data

We use [Twitterscraper](https://github.com/taspinar/twitterscraper) to collect tweets that mention our celebrity figures of note.

In [None]:
from datetime import datetime, date, timedelta

import json

import sys

from twitterscraper import query_tweets

def datetime_handler(x):
    """JSON ``default`` hook that serializes date/datetime values as ISO 8601 strings.

    Intended to be passed as ``default=`` to ``json.dump`` so that values the
    encoder cannot handle natively are converted here.

    Args:
        x: The object the JSON encoder could not serialize.

    Returns:
        ``x.isoformat()`` when ``x`` is a ``date`` or ``datetime``.

    Raises:
        TypeError: If ``x`` is any other type.
    """
    # The module imports `datetime` and `date` directly (there is no `dt` alias).
    # `datetime` subclasses `date`, so this single check covers both.
    if isinstance(x, date):
        return x.isoformat()
    raise TypeError('Unknown type')

def collect_tweets(name, articleDate, outfile_str=None):
    """Scrape tweets mentioning ``name`` around an article date and save them as JSON.

    Two queries are issued: the lower-cased name as given ("firstname lastname")
    and the same name with spaces removed ("firstnamelastname"). The search
    window spans 90 days before to 90 days after ``articleDate``.

    Args:
        name: Name to search for, e.g. ``"FirstName LastName"``.
        articleDate: Article date as a string in ``mm/dd/yy`` format.
        outfile_str: Path of the JSON file to write. Defaults to the lower-cased
            name with spaces replaced by underscores plus ``.json``
            (e.g. ``james_franco.json``).

    Raises:
        ValueError: If ``articleDate`` does not match ``mm/dd/yy``.
    """
    name = name.lower()

    articleDate = datetime.strptime(articleDate, '%m/%d/%y')
    window = timedelta(days=90)
    beginDate = (articleDate - window).date()
    endDate = (articleDate + window).date()

    if outfile_str is None:
        outfile_str = name.replace(' ', '_') + '.json'

    # Search both "firstname lastname" and "firstnamelastname"; skip the second
    # query when the name has no space, since it would repeat the first one.
    no_space_name = name.replace(' ', '')
    search_terms = [name]
    if no_space_name != name:
        search_terms.append(no_space_name)

    tweets_serialized = []
    for term in search_terms:
        tweets = query_tweets(term, limit=None, begindate=beginDate, enddate=endDate, poolsize=40, lang='en')
        tweets_serialized.extend(tweet.__dict__ for tweet in tweets)

    # datetime_handler converts date/datetime attribute values to ISO strings.
    with open(outfile_str, 'w') as outfile:
        json.dump(tweets_serialized, outfile, default=datetime_handler)
    print('tweets saved!')

In [None]:
# Prompt for the person of interest and the article date, then scrape.
name = input("Name (FirstName LastName): ")
articleDate = input("Article date (mm/dd/yy): ")

# Echo the request back before starting the scrape.
print('Collecting tweets for ' + name, 'Article release ~ ' + articleDate + '\n', sep='\n')

collect_tweets(name, articleDate)

## Pre-processing the data for sentiment analysis

In [None]:
import pandas as pd

from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank

import sys

from multiprocessing import cpu_count, Pool

from tqdm import tqdm

tokenizer = treebank.TreebankWordTokenizer()

# Lookup sets for the opinion lexicon, filled lazily on first use in each process.
_LEXICON_SETS = {}


def _lexicon_sets():
    """Return ``(positive_words, negative_words)`` as frozensets, building them once."""
    if not _LEXICON_SETS:
        _LEXICON_SETS['positive'] = frozenset(opinion_lexicon.positive())
        _LEXICON_SETS['negative'] = frozenset(opinion_lexicon.negative())
    return _LEXICON_SETS['positive'], _LEXICON_SETS['negative']


def get_lexicon_polarity(row):
    """Classify a tweet by counting opinion-lexicon words in its text.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'text'`` entry.

    Returns:
        ``'positive'`` if more positive than negative lexicon words were found,
        ``'negative'`` if fewer, and ``'neutral'`` when the counts are equal
        (including when no lexicon word occurs at all).
    """
    # Possible improvements: Make entity-based; consider lexicon in context
    # The lexicon lists were previously re-evaluated for every token; cached
    # sets give the same membership answers without the repeated work.
    positive_words, negative_words = _lexicon_sets()

    pos_words = 0
    neg_words = 0

    for token in tokenizer.tokenize(row['text']):
        word = token.lower()
        # Positive is checked first, so a word present in both lists counts as positive.
        if word in positive_words:
            pos_words += 1
        elif word in negative_words:
            neg_words += 1

    if pos_words > neg_words:
        return 'positive'
    if pos_words < neg_words:
        return 'negative'
    return 'neutral'

def process_data(func, df, num_processes=None):
    """Apply ``func`` to every row of ``df`` in parallel and store the results.

    The results are written, in row order, to a new ``'lex_polarity'`` column
    of ``df`` (the frame is modified in place).

    Args:
        func: Callable taking one row and returning its label. It is sent to
            worker processes, so it must be picklable.
        df: DataFrame whose rows are passed to ``func``.
        num_processes: Number of worker processes. Defaults to the smaller of
            the row count and the CPU count.

    Returns:
        None.
    """
    row_count = df.shape[0]

    # A pool cannot be created with zero workers, so handle the empty frame
    # up front: there is nothing to compute, only the column to add.
    if row_count == 0:
        df['lex_polarity'] = []
        return

    if num_processes is None:
        num_processes = min(row_count, cpu_count())

    seq = [row for _, row in df.iterrows()]

    with Pool(processes=num_processes) as pool:
        # imap yields results in input order, so they line up with df's rows.
        results_list = list(tqdm(pool.imap(func, seq), total=row_count))

    df['lex_polarity'] = results_list

In [None]:
# The prompt expects the file name without its ".json" extension.
name = input('Name of .json file: ')
infile, outfile = name + '.json', name + '_lex_pol.json'

print('reading from ' + infile)
corpus = pd.read_json(infile)

# Label every tweet in parallel; the labels are added to `corpus` in place.
print('identifying sentiments...')
process_data(get_lexicon_polarity, corpus, num_processes=cpu_count())

print('saving to ' + outfile)
corpus.to_json(outfile)

## Visualizing sentiment over time

In [None]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date

# Apply seaborn's default settings to the figures created below.
sns.set()
# Use the "notebook" context with fonts scaled by 1.5 and a line width of 2.5.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

In [None]:
# Load the labelled tweets and order them chronologically.
tweets = pd.read_json('james_franco_lex_pol.json')
tweets = tweets.sort_values(by='timestamp')

# Inclusive date window to visualize.
startDate = date(2017, 12, 15)
endDate = date(2018, 2, 15)

# Reduce timestamps to calendar dates so tweets can be counted per day.
tweets['timestamp'] = pd.to_datetime(tweets['timestamp']).dt.date

in_window = (tweets['timestamp'] >= startDate) & (tweets['timestamp'] <= endDate)
# Copy so tweets_stub is an independent frame rather than a view of `tweets`.
tweets_stub = tweets[in_window].copy()

# The labelling step in this notebook writes the column 'lex_polarity'; the
# older name 'lexicon_polarity' is still accepted so files that use it keep working.
polarity_column = 'lex_polarity' if 'lex_polarity' in tweets_stub.columns else 'lexicon_polarity'

# One row per (date, polarity) pair with the number of tweets in that group.
df = (
    tweets_stub
    .groupby(['timestamp', polarity_column])
    .size()
    .reset_index(name='size')
    .rename(columns={polarity_column: 'polarity'})
)

In [None]:
# Tweets per day, drawn as one line per polarity class on a wide canvas.
dims = (20, 7)
fig, ax = plt.subplots(figsize=dims)

ax = sns.lineplot(
    data=df,
    x='timestamp',
    y='size',
    hue='polarity',
    palette='pastel',
    ax=ax,
)
ax.set_xlabel("Date")
ax.set_ylabel("Number of Tweets")

In [None]:
from wordcloud import WordCloud, STOPWORDS
import string

In [None]:
# The regex used to detect words is a combination of normal words, ascii art, and emojis.
# 2+ consecutive word characters (apostrophes allowed after the first), e.g. It's
normal_word = r"(?:\w[\w']+)"
# 2+ consecutive punctuation characters, e.g. :)
ascii_art = r"(?:[{punctuation}][{punctuation}]+)".format(punctuation=string.punctuation)
# a single non-whitespace character that is neither a word character nor ASCII printable
emoji = r"(?:[^\s])(?<![\w{ascii_printable}])".format(ascii_printable=string.printable)
regexp = r"{normal_word}|{ascii_art}|{emoji}".format(normal_word=normal_word, ascii_art=ascii_art,
                                                     emoji=emoji)


# Skip missing texts so the join cannot fail on non-string values.
words = ' '.join(tweets['text'].dropna().astype(str))

# NOTE(review): the original passed an undefined `no_urls_no_tags` to generate(),
# which raised NameError. The cleaning step is reconstructed from the variable's
# name: whitespace-separated tokens that look like links, @mentions or #hashtags
# are dropped. Confirm this matches the intended filtering.
link_or_tag_prefixes = ('http://', 'https://', 'www.', 'pic.twitter.com/', '@', '#')
no_urls_no_tags = ' '.join(
    token for token in words.split()
    if not token.lower().startswith(link_or_tag_prefixes)
)

wordcloud = WordCloud(max_words=500, scale=3, background_color='white', regexp=regexp).generate(no_urls_no_tags)

In [None]:
# Draw the word cloud on a 20x10 figure with the axes hidden.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# Also write the cloud to a PNG file (path is relative to the working directory).
wordcloud.to_file('james_franco_cloud.png')

For an example of computing word frequencies, see this notebook:
https://github.com/Mantej-Singh/Word-Frequency---Python/blob/master/Word%20Frequency.ipynb