In [None]:
from ast import literal_eval
from pathlib import Path

import pandas as pd

from scripts.preprocessing import PreProcessor
from scripts.visualizer import Visualizer

# Input CSV that the pre-processing cell reads.
DATA_FILE = 'data/congress-tweets.csv'
# CSV written by the pre-processing cell and re-read by the loading cell.
PREPROCESSED_FILE = 'data/processed-tweets.csv'

### Pre-process Tweets (Once)

In [None]:
# Only pre-process when the cached CSV is missing, so that
# "Restart Kernel -> Run All" never repeats the work or overwrites the cache.
# Delete data/processed-tweets.csv to force a fresh pre-processing run.
if not Path(PREPROCESSED_FILE).exists():
    tweets = pd.read_csv(DATA_FILE, encoding='utf-8')
    pp = PreProcessor()
    processed_tweets = pp.pre_process_df(tweets)
    # Saving locally so we only have to pre-process once
    processed_tweets.to_csv(PREPROCESSED_FILE, encoding='utf-8', index=False)

### Load Processed Tweets

In [None]:
# Size of the working subset, and the seed that makes the subset identical on
# every run so the plots below are reproducible.
SAMPLE_SIZE = 10000
RANDOM_SEED = 42

# The two token columns are parsed back from their Python-literal string form
# with literal_eval (presumably lists of tokens -- confirm against PreProcessor).
processed_tweets = pd.read_csv(
    PREPROCESSED_FILE,
    encoding='utf-8',
    converters={'tidy_tweet_tokens': literal_eval, 'tokens_no_stop': literal_eval},
)
# Working with subset of data for speed. n is capped at the row count because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement).
processed_tweets = processed_tweets.sample(
    n=min(SAMPLE_SIZE, len(processed_tweets)),
    random_state=RANDOM_SEED,
)

### Visualize Data

In [None]:
# Single Visualizer instance; the word-frequency cells further down reuse it as `v`.
v = Visualizer()
# Presumably plots an overview of the words in the loaded tweets -- see
# scripts/visualizer.py for the exact figure produced.
v.visualize_words(processed_tweets)

### Analyzing Sentiment

In [None]:
# Read the negative-word list, then collapse the frame along its column axis
# with squeeze('columns') before handing it to the visualizer.
neg_words_df = pd.read_csv('data/negativewords.csv')
neg_words = neg_words_df.squeeze('columns')
v.visualize_word_freq(
    neg_words,
    processed_tweets,
    word_type='Negative',
)

In [None]:
# Read the positive-word list, then collapse the frame along its column axis
# with squeeze('columns') before handing it to the visualizer.
pos_words_df = pd.read_csv('data/positivewords.csv')
pos_words = pos_words_df.squeeze('columns')
v.visualize_word_freq(
    pos_words,
    processed_tweets,
    word_type='Positive',
)

### Analyzing Bad Words

In [None]:
# Read the bad-word list, then collapse the frame along its column axis
# with squeeze('columns') before handing it to the visualizer.
bad_words_df = pd.read_csv('data/bad-words.csv')
bad_words = bad_words_df.squeeze('columns')
v.visualize_word_freq(
    bad_words,
    processed_tweets,
    word_type='Bad',
)

### Analyzing Inclusive and Exclusive Language

In [None]:
# First-person-plural pronouns used as the "inclusive" vocabulary.
inclusive_words = pd.Series([
    'we',
    'our',
    'us',
])
v.visualize_word_freq(
    inclusive_words,
    processed_tweets,
    word_type='Inclusive',
)

In [None]:
# Words used as the "exclusive" vocabulary: first-person-singular pronouns
# plus third-person-plural forms (and 'those').
exclusive_words = pd.Series([
    'my',
    'i',
    'mine',
    'me',
    'them',
    'their',
    'they',
    'those',
])
v.visualize_word_freq(
    exclusive_words,
    processed_tweets,
    word_type='Exclusive',
)