# Twitter Mining & Modeling

## Get Data

In [38]:
import csv

In [39]:
import tweepy as tw
import pandas as pd

In [121]:
auth = tw.OAuthHandler(API_key, API_secret)
auth.set_access_token(access_token, access_secret)
api = tw.API(auth, wait_on_rate_limit=True)

#### Query the API

In [189]:
search_words = "covax vaccine vaccination vax hoax #idonotconsent -filter:retweets"
date_since = "2020-12-01"

In [190]:
for i in range(100):
    print(f"Starting search: #{i}")
    tweets = tw.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(1000)
    
    with open("../data/twitter_search.csv", 'a', newline='') as csvFile:
        csvWriter = csv.writer(csvFile)
        for tweet in tweets:
            tweets_encoded = tweet.text.encode('utf-8')
            tweets_decoded = tweets_encoded.decode('utf-8')
            if tweet.id not in ids:
                csvWriter.writerow([tweet.id, tweets_decoded, tweet.created_at, tweet.geo, tweet.place.name if tweet.place else None, tweet.coordinates, tweet._json["user"]["location"]])

    twitter_df = pd.read_csv("../data/twitter_search.csv", 
                         names=["id", "tweet", "date", "drop0", "drop1", "drop2", "location"])
    
    print(f"New data count: {len(twitter_df)}")
    ids = twitter_df.id.to_list()  

Starting search: #0
New data count: 8237
Starting search: #1
New data count: 8237
Starting search: #2
New data count: 8237
Starting search: #3


KeyboardInterrupt: 

#### Apparently the free API limits you to just a week's worth of data :( 

In [191]:
twitter_df.head()

Unnamed: 0,id,tweet,date,drop0,drop1,drop2,location
0,1407417288106680322,@MLAStefanson @mingoertzen @BrianPallister tha...,2021-06-22 19:16:44,,,,YWG
1,1407417265994223622,@akelvinlab But Pfizer Fact Sheet for Health C...,2021-06-22 19:16:39,,,,
2,1407417186948243458,Vaccination centers for 18-44 age group: \n1. ...,2021-06-22 19:16:20,,,,"Hyderabad, India"
3,1407417182577889281,@peterktodd „WHO's Strategic Advisory Group of...,2021-06-22 19:16:19,,,,Cologne
4,1407417156032155651,@Truthseeker1985 This turd also ignores Expert...,2021-06-22 19:16:13,,,,


In [194]:
df = twitter_df.drop_duplicates()

In [195]:
len(df)

5436

In [197]:
df.drop(["drop0", "drop1", "drop2"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [198]:
df.drop(["date", "location", "id"], axis=1, inplace=True)

In [199]:
df.head()

Unnamed: 0,tweet
0,@MLAStefanson @mingoertzen @BrianPallister tha...
1,@akelvinlab But Pfizer Fact Sheet for Health C...
2,Vaccination centers for 18-44 age group: \n1. ...
3,@peterktodd „WHO's Strategic Advisory Group of...
4,@Truthseeker1985 This turd also ignores Expert...


In [200]:
df.to_csv("../data/cleaned_twitter.csv")

## Modeling

In [158]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/enceladosaurus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/enceladosaurus/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/enceladosaurus/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Sentiment140 Dataset for training

Source: https://www.kaggle.com/kazanova/sentiment140

In [210]:
twitter_training = pd.read_csv("../data/twitter_training.csv", names=["positivity", "tweetid", "date", "query", "user", "tweet"])

In [215]:
twitter_training.head()

Unnamed: 0,positivity,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [214]:
#twitter_training.drop(["tweetid", "date", "query", "user"], axis=1, inplace=True)

In [217]:
twitter_training.positivity = twitter_training.positivity.replace(4, 1)

In [226]:
negative = twitter_training[twitter_training.positivity == 0]

In [227]:
positive = twitter_training[twitter_training.positivity == 1]

### Clean Training Data

In [144]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

In [228]:
negative_tweets = negative.tweet.to_list()
positive_tweets = positive.tweet.to_list()

In [312]:
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

In [220]:
lemmatizer = WordNetLemmatizer()

In [146]:
def lemmatize(token_list: list) -> list:
    lemmatized_tokens = []
    for token, tag in pos_tag(token_list):
        if tag.startswith('VB'):
            pos = 'v'
        elif tag.startswith('NN'):
            pos = 'n'
        else:
            pos = 'a'
        lemmatized_tokens.append(lemmatizer.lemmatize(token, pos))
    
    return lemmatized_tokens   

In [291]:
def clean_data(token_list: list, stop_words: list) -> list:
    trim_characters = ['!', '.', ',', '*', '&', '%', '$', '?', '""', ":", ";", "(", ")", "-", "/"]
    remove_characters = ['@', '#', '//']
    cleaned_tokens = []
    
    for token in token_list:
        for character in trim_characters:
            if character in token:
                token = token.strip(character)
        for character in remove_characters:
            if character in token:
                token = ''
                
        if len(token) > 1 and token.lower() not in stop_words:
            cleaned_tokens.append(token)
    
    return cleaned_tokens

In [150]:
stop_words = stopwords.words('english')

In [229]:
negative_data  = [tweet_tokenizer.tokenize(x) for x in negative_tweets]

In [230]:
positive_data  = [tweet_tokenizer.tokenize(x) for x in positive_tweets]

In [313]:
def text_processing(tweet: str) -> list:
    token_list = tweet_tokenizer.tokenize(tweet)
    stop_words = stopwords.words('english')
    cleaned_tokens = clean_data(token_list, stop_words)
    lemmatized_tokens = lemmatize(cleaned_tokens)
    
    return lemmatized_tokens

In [180]:
def prepare_data(all_tokens: list, stop_words: list):
    cleaned_data = []
    for token_list in all_tokens:
        cleaned_tokens = clean_data(token_list, stop_words)
        lemmatized_tokens = lemmatize(cleaned_tokens)
        cleaned_data.append(lemmatized_tokens)
    
    return cleaned_data

In [292]:
cleaned_negative = prepare_data(negative_data, stop_words)

In [293]:
cleaned_positive = prepare_data(positive_data, stop_words)

In [183]:
def convert_to_dict(all_tokens: list) -> dict:
    for token_list in all_tokens:
        yield dict([token, True] for token in token_list)

In [236]:
def prepare_data_for_model(positive_tokens: list, negative_tokens: list) -> list:
    positive_tweets = [(positive_dict, "Positive") for positive_dict in convert_to_dict(positive_tokens)]
    negative_tweets = [(negative_dict, "Negative") for negative_dict in convert_to_dict(negative_tokens)]
    
    return positive_tweets + negative_tweets

In [295]:
all_tweets = prepare_data_for_model(cleaned_positive, cleaned_negative)

## Sklearn Model

In [302]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [316]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_processing)),  
    ('tfidf', TfidfTransformer()), 
    ('classifier', MultinomialNB()), 
])

In [309]:
twitter_training.head()

Unnamed: 0,positivity,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [317]:
msg_train, msg_test, label_train, label_test = train_test_split(twitter_training['tweet'], twitter_training['positivity'], test_size=0.2)
pipeline.fit(msg_train,label_train)
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.76      0.75      0.76    162927
           1       0.75      0.76      0.75    157073

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000

[[122701  40226]
 [ 37709 119364]]
0.756453125


## NLTK Model

In [239]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [240]:
import random

In [297]:
random.shuffle(all_tweets)

In [298]:
train = all_tweets[:round(len(all_tweets) * 0.2)]
test = all_tweets[round(len(all_tweets) * 0.2):]

In [299]:
nbc = NaiveBayesClassifier.train(train)

In [300]:
classify.accuracy(nbc, test)

0.75542109375

In [301]:
nbc.show_most_informative_features(10)

Most Informative Features
                 saddest = True           Negati : Positi =     37.5 : 1.0
                  farrah = True           Negati : Positi =     29.5 : 1.0
                  poorly = True           Negati : Positi =     26.1 : 1.0
                  arghhh = True           Negati : Positi =     24.9 : 1.0
                 honored = True           Positi : Negati =     24.4 : 1.0
             condolences = True           Negati : Positi =     24.3 : 1.0
                saddened = True           Negati : Positi =     23.6 : 1.0
                  boohoo = True           Negati : Positi =     21.6 : 1.0
                   arghh = True           Negati : Positi =     20.9 : 1.0
                  gutted = True           Negati : Positi =     20.9 : 1.0


In [318]:
tweets = df.tweet.to_list()

In [319]:
vax_data = [tweet_tokenizer.tokenize(x) for x in tweets]

In [320]:
vax_tweets = prepare_data(vax_data, stop_words)

In [321]:
nbc.classify(dict([token, True] for token in vax_tweets[0]))

'Positive'

In [322]:
negatives = []
positives = []
for tweet in vax_tweets:
    if nbc.classify(dict([token, True] for token in tweet)) == 'Negative':
        negatives.append(tweet)
    else:
        positives.append(tweet)

In [323]:
negative_tokens = []
for tweet in negatives:
    for token in tweet:
        negative_tokens.append(token)

In [342]:
positive_tokens = []
for tweet in positives:
    for token in tweet:
        positive_tokens.append(token)

In [324]:
len(negative_tokens)

38463

In [343]:
len(positive_tokens)

19231

In [345]:
positive_list = list(set(positive_tokens))

In [346]:
positive_counts = [positive_tokens.count(x) for x in positive_list]

In [347]:
positive_dict = {
    "token" : positive_list,
    "count" : positive_counts
}

In [348]:
positive_df = pd.DataFrame.from_dict(positive_dict)

In [350]:
positive_df.sort_values(by="count", ascending=False, inplace=True)

In [351]:
positive_df["prevalence"] = positive_df["count"] / len(positives)

In [352]:
positive_df.head(10)

Unnamed: 0,token,count,prevalence
1833,vaccination,924,0.497577
1573,vaccine,665,0.358104
965,covid,374,0.2014
1284,19,270,0.145396
4215,available,167,0.08993
202,doses,152,0.081852
117,appointments,126,0.067851
596,jun,124,0.066774
2884,sign,122,0.065697
1812,code,120,0.06462


In [325]:
token_list = list(set(negative_tokens))

In [326]:
token_counts = [negative_tokens.count(x) for x in token_list]

In [327]:
len(token_counts) == len(token_list)

True

In [328]:
token_dict = {
    "token" : token_list,
    "count" : token_counts
}

In [329]:
token_df = pd.DataFrame.from_dict(token_dict)

In [330]:
token_df.sort_values(by="count", ascending=False, inplace=True)

In [332]:
1467/len(negatives) # "Available"

0.4098910310142498

In [333]:
830/len(negatives) #appointments

0.23190835428890752

In [286]:
445/len(negatives) # slot

0.10220486908589803

In [287]:
936 / len(negatives) # walgreens

0.21497473587505742

In [338]:
token_df["prevalence"] = token_df["count"] / len(negatives)

In [340]:
token_df.head(10)

Unnamed: 0,token,count,prevalence
541,vaccine,2283,0.637888
5032,vaccination,1467,0.409891
5783,available,1257,0.351215
4474,appointments,830,0.231908
3369,walgreens,817,0.228276
5366,sign,776,0.21682
2083,code,775,0.216541
1701,jun,775,0.216541
2859,zip,773,0.215982
1543,jul,772,0.215703


In [341]:
token_df.to_csv("../data/token_prevalence.csv")