In [1]:
import pandas as pd
import nltk

In [4]:
# Load the tweet export; the relative path is resolved against the notebook's
# working directory.
twitter_df = pd.read_csv('twitter_df.csv')

In [6]:
# Inspect which columns the loaded frame provides.
twitter_df.columns

Index(['Unnamed: 0', 'author_id', 'conversation_id', 'tweet_created_at',
       'edit_history_tweet_ids', 'id', 'in_reply_to_user_id', 'lang',
       'reply_settings', 'source', 'text', 'geo', 'retweeted', 'replied_to',
       'quoted', 'like_count', 'quote_count', 'reply_count', 'retweet_count',
       'Place', 'Organization', 'Person', 'Other', 'Probability',
       'user_mentioned', 'index', 'user_created_at', 'description', 'name',
       'username', 'location', 'followers_count', 'following_count',
       'listed_count', 'tweet_count', 'description.mentions', 'hashtags'],
      dtype='object')

In [32]:
# Serialise the tweet text column to a JSON string.
# NOTE(review): `text_json` is not referenced by any later cell visible in this
# notebook — confirm it is still needed.
text_json = twitter_df['text'].to_json(orient='records')

In [33]:
from nltk.tokenize import TweetTokenizer
# Tokenizer instance shared by every row.
tt = TweetTokenizer()
# Tokenize each tweet's text; the result holds one token list per row.
tweet_tokens = twitter_df['text'].apply(tt.tokenize)

In [18]:
# Fetch the NLTK resources used by the lemmatization cells: the WordNet corpus
# (for WordNetLemmatizer) and the averaged-perceptron part-of-speech tagger model.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/cbeckham/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/cbeckham/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [21]:
# Fetch the 'omw-1.4' NLTK package.
# NOTE(review): presumably required by the WordNet corpus reader in the
# installed NLTK version — confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/cbeckham/nltk_data...


True

In [34]:
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    """Lemmatize one tokenized sentence using a part-of-speech hint per token.

    Args:
        tokens: Sequence of token strings belonging to one sentence/tweet.

    Returns:
        list[str]: One lemma per input token, in the original order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    # `pos_tag` is never imported as a bare name in this notebook, so it is
    # reached through the already-imported `nltk` package; the bare name raised
    # NameError on a fresh kernel.
    for word, tag in nltk.pos_tag(tokens):
        # Map the tagger's tag prefix onto the POS code the lemmatizer expects:
        # nouns -> 'n', verbs -> 'v', everything else is treated as 'a'.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

# Replace each row's token list with its lemmatized counterpart.
tweet_tokens = tweet_tokens.apply(lemmatize_sentence)

In [35]:
import re, string

def remove_noise(tweet_tokens, stop_words = ()):
    """Normalise one tokenized tweet into lower-cased, lemmatized tokens.

    URLs and @-mentions are stripped from every token, the remainder is
    lemmatized with a part-of-speech hint, and a token is dropped when it ends
    up empty, is a substring of ``string.punctuation``, or its lower-cased form
    is a stop word.

    Args:
        tweet_tokens: Sequence of token strings for a single tweet.
        stop_words: Iterable of lower-case words to discard. The default empty
            tuple filters nothing.

    Returns:
        list[str]: The surviving tokens, lower-cased, in their original order.
    """
    # Raw string literals: the pattern contains `\(` and `\)`, which are
    # invalid escape sequences in a regular string (a warning on recent
    # Python versions). The resulting pattern text is unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|' \
                  r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    mention_pattern = r"(@[A-Za-z0-9_]+)"

    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    # Hash-based membership test instead of scanning a list for every token.
    stop_words = frozenset(stop_words)

    cleaned_tokens = []

    # `pos_tag` is never imported as a bare name in this notebook, so it is
    # reached through the already-imported `nltk` package; the bare name raised
    # NameError on a fresh kernel.
    for token, tag in nltk.pos_tag(tweet_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(mention_pattern, '', token)

        # Map the tagger's tag prefix onto the POS code the lemmatizer expects:
        # nouns -> 'n', verbs -> 'v', everything else is treated as 'a'.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test, so single punctuation
        # marks are dropped while tokens such as ':)' or '…' are kept.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [36]:
# Fetch NLTK's stop-word lists, read through `stopwords.words(...)` below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cbeckham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
from nltk.corpus import stopwords
# English stop words to discard during cleaning.
stop_words = stopwords.words('english')

# Clean every row's token list; `stop_words` is forwarded to remove_noise as
# its second positional argument.
tweet_tokens = tweet_tokens.apply(remove_noise, args=(stop_words,))

In [39]:
def get_all_words(cleaned_tokens_list):
    """Lazily flatten an iterable of token lists into one stream of tokens.

    Args:
        cleaned_tokens_list: Iterable whose items are themselves iterables of
            tokens (one inner iterable per tweet).

    Yields:
        Each token of each inner iterable, in order.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

# Generator over every cleaned token across all tweets. It can be iterated only
# once, so re-run this cell before consuming it a second time.
all_pos_words = get_all_words(tweet_tokens)

In [40]:
from nltk import FreqDist

# Count how often each cleaned token occurs; this consumes the `all_pos_words`
# generator.
freq_dist_pos = FreqDist(all_pos_words)
# Most common words across tweets
print(freq_dist_pos.most_common(10))

[('maricopa', 3434), ('dominion', 3406), ('rt', 2704), ('county', 2616), ('voting', 2573), ('…', 2188), ('use', 1835), ('arizona', 1820), ('vote', 1628), ('gop', 1365)]


In [42]:
from nltk.corpus import twitter_samples

In [44]:
# Fetch the sample tweet corpus that is read through `twitter_samples` below.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/cbeckham/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [45]:
# Raw text of the positive / negative sample tweets.
# NOTE(review): neither variable is referenced by the later cells visible in
# this notebook (the tokenized variants are used instead) — confirm they are needed.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [46]:
# Token lists for the positive / negative sample tweets.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Run every sample tweet through remove_noise with the same stop-word list.
positive_cleaned_tokens_list = [
    remove_noise(sample, stop_words) for sample in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(sample, stop_words) for sample in negative_tweet_tokens
]

In [47]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily turn token lists into presence-feature dicts.

    Args:
        cleaned_tokens_list: Iterable of token iterables, one per tweet.

    Yields:
        dict: For each tweet, a mapping of every distinct token to ``True``.
    """
    for tokens in cleaned_tokens_list:
        yield {token: True for token in tokens}

# Lazy generators of {token: True} feature dicts for each class. A generator
# can be iterated only once, so re-run this cell before building the labelled
# datasets again.
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [48]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and the
# accuracy reported below, is reproducible across kernel restarts.
SPLIT_SEED = 42
# Number of shuffled examples used for training; the rest is held out for testing.
TRAIN_SIZE = 7000

# Pair every feature dict with its class label.
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

# The *_tokens_for_model inputs are one-shot generators: re-running this cell
# without re-creating them would silently produce empty datasets.
assert positive_dataset and negative_dataset, (
    "feature generators are exhausted - re-run the cell that creates them"
)

dataset = positive_dataset + negative_dataset

# Dedicated RNG instance: deterministic shuffle without touching the global
# `random` state.
random.Random(SPLIT_SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

In [101]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Fit the Naive Bayes model on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(train_data)

# Share of held-out examples whose predicted label matches the true label.
print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only appended a stray "None" to the output.
classifier.show_most_informative_features(10)

Accuracy is: 0.9953333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2059.6 : 1.0
                      :) = True           Positi : Negati =    984.7 : 1.0
                 welcome = True           Positi : Negati =     30.8 : 1.0
                  arrive = True           Positi : Negati =     30.2 : 1.0
                     sad = True           Negati : Positi =     23.7 : 1.0
                    glad = True           Positi : Negati =     21.6 : 1.0
                     bam = True           Positi : Negati =     20.9 : 1.0
                   didnt = True           Negati : Positi =     15.1 : 1.0
                followed = True           Negati : Positi =     13.9 : 1.0
               community = True           Positi : Negati =     13.6 : 1.0
None


In [74]:
from nltk.tokenize import word_tokenize

In [57]:
# Fetch the 'punkt' tokenizer package.
# NOTE(review): presumably for `word_tokenize`, which is imported but never
# called in the cells visible here — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/cbeckham/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [83]:
# Label every cleaned tweet: build a {token: True} feature dict (the shape
# produced by get_tweets_for_model) and let the classifier pick a label.
classified = tweet_tokens.apply(
    lambda tokens: classifier.classify({token: True for token in tokens})
)

In [84]:
# Preview the predicted label per tweet.
classified

0       Positive
1       Positive
2       Positive
3       Positive
4       Positive
          ...   
3487    Positive
3488    Positive
3489    Positive
3490    Positive
3491    Positive
Name: text, Length: 3492, dtype: object

In [91]:
# Raise pandas' row display limit so objects with up to 5000 rows are shown in full.
# NOTE(review): the option is global for the session, so later cells that end
# in a bare Series/DataFrame will print every row — consider `.head()` instead.
pd.set_option('display.max_rows', 5000)

In [95]:
# Number of tweets per predicted label.
classified.value_counts()

Positive    2086
Negative    1406
Name: text, dtype: int64

In [99]:
# Show the raw text of the row labelled 0 for a manual sanity check.
twitter_df.loc[0, 'text']

"@JavoeGator @JonahLandergan The voting machines related to the 6,000 confirmed incorrect ballots in MI are made by Dominion.\n\nBiden is up by 40K votes in AZ\n\nBiden is up by 60K votes in Maricopa County\n\nBiden's entire AZ lead and more is in Maricopa\n\nMaricopa is the ONLY county in AZ that uses Dominion."

In [106]:
# Series.rename returns a new Series instead of renaming in place, so the
# result must be assigned back; otherwise `classified` keeps the name "text".
classified = classified.rename("sentiment")
classified

0       Positive
1       Positive
2       Positive
3       Positive
4       Positive
5       Positive
6       Positive
7       Positive
8       Positive
9       Positive
10      Positive
11      Positive
12      Positive
13      Positive
14      Positive
15      Negative
16      Positive
17      Positive
18      Positive
19      Positive
20      Positive
21      Positive
22      Positive
23      Positive
24      Positive
25      Positive
26      Positive
27      Positive
28      Positive
29      Positive
30      Positive
31      Positive
32      Positive
33      Positive
34      Positive
35      Positive
36      Positive
37      Positive
38      Positive
39      Positive
40      Positive
41      Positive
42      Positive
43      Positive
44      Positive
45      Positive
46      Positive
47      Positive
48      Positive
49      Positive
50      Positive
51      Positive
52      Positive
53      Positive
54      Positive
55      Positive
56      Positive
57      Positive
58      Positi

In [111]:
# Rename at the join site so the new column is always called "sentiment",
# whatever the Series is currently named. A Series still named "text" would
# overlap with the frame's existing `text` column and make `join` raise.
classified_w_twitter_df = twitter_df.join(classified.rename("sentiment"))

In [112]:
# Spot-check the first rows of the joined frame.
classified_w_twitter_df.head()

Unnamed: 0.1,Unnamed: 0,author_id,conversation_id,tweet_created_at,edit_history_tweet_ids,id,in_reply_to_user_id,lang,reply_settings,source,...,name,username,location,followers_count,following_count,listed_count,tweet_count,description.mentions,hashtags,sentiment
0,0,1229060372105134080,1324865785513332738,2020-11-07T00:09:58.000Z,['1324866646834606081'],1324866646834606081,1.236086e+18,en,everyone,Twitter Web App,...,Mr. Gekko,sicilianslice12,,90,477,1,6549,,,Positive
1,1,1229060372105134080,1324865385448042496,2020-11-07T00:09:32.000Z,['1324866538059558913'],1324866538059558913,1.270763e+18,en,everyone,Twitter Web App,...,Mr. Gekko,sicilianslice12,,90,477,1,6549,,,Positive
2,2,1229060372105134080,1324903877360152578,2020-11-07T15:08:06.000Z,['1325092671032004609'],1325092671032004609,64820960.0,en,everyone,Twitter for iPhone,...,Mr. Gekko,sicilianslice12,,90,477,1,6549,,,Positive
3,3,1229060372105134080,1324478995782651905,2020-11-07T15:54:13.000Z,['1325104276859990017'],1325104276859990017,3223426000.0,en,everyone,Twitter Web App,...,Mr. Gekko,sicilianslice12,,90,477,1,6549,,,Positive
4,4,1229060372105134080,1324866302100508676,2020-11-07T00:08:55.000Z,['1324866382966775810'],1324866382966775810,1661961000.0,en,everyone,Twitter Web App,...,Mr. Gekko,sicilianslice12,,90,477,1,6549,,,Positive


In [113]:
# Persist the tweets together with their predicted sentiment.
# NOTE(review): with the default `index=True` the row index is written as an
# extra unnamed column; the loaded input already carries an 'Unnamed: 0'
# column — confirm whether `index=False` is wanted here.
classified_w_twitter_df.to_csv('twitter_df_w_sentiment.csv')