In [124]:
#Requisite Libraries
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download("punkt") #this only needs to be run once per system
import spacy #also run: "python -m spacy download en_core_web_sm" in terminal
from spellchecker import SpellChecker
import matplotlib.pyplot as plt
from datetime import datetime

Question 1: It is an important skill to look at the data and come up with questions that you can
answer. What are some compelling questions that you can ask with the provided dataset (list at
least 2 questions)? (5 points)

1. What is the distribution of sentiments within the tweets?
2. Is there a correlation between sentiment and tweet engagement (retweet or favorite counts)?
3. What are the most frequent topics mentioned in the tweets?
4. Which people or hashtags were mentioned most frequently in the tweets?

Question 2: Inspect & Data Cleaning  
● Inspect: Write code to inspect the data. What do you observe? Along with the code,
write your observation in the markdown cell. (2.5 points)  
● Clean: Write code to clean the data. Use at least 5 methods. For each method, along
with the code, you need to write the rationale behind the cleaning process. For this
question, you can assume that you are solving one of the questions that you wrote for
the first question. (5 points)  
● Tokenize: Write code to tokenize your entire dataset. Use at least 2 different types of
tokenizers. Compare their results and write your observations. (2.5 points)

In [125]:
#Read in data and inspect
tweet_df = pd.read_csv("trump_20200530.csv")
print(tweet_df.head())
#info() writes its report to stdout itself and returns None, so it is called directly;
#wrapping it in print() would add a stray "None" line to the output
tweet_df.info()
print(tweet_df.shape)
print(tweet_df.columns)
print("\nmissing values count:\n{}".format(tweet_df.isna().sum()))
print("Duplicate count: {}".format(tweet_df.duplicated().sum()))
#check whether every row shares the first row's "source" value, then list the distinct values
print("Source contains homogenous values: {}".format((tweet_df["source"] == tweet_df["source"].iloc[0]).all()))
print("The unique values for source are: {}".format(tweet_df["source"].unique()))

               source                                               text  \
0  Twitter for iPhone  RT @ScottAdamsSays: Malaria drug and zinc the ...   
1  Twitter for iPhone  RT @YoungDems4Trump: In Democrat cities you ca...   
2  Twitter for iPhone  RT @YoungDems4Trump: So sad. This poor busines...   
3  Twitter for iPhone   Time for a change! #2020 https://t.co/AECy2GBfys   
4  Twitter for iPhone  RT @TallahForTrump: Trump spoke at my church i...   

            created_at  retweet_count  favorite_count is_retweet  \
0  05-30-2020 03:26:31          10566               0       True   
1  05-30-2020 03:21:41          22320               0       True   
2  05-30-2020 03:21:32          23961               0       True   
3  05-30-2020 03:20:18          42879          164022      False   
4  05-30-2020 03:19:01          36563               0       True   

                id_str  
0  1266571665204527109  
1  1266570449431003138  
2  1266570411678019584  
3  1266570099454103553  
4  126656

In [127]:
#Clean data
#Assuming we're trying to find the distribution of sentiments within the tweets

#There are 58 rows with missing values out of 18446 total rows. We can drop these rows with negligible impact.
tweet_df.dropna(inplace=True)
print(tweet_df.isna().sum())

#homogenize text by converting text to lower case, this should increase model accuracy
#.str.lower() returns a new Series instead of changing the column, so it must be assigned back
tweet_df["text"] = tweet_df["text"].str.lower()

#remove stopwords, this reduces noise and dimensionality yielding better performance
#cache for the stop-word set so the NLTK corpus is read once, not once per word
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it on first use only."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


def cleanStopWords(text, stop_words=None):
    """Remove stop words from a whitespace-separated string.

    Parameters:
        text: the string to clean; it is split on whitespace.
        stop_words: optional collection of lower-case words to remove.
            When omitted, NLTK's English stop-word list is used.

    Returns:
        The remaining words joined by single spaces, in their original order
        and with their original casing.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    #compare in lower case so capitalised stop words ("The", "IS") are removed as well
    filtered_words = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(filtered_words)
    
tweet_df["text"] = tweet_df["text"].apply(cleanStopWords)

#extract date from time produced field to make it usable for plotting
#created_at is read from the CSV as plain text (e.g. "05-30-2020 03:26:31"), so it has to be
#parsed into datetimes before the .dt accessor can be used
tweet_df["date"] = pd.to_datetime(tweet_df["created_at"], format="%m-%d-%Y %H:%M:%S").dt.date

#drop non-relevant columns, we don't need them - less memory consumed - better performance
#drop() looks for row labels unless columns=... is given, and it returns a new frame,
#so the result is assigned back
tweet_df = tweet_df.drop(columns=["created_at", "source"])


source            0
text              0
created_at        0
retweet_count     0
favorite_count    0
is_retweet        0
id_str            0
dtype: int64


In [None]:
#Tokenize text
#loaded spaCy pipelines keyed by model name; loading a model is expensive, so do it once
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for model_name, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def spacy_tokenize(text, nlp=None):
    """Tokenize one string with spaCy and return the token texts as a list.

    Parameters:
        text: the string to tokenize.
        nlp: optional callable that turns a string into an iterable of objects
            with a .text attribute (for example an already loaded spaCy
            pipeline). When omitted, "en_core_web_sm" is loaded once and reused.

    Returns:
        A list of token strings in document order.
    """
    if nlp is None:
        #reuse the cached pipeline instead of calling spacy.load() for every tweet
        nlp = _get_spacy_pipeline()
    return [token.text for token in nlp(text)]
    
#store the spaCy token list of every tweet in its own column
tweet_df["spacy_tokens"] = tweet_df["text"].map(spacy_tokenize)

def nltk_tokenize(text):
    """Tokenize one string with NLTK's word_tokenize and return the tokens as a list.

    Parameters:
        text: the string to tokenize.

    Returns:
        The list of tokens produced by word_tokenize.
    """
    #the imports cell brings in word_tokenize by name only; the name `nltk` itself is
    #not imported there, so `nltk.word_tokenize` would raise NameError
    return word_tokenize(text)
    
#store the NLTK token list of every tweet in its own column
tweet_df["nltk_tokens"] = tweet_df["text"].map(nltk_tokenize)

KeyboardInterrupt: 

In [None]:
#Compare Results

Question 3: Learn how to use new Python packages or online APIs (10 points)
Pick an existing package, library, or API to determine the sentiment (positive, negative,
neutral) for each of the tweets in the dataset. You can also use open-source code provided in
GitHub repos. DO NOT WRITE THE SENTIMENT ANALYSIS CODE FROM SCRATCH.
For example: Use VADER sentiment analysis from NLTK

In [None]:
#Instantiate NLTK's VADER sentiment analyzer once, at cell level.
#NOTE(review): VADER presumably needs the NLTK "vader_lexicon" resource downloaded once
#per system (like "punkt" in the imports cell) - confirm it is installed.
sentiment_analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(tokens, analyzer=None):
    """Return the compound sentiment score for one tokenized tweet.

    Parameters:
        tokens: iterable of token strings; they are joined with single spaces
            before scoring.
        analyzer: optional object exposing polarity_scores(text) -> dict with a
            "compound" entry. When omitted, the notebook-level
            `sentiment_analyzer` is used.

    Returns:
        The value stored under "compound" in the analyzer's score dict.
    """
    if analyzer is None:
        #the shared analyzer is named `sentiment_analyzer`; the name `analyzer`
        #was never defined at notebook level and raised NameError
        analyzer = sentiment_analyzer
    text = " ".join(tokens)
    sentiment = analyzer.polarity_scores(text)
    return sentiment["compound"]

#score every tweet from its token list; the tokenize cell stores those lists in
#tweet_df["spacy_tokens"] (no frame named `df` and no "tokens" column is ever created)
tweet_df["sentiment_score"] = tweet_df["spacy_tokens"].apply(analyze_sentiment)

def classify_sentiment_score(score, threshold=0.05):
    """Map a numeric sentiment score to "positive", "negative" or "neutral".

    Parameters:
        score: the numeric sentiment score to classify.
        threshold: distance from zero at which a score stops being neutral
            (default 0.05, the cut-off this notebook used originally).

    Returns:
        "positive" if score >= threshold, "negative" if score <= -threshold,
        otherwise "neutral" (this includes NaN, which fails both comparisons).
    """
    if score >= threshold:
        return "positive"
    elif score <= -threshold:
        return "negative"
    else:
        return "neutral"

#turn each numeric score into its positive / negative / neutral label
tweet_df["sentiment_label"] = tweet_df["sentiment_score"].map(classify_sentiment_score)

Question 4: Analyze Data over time (10 points)
How does the sentiment of your corpus change over time? Answer this question by showing
plots (at least 2 graphs). Be creative!

In [None]:
#average sentiment score per calendar day
#pd.to_datetime() accepts the "date" column whether it holds date objects or timestamps,
#so the .dt accessor is always available for the grouping key
tweet_day = pd.to_datetime(tweet_df["date"]).dt.date
sentiment_by_day = tweet_df.groupby(tweet_day)["sentiment_score"].mean()

#plot the same series that was just computed (index = day, values = mean score)
plt.plot(sentiment_by_day.index, sentiment_by_day.values, label='Sentiment Over Time')
plt.xlabel('Date')
plt.ylabel('Sentiment')
plt.title('Sentiment Over Time')
plt.legend()
plt.show()

def plot_daily_tweet_count(tweets, label_name):
    """Plot the number of tweets per day and return the daily counts.

    Parameters:
        tweets: DataFrame with a "date" column, already filtered to the tweets
            that should be counted.
        label_name: word inserted into the axis label and title, e.g. "Negative".

    Returns:
        DataFrame with one row per date and the columns "date" and "count".
    """
    daily_count = tweets.groupby('date').size().reset_index(name='count')
    plt.figure(figsize=(12, 6))
    plt.plot(daily_count['date'], daily_count['count'], marker='o', linestyle='-')
    plt.xlabel('Date')
    plt.ylabel('Number of {} Tweets'.format(label_name))
    plt.title('Number of {} Tweets Over Time'.format(label_name))
    plt.grid(True)
    plt.show()
    return daily_count


negative_tweets = tweet_df[tweet_df["sentiment_label"] == "negative"]
negative_tweets_count = plot_daily_tweet_count(negative_tweets, "Negative")

#positive tweets get their own names instead of overwriting the negative_* variables
positive_tweets = tweet_df[tweet_df["sentiment_label"] == "positive"]
positive_tweets_count = plot_daily_tweet_count(positive_tweets, "Positive")