In [128]:
!pip install nrclex



In [129]:
import pandas as pd
import string
import re
import nltk
from nltk import word_tokenize, ngrams
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
from nrclex import NRCLex
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [130]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [131]:
# Load and clean dataset
def load_and_clean_data(file_1, file_2):
    df_1 = pd.read_csv(file_1)
    df_1.dropna(inplace=True)
    df_1.drop_duplicates(inplace=True)

    df_2 = pd.read_csv(file_2)
    df_2.dropna(inplace=True)
    df_2.drop_duplicates(inplace=True)

    merged_df = pd.concat([df_1, df_2], ignore_index=True)
    merged_df['TweetAt'] = pd.to_datetime(merged_df['TweetAt'], format='%d-%m-%Y')

    return merged_df
merged_df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,2020-03-02,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",2020-03-02,When I couldn't find hand sanitizer at Fred Me...,Positive
2,4,44956,Chicagoland,2020-03-02,#Panic buying hits #NewYork City as anxious sh...,Negative
3,5,44957,"Melbourne, Victoria",2020-03-03,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
4,6,44958,Los Angeles,2020-03-03,Do you remember the last time you paid $2.99 a...,Neutral
...,...,...,...,...,...,...
35526,44946,89898,"Brooklyn, NY",2020-04-14,YÂ’all really shitting that much more at home?...,Negative
35527,44948,89900,"Toronto, Ontario",2020-04-14,Still shocked by the number of #Toronto superm...,Negative
35528,44949,89901,OHIO,2020-04-14,I never that weÂ’d be in a situation &amp; wor...,Positive
35529,44951,89903,"Wellington City, New Zealand",2020-04-14,Airline pilots offering to stock supermarket s...,Neutral


In [132]:
# Filter data by the month of March and April
filtered_df = merged_df[merged_df['TweetAt'].dt.month.isin([3, 4])].copy()

In [133]:
# Text cleaning
def clean_text(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    return text

filtered_df.loc[:, 'OriginalTweet'] = filtered_df['OriginalTweet'].apply(clean_text)

In [134]:
# Generate n-grams
def get_ngrams(text, n ):
    n_grams = ngrams(word_tokenize(text), n)
    return [ ' '.join(grams) for grams in n_grams]

filtered_df['bigrams'] = filtered_df['OriginalTweet'].apply(lambda x: get_ngrams(x, 2))

In [135]:
# Compute n-gram frequency
counter = Counter()

for bigrams in filtered_df['bigrams']:
    counter.update(bigrams)

print(counter.most_common(10))

[('grocery store', 4137), ('to the', 3363), ('in the', 3154), ('of the', 2633), ('’ s', 2145), ('covid 19', 2139), ('the grocery', 1923), ('the coronavirus', 1919), ('coronavirus covid19', 1705), ('due to', 1608)]


In [136]:
# Text preprocessing
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = re.sub('['+punctuation+']', '', text)  # Remove punctuations
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    return tokens

In [137]:
# Apply preprocessing and ngram generation to each tweet
filtered_df['preprocessed_bigrams'] = filtered_df['OriginalTweet'].apply(lambda x: list(ngrams(preprocess(x), 2)))

In [138]:
# Frequency analysis
all_bigrams = [bigram for bigrams in filtered_df['preprocessed_bigrams'] for bigram in bigrams]
freq_dist = FreqDist(all_bigrams)

for bigram, frequency in freq_dist.most_common(10):
    print(' '.join(bigram), frequency)

grocery store 4138
covid 19 2139
coronavirus covid19 1715
covid19 coronavirus 1491
hand sanitizer 1237
online shopping 1211
toilet paper 1170
panic buying 940
â “ 896
oil prices 811


#### It appears that the encoding used in the dataset has caused some unusual characters to appear. We can modify our preprocess function to remove these characters and also to handle different spellings or formats of the same term

In [139]:
# Frequency counter for 'covid' related bigrams
covid_related_bigrams_freq = 0

# Frequency counter for other bigrams
other_bigram_freq = nltk.FreqDist()

# Check each bigram
for bigram, freq in bigram_freq.items():
    if 'covid' in bigram[0].lower() or 'covid' in bigram[1].lower():
        covid_related_bigrams_freq += freq
    else:
        other_bigram_freq[bigram] = freq

# Print the frequencies
print(f"covid related bigrams {covid_related_bigrams_freq}")

# Print the 10 most common non-covid related bigrams
for bigram, freq in other_bigram_freq.most_common(10):
    print(f"{bigram[0]} {bigram[1]} {freq}")

covid related bigrams 36002
grocery store 4208
hand sanitizer 1264
online shopping 1229
toilet paper 1198
panic buying 1032
oil prices 816
coronavirus pandemic 659
social distancing 517
stay home 435
stock food 419


## preprocessing it again yay

#### generate bigrams

In [140]:
from nrclex import NRCLex

def get_emotion_from_text(text):
    emotion = NRCLex(text)
    return emotion.top_emotions[0][0]

# Text preprocessing
def preprocess(text):
    text = re.sub('['+punctuation+']', '', text)  # Remove punctuations
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    return tokens

# Apply preprocessing and ngram generation to each tweet
filtered_df['preprocessed_bigrams'] = filtered_df['OriginalTweet'].apply(lambda x: list(ngrams(preprocess(x), 2)))

# Add emotion detection to the dataframe
filtered_df['Emotion'] = filtered_df['OriginalTweet'].apply(get_emotion_from_text)


In [141]:
# Print the first few rows of the dataframe
filtered_df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,bigrams,preprocessed_bigrams,Emotion
0,1,44953,NYC,2020-03-02,trending new yorkers encounter empty supermark...,Extremely Negative,"[trending new, new yorkers, yorkers encounter,...","[(trending, new), (new, yorkers), (yorkers, en...",fear
1,2,44954,"Seattle, WA",2020-03-02,when i couldnt find hand sanitizer at fred mey...,Positive,"[when i, i couldnt, couldnt find, find hand, h...","[(couldnt, find), (find, hand), (hand, sanitiz...",fear
2,4,44956,Chicagoland,2020-03-02,panic buying hits newyork city as anxious shop...,Negative,"[panic buying, buying hits, hits newyork, newy...","[(panic, buying), (buying, hits), (hits, newyo...",positive
3,5,44957,"Melbourne, Victoria",2020-03-03,toiletpaper dunnypaper coronavirus coronavirus...,Neutral,"[toiletpaper dunnypaper, dunnypaper coronaviru...","[(toiletpaper, dunnypaper), (dunnypaper, coron...",positive
4,6,44958,Los Angeles,2020-03-03,do you remember the last time you paid 299 a g...,Neutral,"[do you, you remember, remember the, the last,...","[(remember, last), (last, time), (time, paid),...",anticipation
...,...,...,...,...,...,...,...,...,...
35526,44946,89898,"Brooklyn, NY",2020-04-14,yâ’all really shitting that much more at home ...,Negative,"[yâ ’, ’ all, all really, really shitting, shi...","[(yâ, ’), (’, really), (really, shitting), (sh...",fear
35527,44948,89900,"Toronto, Ontario",2020-04-14,still shocked by the number of toronto superma...,Negative,"[still shocked, shocked by, by the, the number...","[(still, shocked), (shocked, number), (number,...",positive
35528,44949,89901,OHIO,2020-04-14,i never that weâ’d be in a situation amp world...,Positive,"[i never, never that, that weâ, weâ ’, ’ d, d ...","[(never, weâ), (weâ, ’), (’, situation), (situ...",anticipation
35529,44951,89903,"Wellington City, New Zealand",2020-04-14,airline pilots offering to stock supermarket s...,Neutral,"[airline pilots, pilots offering, offering to,...","[(airline, pilots), (pilots, offering), (offer...",trust


In [142]:
from nltk.corpus import stopwords

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+', '', text)  # remove URLs
    text = re.sub(r'\W', ' ', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    text = text.replace("covid19", "covid-19")  # unify term
    text = text.replace("coronavirus covid-19", "covid-19")  # unify term
    text = text.replace("covid-19 coronavirus", "covid-19")  # unify term
    text = text.replace("â", "")  # remove unusual characters
    text = text.replace("iâ", "")  # remove unusual characters
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]  # remove stop words
    return ' '.join(words)

filtered_df['OriginalTweet'] = filtered_df['OriginalTweet'].apply(preprocess_text)
filtered_df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,bigrams,preprocessed_bigrams,Emotion
0,1,44953,NYC,2020-03-02,trending new yorkers encounter empty supermark...,Extremely Negative,"[trending new, new yorkers, yorkers encounter,...","[(trending, new), (new, yorkers), (yorkers, en...",fear
1,2,44954,"Seattle, WA",2020-03-02,couldnt find hand sanitizer fred meyer turned ...,Positive,"[when i, i couldnt, couldnt find, find hand, h...","[(couldnt, find), (find, hand), (hand, sanitiz...",fear
2,4,44956,Chicagoland,2020-03-02,panic buying hits newyork city anxious shopper...,Negative,"[panic buying, buying hits, hits newyork, newy...","[(panic, buying), (buying, hits), (hits, newyo...",positive
3,5,44957,"Melbourne, Victoria",2020-03-03,toiletpaper dunnypaper coronavirus coronavirus...,Neutral,"[toiletpaper dunnypaper, dunnypaper coronaviru...","[(toiletpaper, dunnypaper), (dunnypaper, coron...",positive
4,6,44958,Los Angeles,2020-03-03,remember last time paid 299 gallon regular gas...,Neutral,"[do you, you remember, remember the, the last,...","[(remember, last), (last, time), (time, paid),...",anticipation
...,...,...,...,...,...,...,...,...,...
35526,44946,89898,"Brooklyn, NY",2020-04-14,really shitting much home covid-19 toiletpaper,Negative,"[yâ ’, ’ all, all really, really shitting, shi...","[(yâ, ’), (’, really), (really, shitting), (sh...",fear
35527,44948,89900,"Toronto, Ontario",2020-04-14,still shocked number toronto supermarket emplo...,Negative,"[still shocked, shocked by, by the, the number...","[(still, shocked), (shocked, number), (number,...",positive
35528,44949,89901,OHIO,2020-04-14,never situation amp world going supermarket pi...,Positive,"[i never, never that, that weâ, weâ ’, ’ d, d ...","[(never, weâ), (weâ, ’), (’, situation), (situ...",anticipation
35529,44951,89903,"Wellington City, New Zealand",2020-04-14,airline pilots offering stock supermarket shel...,Neutral,"[airline pilots, pilots offering, offering to,...","[(airline, pilots), (pilots, offering), (offer...",trust


# Cleaning the texts by creating a new collumn called "cleaned_text" 

In [143]:
import pandas as pd
import re
import string
import nltk
from nltk import word_tokenize, ngrams
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
from nrclex import NRCLex
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [144]:
# Load and clean dataset
def load_and_clean_data(file_1, file_2):
    df_1 = pd.read_csv(file_1)
    df_1.dropna(inplace=True)
    df_1.drop_duplicates(inplace=True)

    df_2 = pd.read_csv(file_2)
    df_2.dropna(inplace=True)
    df_2.drop_duplicates(inplace=True)

    merged_df = pd.concat([df_1, df_2], ignore_index=True)
    merged_df['TweetAt'] = pd.to_datetime(merged_df['TweetAt'], format='%d-%m-%Y')

    return merged_df

merged_df = load_and_clean_data('Corona_NLP_test2.csv', 'Corona_NLP_train.csv')

# Filter data by the month of March and April
filtered_df = merged_df[merged_df['TweetAt'].dt.month.isin([3, 4])].copy()

# Text cleaning
def clean_text(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    return text

filtered_df.loc[:, 'OriginalTweet'] = filtered_df['OriginalTweet'].apply(clean_text)

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove RT (retweet sign)
    text = re.sub(r'rt[\s]+', '', text)
    # Remove mentions
    text = re.sub(r'@\S+', '', text)
    # Remove all non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove leading and trailing whitespaces
    text = text.strip()
    # Remove stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

filtered_df['Text_Cleaned'] = filtered_df['OriginalTweet'].apply(preprocess_text)

# Function to detect emotion
def get_emotion(text):
    emotion = NRCLex(text)
    freq_dict = emotion.affect_frequencies
    return max(freq_dict, key=freq_dict.get)

# Add emotion to dataframe
filtered_df['Emotion'] = filtered_df['Text_Cleaned'].apply(get_emotion)

In [145]:
filtered_df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Text_Cleaned,Emotion
0,1,44953,NYC,2020-03-02,trending new yorkers encounter empty supermark...,Extremely Negative,trending new yorkers encounter empty supermark...,fear
1,2,44954,"Seattle, WA",2020-03-02,when i couldnt find hand sanitizer at fred mey...,Positive,couldnt find hand sanitizer fred meyer turned ...,fear
2,4,44956,Chicagoland,2020-03-02,panic buying hits newyork city as anxious shop...,Negative,panic buying hits newyork city anxious shopper...,positive
3,5,44957,"Melbourne, Victoria",2020-03-03,toiletpaper dunnypaper coronavirus coronavirus...,Neutral,toiletpaper dunnypaper coronavirus coronavirus...,positive
4,6,44958,Los Angeles,2020-03-03,do you remember the last time you paid 299 a g...,Neutral,remember last time paid 299 gallon regular gas...,anticipation
...,...,...,...,...,...,...,...,...
35526,44946,89898,"Brooklyn, NY",2020-04-14,yâ’all really shitting that much more at home ...,Negative,yâ really shitting much home covid19 coronavir...,fear
35527,44948,89900,"Toronto, Ontario",2020-04-14,still shocked by the number of toronto superma...,Negative,still shocked number toronto supermarket emplo...,positive
35528,44949,89901,OHIO,2020-04-14,i never that weâ’d be in a situation amp world...,Positive,never weâ situation amp world going supermarke...,anticipation
35529,44951,89903,"Wellington City, New Zealand",2020-04-14,airline pilots offering to stock supermarket s...,Neutral,airline pilots offering stock supermarket shel...,trust
