# INTRNLP Twitter

Insert short introduction here.

# Import Modules

In [47]:
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import emoji
import nltk

In [48]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\niloj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Data

In [2]:
def check_data_exists(data):
    return os.path.exists(data)

In [3]:
if check_data_exists("all_tweets.csv"):
    df = pd.read_csv("all_tweets.csv")
    df.drop(["Unnamed: 0"], inplace=True, axis=1)
else:
    for i in range(0, 24):
        if i == 0:
            df = pd.read_csv(f"Data/tweets_{i}.csv")
        else:
            more_df = pd.read_csv(f"Data/tweets_{i}.csv")
            df.append(more_df, ignore_index=True)
    df.to_csv("all_tweets.csv")

In [4]:
df.head(5)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text
0,1264847297260605441,303106526,2020-05-25 17:14:30,tl,@alphangela bes wag ganyan ang attitude char i...
1,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow
2,1264847299198414848,1074247547907174401,2020-05-25 17:14:30,tl,@Kookie07Jeon Can't stop laughing tf HAHAHAHAH...
3,1264847299210928128,1010402467878658048,2020-05-25 17:14:30,tl,@azilhanna09 HAHAHAHHAHAHAHHAAA ML NA LANG
4,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...


# Preprocessing

Let's check if there are any null values.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   tweet_id          250000 non-null  int64 
 1   author_id         250000 non-null  int64 
 2   created_at_utc+8  250000 non-null  object
 3   lang              250000 non-null  object
 4   text              250000 non-null  object
dtypes: int64(2), object(3)
memory usage: 9.5+ MB


There are no null values in the corpus. We can start checking the texts. Before that, we will acquire all of the English texts available in the corpus.

In [6]:
df["lang"].unique()

array(['tl', 'en', 'und', 'et', 'es', 'ko', 'nl', 'tr', 'in', 'hi', 'fi',
       'pt', 'de', 'it', 'ca', 'ht', 'pl', 'lt', 'hu', 'sv', 'el', 'cy',
       'da', 'fr', 'cs', 'no', 'eu', 'lv', 'ja', 'sl', 'vi', 'is', 'ro',
       'ar', 'th', 'zh', 'ur', 'ru', 'iw', 'bn'], dtype=object)

In [7]:
eng_df = df[df["lang"] == "en"]
eng_df = eng_df.reset_index()
eng_df.drop(["index"], inplace=True, axis=1)
eng_df.head(5)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...


All of the English tweets are acquired. We will have to clean the texts now. We will store the clean texts as a new column. First, we will remove all `@user` tags since they don't play a role in finding sentiment.

In [8]:
def remove_text_regex(text, regex):
    patterns = re.findall(regex, text)
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

In [9]:
def give_emoji_free_text(text):
    return emoji.get_emoji_regexp().sub(r'', text)

In [56]:
def lemma_text(text):
    new_text = ""
    lemma = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    for token in tokens:
        new_text = new_text + " " + token
    return new_text

In [10]:
eng_df["clean_text"] = np.vectorize(remove_text_regex)(eng_df["text"], "@[\w]*")
eng_df.head(5)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text,clean_text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow,Bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!,Loveyou!!!
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight,byeee imma listen to the b-tracks #Delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...


We have our new column to store clean texts. However, we are far from done. Let's check if there are links found in the texts. If links are present, we will remove them as well.

In [11]:
eng_df["clean_text"].head(20)

0                                       Bridgetowne yow
1     One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2                                            Loveyou!!!
3            byeee imma listen to the b-tracks #Delight
4      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
5               \n\n-cutie pie\n-baby face\n-gorgeous!!
6      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
7     No empty carbs day starts tomorrow! Help me. H...
8                                Every wait has a worth
9     Snack Time🥤🍝🍔🍟 from McDonald's  when we crave ...
10    Already out on Spotify \n#baekhyun #Delight #B...
11    Did you all notice those City Lights posters? ...
12    Helluva Summer 🔥 #SummerTime https://t.co/j5BL...
13     I Don't get this?\nI fixed Everything and thi...
14    Not in any way close to what I feel right now....
15                 as in!!! 😍😍😍 https://t.co/102rhUvrnX
16                    Superpower to bring in more POGO?
17             stream candy now https://t.co/Pwz

In [12]:
eng_df["clean_text"] = np.vectorize(remove_text_regex)(eng_df["clean_text"], "https?:\/\/[a-zA-Z0-9._]+\/?[a-zA-Z0-9._]+")
eng_df["clean_text"].head(20)

0                                       Bridgetowne yow
1     One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2                                            Loveyou!!!
3            byeee imma listen to the b-tracks #Delight
4      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
5               \n\n-cutie pie\n-baby face\n-gorgeous!!
6      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
7     No empty carbs day starts tomorrow! Help me. H...
8                                Every wait has a worth
9     Snack Time🥤🍝🍔🍟 from McDonald's  when we crave ...
10    Already out on Spotify \n#baekhyun #Delight #B...
11    Did you all notice those City Lights posters? ...
12                        Helluva Summer 🔥 #SummerTime 
13     I Don't get this?\nI fixed Everything and thi...
14    Not in any way close to what I feel right now....
15                                        as in!!! 😍😍😍 
16                    Superpower to bring in more POGO?
17                                    stream can

As we can notice with the clean texts, there are some `\n` and other string returns that we need to delete. They are not significant after preprocessing.

In [13]:
for i in range(len(eng_df["clean_text"])):
    eng_df["clean_text"][i] = eng_df["clean_text"][i].replace('\n', ' ').replace('\r', '').replace('\t', ' ')
eng_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_df["clean_text"][i] = eng_df["clean_text"][i].replace('\n', ' ').replace('\r', '').replace('\t', ' ')


Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text,clean_text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow,Bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...,One of my fav🤧♥️ #백현 #BAEKHYUN #엑소 #EXO #weare...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!,Loveyou!!!
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight,byeee imma listen to the b-tracks #Delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣ MARIPOL SEBASTIAN ...
5,1264847349412696064,811738165400641536,2020-05-25 17:14:42,en,@Alonzotrishaa\n\n-cutie pie\n-baby face\n-gor...,-cutie pie -baby face -gorgeous!!
6,1264847379007598592,1246426505871126530,2020-05-25 17:14:49,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣ MARIPOL SEBASTIAN ...
7,1264847404118839296,750706888354910208,2020-05-25 17:14:55,en,No empty carbs day starts tomorrow! Help me. H...,No empty carbs day starts tomorrow! Help me. H...
8,1264847404718669827,727839200653901824,2020-05-25 17:14:55,en,Every wait has a worth,Every wait has a worth
9,1264847406606233600,459997572,2020-05-25 17:14:56,en,Snack Time🥤🍝🍔🍟 from McDonald's when we crave ...,Snack Time🥤🍝🍔🍟 from McDonald's when we crave ...


Now, we are going to remove the emojis.

In [14]:
eng_df["clean_text"] = np.vectorize(give_emoji_free_text)(eng_df["clean_text"])
eng_df.head(10)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text,clean_text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow,Bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...,One of my fav #백현 #BAEKHYUN #엑소 #EXO #weareone...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!,Loveyou!!!
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight,byeee imma listen to the b-tracks #Delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,MARIPOL SEBASTIAN LAS PIÑAS CITY #Shop...
5,1264847349412696064,811738165400641536,2020-05-25 17:14:42,en,@Alonzotrishaa\n\n-cutie pie\n-baby face\n-gor...,-cutie pie -baby face -gorgeous!!
6,1264847379007598592,1246426505871126530,2020-05-25 17:14:49,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,MARIPOL SEBASTIAN LAS PIÑAS CITY #Shop...
7,1264847404118839296,750706888354910208,2020-05-25 17:14:55,en,No empty carbs day starts tomorrow! Help me. H...,No empty carbs day starts tomorrow! Help me. H...
8,1264847404718669827,727839200653901824,2020-05-25 17:14:55,en,Every wait has a worth,Every wait has a worth
9,1264847406606233600,459997572,2020-05-25 17:14:56,en,Snack Time🥤🍝🍔🍟 from McDonald's when we crave ...,Snack Time from McDonald's when we crave for ...


We will make all characters in lowercase format to reduce variation.

In [15]:
for i in range(len(eng_df["clean_text"])):
    eng_df["clean_text"][i] = eng_df["clean_text"][i].lower()
eng_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_df["clean_text"][i] = eng_df["clean_text"][i].lower()


Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text,clean_text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow,bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...,one of my fav #백현 #baekhyun #엑소 #exo #weareone...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!,loveyou!!!
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight,byeee imma listen to the b-tracks #delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,maripol sebastian las piñas city #shop...


In [57]:
eng_df["lemma_text"] = np.vectorize(lemma_text)(eng_df["clean_text"])
eng_df.head(5)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text,clean_text,lemma_text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow,bridgetowne yow,bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...,one of my fav #백현 #baekhyun #엑소 #exo #weareone...,one of my fav # 백현 # baekhyun # 엑소 # exo # we...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!,loveyou!!!,loveyou ! ! !
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight,byeee imma listen to the b-tracks #delight,byeee imma listen to the b-tracks # delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,maripol sebastian las piñas city #shop...,maripol sebastian las piñas city # shopee # f...


In [24]:
regex = "#[A-Za-z0-9']+"
hashtags = []

for i in range(len(eng_df["clean_text"])):
    words = re.findall(regex, eng_df["clean_text"][i])
    hashtag = ' '.join(words)
    hashtags.append(hashtag)
    
hashtags = pd.Series(hashtags)

In [25]:
print(hashtags)

0                                                         
1              #baekhyun #exo #weareoneexo #delight #candy
2                                                         
3                                                 #delight
4                        #shopee #frontrow #gencee #plemex
                               ...                        
73139                                                     
73140    #blacklivesmatter #justiceforgeogefloyd #black...
73141                                                     
73142                                                     
73143                                                     
Length: 73144, dtype: object


In [41]:
tweets = 0
for i in range(len(hashtags)):
    if hashtags.loc[i].find("#covid") != -1 or hashtags.loc[i].find("#pandemic") != -1 or hashtags.loc[i].find("#vaccine") != -1:
        print(hashtags.loc[i])
        tweets += 1
print(tweets)

#shopee #frontrow #gencee #plemex #tutoktowinsawowowin #pandemicperiod
#rdinthemetro #buhayurban #covid19ph
#stayathomechallenge #covid19 #coronovirusoutbreak #coronavirus #wakandaforever #restrictedmovementordermy #lockdown #malaysia
#covid19
#wishfulthinking #beachlife #missingthebeach #pinaswonders #pinastravel #goprohero8 #communityquarantine #covid
#covid
#covid19 #philippines #covid19ph #coronavirus #covid
#manila #cagayandeoro #pandemic
#labanglungs #covid19
#philippines #pandemic #frontliners #surgery #themedicalcity #tmc #nurse #nurses
#harryroque #mecq #ncr #communityquarantine #covid19ph
#covid #covidhaircut
#upcovid19responseteam #upri #covid19ph
#mecq #covid19ph #pasigriver #manila #ripples #waterbender #blackandwhite #summer2020
#covid19
#mothernature #healing #covid19philippines #peace #whyweshouldleaveearth #followme #followers #followforfollow
#covid19
#2012 #boracay #island #world #bestfriends #best #photography #covid19 #philppines
#intramuros #prepandemic #readytotr

In [58]:
vader = SentimentIntensityAnalyzer()
for i in range(100):
    #print(eng_df.loc[i]["clean_text"])
    print(i)
    print(vader.polarity_scores(eng_df.loc[i]["text"]))
    print(vader.polarity_scores(eng_df.loc[i]["clean_text"]))
    print(vader.polarity_scores(eng_df.loc[i]["lemma_text"]))

0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
1
{'neg': 0.0, 'neu': 0.59, 'pos': 0.41, 'compound': 0.9022}
{'neg': 0.0, 'neu': 0.592, 'pos': 0.408, 'compound': 0.7845}
{'neg': 0.0, 'neu': 0.723, 'pos': 0.277, 'compound': 0.7845}
2
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
3
{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'compound': 0.5994}
{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'compound': 0.5994}
{'neg': 0.0, 'neu': 0.642, 'pos': 0.358, 'compound': 0.5994}
4
{'neg': 0.224, 'neu': 0.776, 'pos': 0.0, 'compound': -0.6523}
{'neg': 0.247, 'neu': 0.753, 'pos': 0.0, 'compound': -0.5574}
{'neg': 0.194, 'neu': 0.806, 'pos': 0.0, 'compound': -0.5574}
5
{'neg': 0.0, 'neu': 0.361, 'pos': 0.639, 'compound': 0.7955}
{'neg': 0.0, 'neu': 0.298, 'pos': 0.702, 'compound': 

 Three bean inside a colored bag ha escaped !
