# INTRNLP Twitter

Insert short introduction here.

# Import Modules

In [1]:
import numpy as np
import pandas as pd
import re
import os

# Load Data

In [2]:
def check_data_exists(data):
    return os.path.exists(data)

In [3]:
if check_data_exists("all_tweets.csv"):
    df = pd.read_csv("all_tweets.csv")
    df.drop(["Unnamed: 0"], inplace=True, axis=1)
else:
    for i in range(0, 24):
        if i == 0:
            df = pd.read_csv(f"Data/tweets_{i}.csv")
        else:
            more_df = pd.read_csv(f"Data/tweets_{i}.csv")
            df.append(more_df, ignore_index=True)
    df.to_csv("all_tweets.csv")

In [4]:
df.head(5)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text
0,1264847297260605441,303106526,2020-05-25 17:14:30,tl,@alphangela bes wag ganyan ang attitude char i...
1,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow
2,1264847299198414848,1074247547907174401,2020-05-25 17:14:30,tl,@Kookie07Jeon Can't stop laughing tf HAHAHAHAH...
3,1264847299210928128,1010402467878658048,2020-05-25 17:14:30,tl,@azilhanna09 HAHAHAHHAHAHAHHAAA ML NA LANG
4,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...


# Preprocessing

Let's check if there are any null values.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   tweet_id          250000 non-null  int64 
 1   author_id         250000 non-null  int64 
 2   created_at_utc+8  250000 non-null  object
 3   lang              250000 non-null  object
 4   text              250000 non-null  object
dtypes: int64(2), object(3)
memory usage: 9.5+ MB


There are no null values in the corpus. We can start checking the texts. Before that, we will acquire all of the English texts available in the corpus.

In [6]:
df["lang"].unique()

array(['tl', 'en', 'und', 'et', 'es', 'ko', 'nl', 'tr', 'in', 'hi', 'fi',
       'pt', 'de', 'it', 'ca', 'ht', 'pl', 'lt', 'hu', 'sv', 'el', 'cy',
       'da', 'fr', 'cs', 'no', 'eu', 'lv', 'ja', 'sl', 'vi', 'is', 'ro',
       'ar', 'th', 'zh', 'ur', 'ru', 'iw', 'bn'], dtype=object)

In [7]:
eng_df = df[df["lang"] == "en"]
eng_df = eng_df.reset_index()
eng_df.drop(["index"], inplace=True, axis=1)
eng_df.head(5)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...


All of the English tweets are acquired. We will have to clean the texts now. We will store the clean texts as a new column. First, we will remove all `@user` tags since they don't play a role in finding sentiment.

In [8]:
def remove_text_regex(text, regex):
    patterns = re.findall(regex, text)
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

In [9]:
def extract_text_regex(text, regex):
    patterns = re.findall(regex, text)
    for pattern in patterns:
        text = ' '.join(pattern)
    return text

In [10]:
eng_df["clean_text"] = np.vectorize(remove_text_regex)(eng_df["text"], "@[\w]*")
eng_df.head(5)

Unnamed: 0,tweet_id,author_id,created_at_utc+8,lang,text,clean_text
0,1264847299030597632,2647675584,2020-05-25 17:14:30,en,Bridgetowne yow,Bridgetowne yow
1,1264847301568159746,947031284177502208,2020-05-25 17:14:31,en,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...,One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2,1264847307943473152,569607182,2020-05-25 17:14:32,en,@killakushla Loveyou!!!,Loveyou!!!
3,1264847328491393024,916702759537090560,2020-05-25 17:14:37,en,byeee imma listen to the b-tracks #Delight,byeee imma listen to the b-tracks #Delight
4,1264847331788066816,1246426505871126530,2020-05-25 17:14:38,en,@gmawowowin 0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPO...,0⃣9⃣3⃣0⃣ 3⃣1⃣8⃣ 8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...


We have our new column to store clean texts. However, we are far from done. Let's check if there are links found in the texts. If links are present, we will remove them as well.

In [11]:
eng_df["clean_text"].head(20)

0                                       Bridgetowne yow
1     One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2                                            Loveyou!!!
3            byeee imma listen to the b-tracks #Delight
4      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
5               \n\n-cutie pie\n-baby face\n-gorgeous!!
6      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
7     No empty carbs day starts tomorrow! Help me. H...
8                                Every wait has a worth
9     Snack Time🥤🍝🍔🍟 from McDonald's  when we crave ...
10    Already out on Spotify \n#baekhyun #Delight #B...
11    Did you all notice those City Lights posters? ...
12    Helluva Summer 🔥 #SummerTime https://t.co/j5BL...
13     I Don't get this?\nI fixed Everything and thi...
14    Not in any way close to what I feel right now....
15                 as in!!! 😍😍😍 https://t.co/102rhUvrnX
16                    Superpower to bring in more POGO?
17             stream candy now https://t.co/Pwz

In [12]:
eng_df["clean_text"] = np.vectorize(remove_text_regex)(eng_df["clean_text"], "https?:\/\/[a-zA-Z0-9._]+\/?[a-zA-Z0-9._]+")
eng_df["clean_text"].head(20)

0                                       Bridgetowne yow
1     One of my fav🤧♥️\n#백현 #BAEKHYUN #엑소 #EXO #wear...
2                                            Loveyou!!!
3            byeee imma listen to the b-tracks #Delight
4      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
5               \n\n-cutie pie\n-baby face\n-gorgeous!!
6      0⃣9⃣3⃣0⃣  3⃣1⃣8⃣  8⃣6⃣8⃣1⃣\nMARIPOL SEBASTIAN...
7     No empty carbs day starts tomorrow! Help me. H...
8                                Every wait has a worth
9     Snack Time🥤🍝🍔🍟 from McDonald's  when we crave ...
10    Already out on Spotify \n#baekhyun #Delight #B...
11    Did you all notice those City Lights posters? ...
12                        Helluva Summer 🔥 #SummerTime 
13     I Don't get this?\nI fixed Everything and thi...
14    Not in any way close to what I feel right now....
15                                        as in!!! 😍😍😍 
16                    Superpower to bring in more POGO?
17                                    stream can