In [1]:
import numpy as np
import pandas as pd
import csv
import advertools as adv
import nltk
import string
import re

# Stop words libraries

In [2]:
stopwords = nltk.corpus.stopwords.words('english')

In [3]:
stopwordsT = adv.stopwords['tagalog']
sorted(adv.stopwords['tagalog'])[:20]

['akin',
 'aking',
 'ako',
 'alin',
 'am',
 'amin',
 'aming',
 'ang',
 'ano',
 'anumang',
 'apat',
 'at',
 'atin',
 'ating',
 'ay',
 'bababa',
 'bago',
 'bakit',
 'bawat',
 'bilang']

# Import datasets

In [4]:
df = pd.read_csv("Englishs.csv")
df["tweet"] = df["tweets"].str.lower()
df.head()

Unnamed: 0,tweets,score,tweet
0,RIP sa mga pang lakad na damit na ginawa na la...,1,rip sa mga pang lakad na damit na ginawa na la...
1,Ever since the rise and fall of Dictator Ferdi...,2,ever since the rise and fall of dictator ferdi...
2,Sa ika- apatnapu't siyam na taon mula nang ipi...,2,sa ika- apatnapu't siyam na taon mula nang ipi...
3,#NeverForget #NeverAgain #OustDuterte #Impeach...,1,#neverforget #neveragain #oustduterte #impeach...
4,"Uy, gagi! 'Wag mong palampasin ang araw na 'to...",2,"uy, gagi! 'wag mong palampasin ang araw na 'to..."


# Removal of Punctiations and URLs

In [5]:
def depure_data(df):

    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www.\S+')
    df = url_pattern.sub(r'', df)

    # Remove Emails
    df = re.sub('\S@\S\s?', '', df)

    # Remove new line characters
    df = re.sub('\s+', ' ', df)

    # Remove distracting single quotes
    df = re.sub("'", "", df)
    
    #Remove @ tags (mentions)
    df = re.sub(r'@[A-Za-z0-9]+','',df)

    #Remove hashtags
    #df = re.sub("[^a-zA-Z]", " ", df)

    #Remove non-alphanumeric characters
    df = re.sub("[^a-z0-9]"," ", df)

    return df

df['tweet1'] = df['tweet'].apply(lambda x: depure_data(x))
df

Unnamed: 0,tweets,score,tweet,tweet1
0,RIP sa mga pang lakad na damit na ginawa na la...,1,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...
1,Ever since the rise and fall of Dictator Ferdi...,2,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...
2,Sa ika- apatnapu't siyam na taon mula nang ipi...,2,sa ika- apatnapu't siyam na taon mula nang ipi...,sa ika apatnaput siyam na taon mula nang ipin...
3,#NeverForget #NeverAgain #OustDuterte #Impeach...,1,#neverforget #neveragain #oustduterte #impeach...,neverforget neveragain oustduterte impeach...
4,"Uy, gagi! 'Wag mong palampasin ang araw na 'to...",2,"uy, gagi! 'wag mong palampasin ang araw na 'to...",uy gagi wag mong palampasin ang araw na to u...
...,...,...,...,...
3205,"@mariaressa Romeo, Nasapol mo Isko, ang galing...",1,"@mariaressa romeo, nasapol mo isko, ang galing...",romeo nasapol mo isko ang galing medyo kul...
3206,@ADPR72463 sabi dati daw yang rebelde noong pa...,1,@adpr72463 sabi dati daw yang rebelde noong pa...,sabi dati daw yang rebelde noong panahon ng m...
3207,Help slow the spread of #COVID19 and identify ...,1,help slow the spread of #covid19 and identify ...,help slow the spread of covid19 and identify ...
3208,"“It is emotionally tough to see patients, figh...",1,"“it is emotionally tough to see patients, figh...",it is emotionally tough to see patients figh...


In [6]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

df['text'] = df['tweet1'].apply(lambda x: remove_punct(x))

df.head(20)

Unnamed: 0,tweets,score,tweet,tweet1,text
0,RIP sa mga pang lakad na damit na ginawa na la...,1,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...
1,Ever since the rise and fall of Dictator Ferdi...,2,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...
2,Sa ika- apatnapu't siyam na taon mula nang ipi...,2,sa ika- apatnapu't siyam na taon mula nang ipi...,sa ika apatnaput siyam na taon mula nang ipin...,sa ika apatnaput siyam na taon mula nang ipin...
3,#NeverForget #NeverAgain #OustDuterte #Impeach...,1,#neverforget #neveragain #oustduterte #impeach...,neverforget neveragain oustduterte impeach...,neverforget neveragain oustduterte impeach...
4,"Uy, gagi! 'Wag mong palampasin ang araw na 'to...",2,"uy, gagi! 'wag mong palampasin ang araw na 'to...",uy gagi wag mong palampasin ang araw na to u...,uy gagi wag mong palampasin ang araw na to u...
5,"akala ko burger king, snr pala hehehe okay po",1,"akala ko burger king, snr pala hehehe okay po",akala ko burger king snr pala hehehe okay po,akala ko burger king snr pala hehehe okay po
6,@GemhilGeorge @DickGordonDG Ilang Oust Duterte...,2,@gemhilgeorge @dickgordondg ilang oust duterte...,ilang oust duterte na nga kayo eh na oust...,ilang oust duterte na nga kayo eh na oust...
7,no time to wait for 2022! if the policeâ€™s ac...,1,no time to wait for 2022! if the policeâ€™s ac...,no time to wait for 2022 if the police s ac...,no time to wait for 2022 if the police s ac...
8,tangina ano na,1,tangina ano na,tangina ano na,tangina ano na
9,oust duterte,1,oust duterte,oust duterte,oust duterte


# Tokenization

In [7]:
# Define a function to split our sentences into a list of words
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df['tweet_Tokenized'] = df['text'].apply(lambda x: tokenize(x.lower()))

df.head()

Unnamed: 0,tweets,score,tweet,tweet1,text,tweet_Tokenized
0,RIP sa mga pang lakad na damit na ginawa na la...,1,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...,"[rip, sa, mga, pang, lakad, na, damit, na, gin..."
1,Ever since the rise and fall of Dictator Ferdi...,2,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...,"[ever, since, the, rise, and, fall, of, dictat..."
2,Sa ika- apatnapu't siyam na taon mula nang ipi...,2,sa ika- apatnapu't siyam na taon mula nang ipi...,sa ika apatnaput siyam na taon mula nang ipin...,sa ika apatnaput siyam na taon mula nang ipin...,"[sa, ika, apatnaput, siyam, na, taon, mula, na..."
3,#NeverForget #NeverAgain #OustDuterte #Impeach...,1,#neverforget #neveragain #oustduterte #impeach...,neverforget neveragain oustduterte impeach...,neverforget neveragain oustduterte impeach...,"[, neverforget, neveragain, oustduterte, impea..."
4,"Uy, gagi! 'Wag mong palampasin ang araw na 'to...",2,"uy, gagi! 'wag mong palampasin ang araw na 'to...",uy gagi wag mong palampasin ang araw na to u...,uy gagi wag mong palampasin ang araw na to u...,"[uy, gagi, wag, mong, palampasin, ang, araw, n..."


# Stopwords Removal

In [8]:
# Define a function to remove all stopwords
def remove_stopwords(tokenized_text):    
    text = [word for word in tokenized_text if word not in stopwords]
    return text

df['cleanedTweet'] = df['tweet_Tokenized'].apply(lambda x: remove_stopwords(x))

df.head()

Unnamed: 0,tweets,score,tweet,tweet1,text,tweet_Tokenized,cleanedTweet
0,RIP sa mga pang lakad na damit na ginawa na la...,1,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...,"[rip, sa, mga, pang, lakad, na, damit, na, gin...","[rip, sa, mga, pang, lakad, na, damit, na, gin..."
1,Ever since the rise and fall of Dictator Ferdi...,2,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...,"[ever, since, the, rise, and, fall, of, dictat...","[ever, since, rise, fall, dictator, ferdinand,..."
2,Sa ika- apatnapu't siyam na taon mula nang ipi...,2,sa ika- apatnapu't siyam na taon mula nang ipi...,sa ika apatnaput siyam na taon mula nang ipin...,sa ika apatnaput siyam na taon mula nang ipin...,"[sa, ika, apatnaput, siyam, na, taon, mula, na...","[sa, ika, apatnaput, siyam, na, taon, mula, na..."
3,#NeverForget #NeverAgain #OustDuterte #Impeach...,1,#neverforget #neveragain #oustduterte #impeach...,neverforget neveragain oustduterte impeach...,neverforget neveragain oustduterte impeach...,"[, neverforget, neveragain, oustduterte, impea...","[, neverforget, neveragain, oustduterte, impea..."
4,"Uy, gagi! 'Wag mong palampasin ang araw na 'to...",2,"uy, gagi! 'wag mong palampasin ang araw na 'to...",uy gagi wag mong palampasin ang araw na to u...,uy gagi wag mong palampasin ang araw na to u...,"[uy, gagi, wag, mong, palampasin, ang, araw, n...","[uy, gagi, wag, mong, palampasin, ang, araw, n..."


In [9]:
def remove_tagalog_stopwords(tokenized_text):    
    text = [word for word in tokenized_text if word not in stopwordsT]
    return text

df['cleanedTweet2'] = df['cleanedTweet'].apply(lambda x: remove_tagalog_stopwords(x))

df.head()

Unnamed: 0,tweets,score,tweet,tweet1,text,tweet_Tokenized,cleanedTweet,cleanedTweet2
0,RIP sa mga pang lakad na damit na ginawa na la...,1,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...,rip sa mga pang lakad na damit na ginawa na la...,"[rip, sa, mga, pang, lakad, na, damit, na, gin...","[rip, sa, mga, pang, lakad, na, damit, na, gin...","[rip, pang, lakad, damit, lang, pang, bahay, o..."
1,Ever since the rise and fall of Dictator Ferdi...,2,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...,ever since the rise and fall of dictator ferdi...,"[ever, since, the, rise, and, fall, of, dictat...","[ever, since, rise, fall, dictator, ferdinand,...","[ever, since, rise, fall, dictator, ferdinand,..."
2,Sa ika- apatnapu't siyam na taon mula nang ipi...,2,sa ika- apatnapu't siyam na taon mula nang ipi...,sa ika apatnaput siyam na taon mula nang ipin...,sa ika apatnaput siyam na taon mula nang ipin...,"[sa, ika, apatnaput, siyam, na, taon, mula, na...","[sa, ika, apatnaput, siyam, na, taon, mula, na...","[ika, apatnaput, siyam, taon, nang, ipinataw, ..."
3,#NeverForget #NeverAgain #OustDuterte #Impeach...,1,#neverforget #neveragain #oustduterte #impeach...,neverforget neveragain oustduterte impeach...,neverforget neveragain oustduterte impeach...,"[, neverforget, neveragain, oustduterte, impea...","[, neverforget, neveragain, oustduterte, impea...","[, neverforget, neveragain, oustduterte, impea..."
4,"Uy, gagi! 'Wag mong palampasin ang araw na 'to...",2,"uy, gagi! 'wag mong palampasin ang araw na 'to...",uy gagi wag mong palampasin ang araw na to u...,uy gagi wag mong palampasin ang araw na to u...,"[uy, gagi, wag, mong, palampasin, ang, araw, n...","[uy, gagi, wag, mong, palampasin, ang, araw, n...","[uy, gagi, wag, mong, palampasin, araw, upang,..."


In [10]:
clean_df = df.drop(['tweets','tweet','tweet_Tokenized','cleanedTweet','text','tweet1'], axis=1)
clean_df

Unnamed: 0,score,cleanedTweet2
0,1,"[rip, pang, lakad, damit, lang, pang, bahay, o..."
1,2,"[ever, since, rise, fall, dictator, ferdinand,..."
2,2,"[ika, apatnaput, siyam, taon, nang, ipinataw, ..."
3,1,"[, neverforget, neveragain, oustduterte, impea..."
4,2,"[uy, gagi, wag, mong, palampasin, araw, upang,..."
...,...,...
3205,1,"[, romeo, nasapol, mo, isko, galing, medyo, ta..."
3206,1,"[, dati, daw, yang, rebelde, noong, martial, l..."
3207,1,"[help, slow, spread, covid19, identify, risk, ..."
3208,1,"[, emotionally, tough, see, patients, fighting..."


In [11]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3210 entries, 0 to 3209
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   score          3210 non-null   object
 1   cleanedTweet2  3210 non-null   object
dtypes: object(2)
memory usage: 50.3+ KB


In [12]:
#clean_df.to_csv(r"Preprocessed english.csv", index = False)

# Lemmatization

In [13]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [14]:
lemmatizer = WordNetLemmatizer()

lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in clean_df['cleanedTweet2'][3209]])
print(lemmatized_output)

vaccine used southafrica tested approved sahpra serious side effect could cause long term health problem extremely unlikely following vaccination including covid19 vaccination 


In [15]:
def lemmatize(s):
     s = [lemmatizer.lemmatize(word) for word in s]
     return s

df5 = clean_df.assign(col_lemma = clean_df['cleanedTweet2'].apply(lambda x: lemmatize(x)))
df5

Unnamed: 0,score,cleanedTweet2,col_lemma
0,1,"[rip, pang, lakad, damit, lang, pang, bahay, o...","[rip, pang, lakad, damit, lang, pang, bahay, o..."
1,2,"[ever, since, rise, fall, dictator, ferdinand,...","[ever, since, rise, fall, dictator, ferdinand,..."
2,2,"[ika, apatnaput, siyam, taon, nang, ipinataw, ...","[ika, apatnaput, siyam, taon, nang, ipinataw, ..."
3,1,"[, neverforget, neveragain, oustduterte, impea...","[, neverforget, neveragain, oustduterte, impea..."
4,2,"[uy, gagi, wag, mong, palampasin, araw, upang,...","[uy, gagi, wag, mong, palampasin, araw, upang,..."
...,...,...,...
3205,1,"[, romeo, nasapol, mo, isko, galing, medyo, ta...","[, romeo, nasapol, mo, isko, galing, medyo, ta..."
3206,1,"[, dati, daw, yang, rebelde, noong, martial, l...","[, dati, daw, yang, rebelde, noong, martial, l..."
3207,1,"[help, slow, spread, covid19, identify, risk, ...","[help, slow, spread, covid19, identify, risk, ..."
3208,1,"[, emotionally, tough, see, patients, fighting...","[, emotionally, tough, see, patient, fighting,..."


In [16]:
df6 = df5.drop(['cleanedTweet2'], axis=1)
df6

Unnamed: 0,score,col_lemma
0,1,"[rip, pang, lakad, damit, lang, pang, bahay, o..."
1,2,"[ever, since, rise, fall, dictator, ferdinand,..."
2,2,"[ika, apatnaput, siyam, taon, nang, ipinataw, ..."
3,1,"[, neverforget, neveragain, oustduterte, impea..."
4,2,"[uy, gagi, wag, mong, palampasin, araw, upang,..."
...,...,...
3205,1,"[, romeo, nasapol, mo, isko, galing, medyo, ta..."
3206,1,"[, dati, daw, yang, rebelde, noong, martial, l..."
3207,1,"[help, slow, spread, covid19, identify, risk, ..."
3208,1,"[, emotionally, tough, see, patient, fighting,..."


In [17]:
df6.to_csv(r"Preprocessed english.csv", index = False)