In [1]:
import numpy as np
import pandas as pd
import csv
import advertools as adv
import nltk
import string
import re

# Stop words libraries

In [2]:
stopwords = nltk.corpus.stopwords.words('english')

In [3]:
stopwordsT = adv.stopwords['tagalog']
sorted(adv.stopwords['tagalog'])[:10]

['akin',
 'aking',
 'ako',
 'alin',
 'am',
 'amin',
 'aming',
 'ang',
 'ano',
 'anumang']

# Import datasets

In [4]:
df = pd.read_csv("Tagalogs.csv")
df["tweet"] = df["tweets"].str.lower()
df.head()

Unnamed: 0,tweets,score,tweet
0,"Uy, gagi! 'Wag kang magpaniwala sa haka-hakang...",1,"uy, gagi! 'wag kang magpaniwala sa haka-hakang..."
1,"Marcos, Duterte, walang pinag-iba! Parehong tu...",2,"marcos, duterte, walang pinag-iba! parehong tu..."
2,Malinaw na malinaw na malinaw na pinapatagal l...,2,malinaw na malinaw na malinaw na pinapatagal l...
3,"MARCOS, HINDI MAGIGING BAYANI KAILANMAN! Basa...",1,"marcos, hindi magiging bayani kailanman! basa..."
4,#NeverAgainToMartialLaw resist the rehabilitat...,2,#neveragaintomartiallaw resist the rehabilitat...


# Removal of Punctiations and URLs

In [5]:
def depure_data(df):

    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www.\S+')
    df = url_pattern.sub(r'', df)

    # Remove Emails
    df = re.sub('\S@\S\s?', '', df)

    # Remove new line characters
    df = re.sub('\s+', ' ', df)

    # Remove distracting single quotes
    df = re.sub("'", "", df)
    
    #Remove @ tags (mentions)
    df = re.sub(r'@[A-Za-z0-9]+','',df)

    #Remove hashtags
    #df = re.sub("[^a-zA-Z]", " ", df)

    #Remove non-alphanumeric characters
    df = re.sub("[^a-z0-9]"," ", df)

    return df

df['tweet1'] = df['tweet'].apply(lambda x: depure_data(x))
df

Unnamed: 0,tweets,score,tweet,tweet1
0,"Uy, gagi! 'Wag kang magpaniwala sa haka-hakang...",1,"uy, gagi! 'wag kang magpaniwala sa haka-hakang...",uy gagi wag kang magpaniwala sa haka hakang ...
1,"Marcos, Duterte, walang pinag-iba! Parehong tu...",2,"marcos, duterte, walang pinag-iba! parehong tu...",marcos duterte walang pinag iba parehong tu...
2,Malinaw na malinaw na malinaw na pinapatagal l...,2,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...
3,"MARCOS, HINDI MAGIGING BAYANI KAILANMAN! Basa...",1,"marcos, hindi magiging bayani kailanman! basa...",marcos hindi magiging bayani kailanman basah...
4,#NeverAgainToMartialLaw resist the rehabilitat...,2,#neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...
...,...,...,...,...
2355,BoBong Marcos can't even admit the fault of fa...,1,bobong marcos can't even admit the fault of fa...,bobong marcos cant even admit the fault of fam...
2356,ginawang sofa yung presidency stomoyern,1,ginawang sofa yung presidency stomoyern,ginawang sofa yung presidency stomoyern
2357,parang di presidency ang pinaguusapan pLEEK 🥴,1,parang di presidency ang pinaguusapan pleek 🥴,parang di presidency ang pinaguusapan pleek
2358,Presidency: Occupation is the essence of terro...,1,presidency: occupation is the essence of terro...,presidency occupation is the essence of terro...


In [6]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

df['text'] = df['tweet1'].apply(lambda x: remove_punct(x))

df.head(20)

Unnamed: 0,tweets,score,tweet,tweet1,text
0,"Uy, gagi! 'Wag kang magpaniwala sa haka-hakang...",1,"uy, gagi! 'wag kang magpaniwala sa haka-hakang...",uy gagi wag kang magpaniwala sa haka hakang ...,uy gagi wag kang magpaniwala sa haka hakang ...
1,"Marcos, Duterte, walang pinag-iba! Parehong tu...",2,"marcos, duterte, walang pinag-iba! parehong tu...",marcos duterte walang pinag iba parehong tu...,marcos duterte walang pinag iba parehong tu...
2,Malinaw na malinaw na malinaw na pinapatagal l...,2,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...
3,"MARCOS, HINDI MAGIGING BAYANI KAILANMAN! Basa...",1,"marcos, hindi magiging bayani kailanman! basa...",marcos hindi magiging bayani kailanman basah...,marcos hindi magiging bayani kailanman basah...
4,#NeverAgainToMartialLaw resist the rehabilitat...,2,#neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...
5,As we remember the 49th anniversary of Martial...,1,as we remember the 49th anniversary of martial...,as we remember the 49th anniversary of martial...,as we remember the 49th anniversary of martial...
6,#NeverAgain #NoToHistoricalRevisionism #EndImp...,1,#neveragain #notohistoricalrevisionism #endimp...,neveragain notohistoricalrevisionism endimp...,neveragain notohistoricalrevisionism endimp...
7,"DUTERTE, MARCOS, WALANG PINAG-IBA, PAREHONG TU...",2,"duterte, marcos, walang pinag-iba, parehong tu...",duterte marcos walang pinag iba parehong tu...,duterte marcos walang pinag iba parehong tu...
8,#NeverAgain sa Batas Militar at #MarcosHindiBa...,1,#neveragain sa batas militar at #marcoshindiba...,neveragain sa batas militar at marcoshindiba...,neveragain sa batas militar at marcoshindiba...
9,"Uy, gagi! 'Wag kang magpaniwala sa haka-hakang...",2,"uy, gagi! 'wag kang magpaniwala sa haka-hakang...",uy gagi wag kang magpaniwala sa haka hakang ...,uy gagi wag kang magpaniwala sa haka hakang ...


# Tokenization

In [7]:
# Define a function to split our sentences into a list of words
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df['tweet_Tokenized'] = df['text'].apply(lambda x: tokenize(x.lower()))

df.head()

Unnamed: 0,tweets,score,tweet,tweet1,text,tweet_Tokenized
0,"Uy, gagi! 'Wag kang magpaniwala sa haka-hakang...",1,"uy, gagi! 'wag kang magpaniwala sa haka-hakang...",uy gagi wag kang magpaniwala sa haka hakang ...,uy gagi wag kang magpaniwala sa haka hakang ...,"[uy, gagi, wag, kang, magpaniwala, sa, haka, h..."
1,"Marcos, Duterte, walang pinag-iba! Parehong tu...",2,"marcos, duterte, walang pinag-iba! parehong tu...",marcos duterte walang pinag iba parehong tu...,marcos duterte walang pinag iba parehong tu...,"[marcos, duterte, walang, pinag, iba, parehong..."
2,Malinaw na malinaw na malinaw na pinapatagal l...,2,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...,"[malinaw, na, malinaw, na, malinaw, na, pinapa..."
3,"MARCOS, HINDI MAGIGING BAYANI KAILANMAN! Basa...",1,"marcos, hindi magiging bayani kailanman! basa...",marcos hindi magiging bayani kailanman basah...,marcos hindi magiging bayani kailanman basah...,"[marcos, hindi, magiging, bayani, kailanman, b..."
4,#NeverAgainToMartialLaw resist the rehabilitat...,2,#neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...,"[, neveragaintomartiallaw, resist, the, rehabi..."


# Stopwords Removal

In [8]:
# Define a function to remove all stopwords
def remove_stopwords(tokenized_text):    
    text = [word for word in tokenized_text if word not in stopwords]
    return text

df['cleanedTweet'] = df['tweet_Tokenized'].apply(lambda x: remove_stopwords(x))

df.head()

Unnamed: 0,tweets,score,tweet,tweet1,text,tweet_Tokenized,cleanedTweet
0,"Uy, gagi! 'Wag kang magpaniwala sa haka-hakang...",1,"uy, gagi! 'wag kang magpaniwala sa haka-hakang...",uy gagi wag kang magpaniwala sa haka hakang ...,uy gagi wag kang magpaniwala sa haka hakang ...,"[uy, gagi, wag, kang, magpaniwala, sa, haka, h...","[uy, gagi, wag, kang, magpaniwala, sa, haka, h..."
1,"Marcos, Duterte, walang pinag-iba! Parehong tu...",2,"marcos, duterte, walang pinag-iba! parehong tu...",marcos duterte walang pinag iba parehong tu...,marcos duterte walang pinag iba parehong tu...,"[marcos, duterte, walang, pinag, iba, parehong...","[marcos, duterte, walang, pinag, iba, parehong..."
2,Malinaw na malinaw na malinaw na pinapatagal l...,2,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...,"[malinaw, na, malinaw, na, malinaw, na, pinapa...","[malinaw, na, malinaw, na, malinaw, na, pinapa..."
3,"MARCOS, HINDI MAGIGING BAYANI KAILANMAN! Basa...",1,"marcos, hindi magiging bayani kailanman! basa...",marcos hindi magiging bayani kailanman basah...,marcos hindi magiging bayani kailanman basah...,"[marcos, hindi, magiging, bayani, kailanman, b...","[marcos, hindi, magiging, bayani, kailanman, b..."
4,#NeverAgainToMartialLaw resist the rehabilitat...,2,#neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...,"[, neveragaintomartiallaw, resist, the, rehabi...","[, neveragaintomartiallaw, resist, rehabilitat..."


In [9]:
def remove_tagalog_stopwords(tokenized_text):    
    text = [word for word in tokenized_text if word not in stopwordsT]
    return text

df['cleanedTweet2'] = df['cleanedTweet'].apply(lambda x: remove_tagalog_stopwords(x))

df.head()

Unnamed: 0,tweets,score,tweet,tweet1,text,tweet_Tokenized,cleanedTweet,cleanedTweet2
0,"Uy, gagi! 'Wag kang magpaniwala sa haka-hakang...",1,"uy, gagi! 'wag kang magpaniwala sa haka-hakang...",uy gagi wag kang magpaniwala sa haka hakang ...,uy gagi wag kang magpaniwala sa haka hakang ...,"[uy, gagi, wag, kang, magpaniwala, sa, haka, h...","[uy, gagi, wag, kang, magpaniwala, sa, haka, h...","[uy, gagi, wag, kang, magpaniwala, haka, hakan..."
1,"Marcos, Duterte, walang pinag-iba! Parehong tu...",2,"marcos, duterte, walang pinag-iba! parehong tu...",marcos duterte walang pinag iba parehong tu...,marcos duterte walang pinag iba parehong tu...,"[marcos, duterte, walang, pinag, iba, parehong...","[marcos, duterte, walang, pinag, iba, parehong...","[marcos, duterte, pinag, parehong, tuta, dikta..."
2,Malinaw na malinaw na malinaw na pinapatagal l...,2,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...,malinaw na malinaw na malinaw na pinapatagal l...,"[malinaw, na, malinaw, na, malinaw, na, pinapa...","[malinaw, na, malinaw, na, malinaw, na, pinapa...","[malinaw, malinaw, malinaw, pinapatagal, pande..."
3,"MARCOS, HINDI MAGIGING BAYANI KAILANMAN! Basa...",1,"marcos, hindi magiging bayani kailanman! basa...",marcos hindi magiging bayani kailanman basah...,marcos hindi magiging bayani kailanman basah...,"[marcos, hindi, magiging, bayani, kailanman, b...","[marcos, hindi, magiging, bayani, kailanman, b...","[marcos, magiging, bayani, basahin, buong, pah..."
4,#NeverAgainToMartialLaw resist the rehabilitat...,2,#neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...,neveragaintomartiallaw resist the rehabilitat...,"[, neveragaintomartiallaw, resist, the, rehabi...","[, neveragaintomartiallaw, resist, rehabilitat...","[, neveragaintomartiallaw, resist, rehabilitat..."


In [10]:
clean_df = df.drop(['tweet','tweet_Tokenized','cleanedTweet','tweets','text','tweets','tweet1'], axis=1)
clean_df

Unnamed: 0,score,cleanedTweet2
0,1,"[uy, gagi, wag, kang, magpaniwala, haka, hakan..."
1,2,"[marcos, duterte, pinag, parehong, tuta, dikta..."
2,2,"[malinaw, malinaw, malinaw, pinapatagal, pande..."
3,1,"[marcos, magiging, bayani, basahin, buong, pah..."
4,2,"[, neveragaintomartiallaw, resist, rehabilitat..."
...,...,...
2355,1,"[bobong, marcos, cant, even, admit, fault, fam..."
2356,1,"[sofa, yung, presidency, stomoyern]"
2357,1,"[parang, di, presidency, pinaguusapan, pleek, ]"
2358,1,"[presidency, occupation, essence, terrorism, ]"


In [11]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360 entries, 0 to 2359
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   score          2360 non-null   object
 1   cleanedTweet2  2360 non-null   object
dtypes: object(2)
memory usage: 37.0+ KB


In [12]:
#clean_df.to_csv(r"Preprocessed tagalog.csv", index = False)

# Lemmatization

In [13]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [15]:
lemmatizer = WordNetLemmatizer()

lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in clean_df['cleanedTweet2'][2359]])
print(lemmatized_output)

mo respetuhin yung choice mo iboboto mong candidate presidency respetuhin yung personal choice nag share nang ibang candidate without supporting evidence san yung respect san yung respect mine respect 


In [16]:
def lemmatize(s):
     s = [lemmatizer.lemmatize(word) for word in s]
     return s

df5 = clean_df.assign(col_lemma = clean_df['cleanedTweet2'].apply(lambda x: lemmatize(x)))
df5

Unnamed: 0,score,cleanedTweet2,col_lemma
0,1,"[uy, gagi, wag, kang, magpaniwala, haka, hakan...","[uy, gagi, wag, kang, magpaniwala, haka, hakan..."
1,2,"[marcos, duterte, pinag, parehong, tuta, dikta...","[marcos, duterte, pinag, parehong, tuta, dikta..."
2,2,"[malinaw, malinaw, malinaw, pinapatagal, pande...","[malinaw, malinaw, malinaw, pinapatagal, pande..."
3,1,"[marcos, magiging, bayani, basahin, buong, pah...","[marcos, magiging, bayani, basahin, buong, pah..."
4,2,"[, neveragaintomartiallaw, resist, rehabilitat...","[, neveragaintomartiallaw, resist, rehabilitat..."
...,...,...,...
2355,1,"[bobong, marcos, cant, even, admit, fault, fam...","[bobong, marcos, cant, even, admit, fault, fam..."
2356,1,"[sofa, yung, presidency, stomoyern]","[sofa, yung, presidency, stomoyern]"
2357,1,"[parang, di, presidency, pinaguusapan, pleek, ]","[parang, di, presidency, pinaguusapan, pleek, ]"
2358,1,"[presidency, occupation, essence, terrorism, ]","[presidency, occupation, essence, terrorism, ]"


In [17]:
df6 = df5.drop(['cleanedTweet2'], axis=1)
df6

Unnamed: 0,score,col_lemma
0,1,"[uy, gagi, wag, kang, magpaniwala, haka, hakan..."
1,2,"[marcos, duterte, pinag, parehong, tuta, dikta..."
2,2,"[malinaw, malinaw, malinaw, pinapatagal, pande..."
3,1,"[marcos, magiging, bayani, basahin, buong, pah..."
4,2,"[, neveragaintomartiallaw, resist, rehabilitat..."
...,...,...
2355,1,"[bobong, marcos, cant, even, admit, fault, fam..."
2356,1,"[sofa, yung, presidency, stomoyern]"
2357,1,"[parang, di, presidency, pinaguusapan, pleek, ]"
2358,1,"[presidency, occupation, essence, terrorism, ]"


In [18]:
df6.to_csv(r"Preprocessed tagalog.csv", index = False)