In [1]:
import numpy as np
import pandas as pd 
import string as st
import re
import nltk
nltk.download('omw-1.4')
import sklearn
from nltk import PorterStemmer, WordNetLemmatizer
import googletrans
from googletrans import Translator

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Read Raw Dataset
df = pd.read_csv('patriotic_dataset.csv',encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,ï»¿Tittle,Lyrics
0,1988 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...
1,1989 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...
2,1990 Berjaya,MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...
3,1991 Wawasan 2020,BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...
4,1993 Bersatu Menuju Wawasan,MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...


In [4]:
df.shape

(64, 2)

In [5]:
df.isnull().sum()

ï»¿Tittle    0
Lyrics       0
dtype: int64

In [6]:
df.nunique()

ï»¿Tittle    62
Lyrics       54
dtype: int64

### Translate dataset

In [7]:
headers = ['Title','Lyrics','Title_en','Lyrics_en']
data = pd.read_csv('patriotic_dataset.csv')
translator = Translator()

df = pd.DataFrame(index=range(0,len(data)), columns=headers)

In [8]:
def translate_row(row):
    ''' Translate elements A and B within `row`. '''
    a = translator.translate(row[0], dest='en')
    b = translator.translate(row[1], dest='en')
    return pd.Series([a.origin, b.origin, a.text, b.text], headers)

In [9]:
for i, row in enumerate(data.values):
    # Fill empty dataframe with given serie.
    df.loc[i] = translate_row(row)

print(df)

                          Title  \
0                  1988 Bersatu   
1                  1989 Bersatu   
2                  1990 Berjaya   
3             1991 Wawasan 2020   
4   1993 Bersatu Menuju Wawasan   
..                          ...   
59              Bahtera Merdeka   
60            Malaysia Tercinta   
61                 Aku Negaraku   
62                Gemuruh suara   
63                      Warisan   

                                               Lyrics  \
0   BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...   
1   BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...   
2   MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...   
3   BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...   
4   MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...   
..                                                ...   
59  Bonda senyum riang Menerima bahtera Merdeka Pu...   
60  Tanah air tercinta Ibu Pertiwi warisan bangsa ...   
61  Ku sedar Tanah air aku bertuah Di sini tempatk...   
62  Kalau

## Text Cleaning and Processing
- Remove punctuations
- Tokenization
- Stop Words removal 
- Lemmetization
- TF*IDF

In [10]:
def remove_punct(text):
    return("".join([ch for ch in text if ch not in st.punctuation]))

In [11]:
df['removed_punct'] = df['Lyrics_en'].apply(lambda x: remove_punct(x))
df.head()

Unnamed: 0,Title,Lyrics,Title_en,Lyrics_en,removed_punct
0,1988 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1988 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...
1,1989 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1989 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...
2,1990 Berjaya,MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...,1990 successfully,Malaysia We have succeeded in prosperous Bahag...,Malaysia We have succeeded in prosperous Bahag...
3,1991 Wawasan 2020,BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...,1991 Insight 2020.,Flags fluttered in Jayarakyat State of Jayarak...,Flags fluttered in Jayarakyat State of Jayarak...
4,1993 Bersatu Menuju Wawasan,MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...,1993 unite towards insight,Meanwise a vision of building a glorious cultu...,Meanwise a vision of building a glorious cultu...


In [12]:
def tokenize(text):
    text = re.split('\s+',text)
    return[x.lower() for x in text]

In [13]:
df['tokens'] = df['removed_punct'].apply(lambda msg : tokenize(msg))
df.head()

Unnamed: 0,Title,Lyrics,Title_en,Lyrics_en,removed_punct,tokens
0,1988 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1988 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,..."
1,1989 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1989 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,..."
2,1990 Berjaya,MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...,1990 successfully,Malaysia We have succeeded in prosperous Bahag...,Malaysia We have succeeded in prosperous Bahag...,"[malaysia, we, have, succeeded, in, prosperous..."
3,1991 Wawasan 2020,BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...,1991 Insight 2020.,Flags fluttered in Jayarakyat State of Jayarak...,Flags fluttered in Jayarakyat State of Jayarak...,"[flags, fluttered, in, jayarakyat, state, of, ..."
4,1993 Bersatu Menuju Wawasan,MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...,1993 unite towards insight,Meanwise a vision of building a glorious cultu...,Meanwise a vision of building a glorious cultu...,"[meanwise, a, vision, of, building, a, gloriou..."


In [14]:
#remove tokens of less than 3
def remove_small_words(text):
    return [ x for x in text if len(x) > 3]

In [15]:
df['larger_tokens'] = df['tokens'].apply(lambda x : remove_small_words(x))
df.head()

Unnamed: 0,Title,Lyrics,Title_en,Lyrics_en,removed_punct,tokens,larger_tokens
0,1988 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1988 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,..."
1,1989 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1989 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,..."
2,1990 Berjaya,MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...,1990 successfully,Malaysia We have succeeded in prosperous Bahag...,Malaysia We have succeeded in prosperous Bahag...,"[malaysia, we, have, succeeded, in, prosperous...","[malaysia, have, succeeded, prosperous, bahaga..."
3,1991 Wawasan 2020,BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...,1991 Insight 2020.,Flags fluttered in Jayarakyat State of Jayarak...,Flags fluttered in Jayarakyat State of Jayarak...,"[flags, fluttered, in, jayarakyat, state, of, ...","[flags, fluttered, jayarakyat, state, jayaraky..."
4,1993 Bersatu Menuju Wawasan,MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...,1993 unite towards insight,Meanwise a vision of building a glorious cultu...,Meanwise a vision of building a glorious cultu...,"[meanwise, a, vision, of, building, a, gloriou...","[meanwise, vision, building, glorious, culture..."


In [16]:
#Removing stopwords
def remove_stopwords(text):
    return [word for word in text if word not in nltk.corpus.stopwords.words('english')]

In [17]:
df['clean_tokens'] = df['larger_tokens'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,Title,Lyrics,Title_en,Lyrics_en,removed_punct,tokens,larger_tokens,clean_tokens
0,1988 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1988 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,...","[united, unitedunite, enemy, country, defend, ..."
1,1989 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1989 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,...","[united, unitedunite, enemy, country, defend, ..."
2,1990 Berjaya,MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...,1990 successfully,Malaysia We have succeeded in prosperous Bahag...,Malaysia We have succeeded in prosperous Bahag...,"[malaysia, we, have, succeeded, in, prosperous...","[malaysia, have, succeeded, prosperous, bahaga...","[malaysia, succeeded, prosperous, bahagamalays..."
3,1991 Wawasan 2020,BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...,1991 Insight 2020.,Flags fluttered in Jayarakyat State of Jayarak...,Flags fluttered in Jayarakyat State of Jayarak...,"[flags, fluttered, in, jayarakyat, state, of, ...","[flags, fluttered, jayarakyat, state, jayaraky...","[flags, fluttered, jayarakyat, state, jayaraky..."
4,1993 Bersatu Menuju Wawasan,MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...,1993 unite towards insight,Meanwise a vision of building a glorious cultu...,Meanwise a vision of building a glorious cultu...,"[meanwise, a, vision, of, building, a, gloriou...","[meanwise, vision, building, glorious, culture...","[meanwise, vision, building, glorious, culture..."


In [18]:
# Apply lemmetization on tokens
def lemmatize (text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(word) for word in text]

In [19]:
df['lemma_words'] = df['clean_tokens'].apply(lambda x : lemmatize(x))
df.head()

Unnamed: 0,Title,Lyrics,Title_en,Lyrics_en,removed_punct,tokens,larger_tokens,clean_tokens,lemma_words
0,1988 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1988 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,...","[united, unitedunite, enemy, country, defend, ...","[united, unitedunite, enemy, country, defend, ..."
1,1989 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1989 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,...","[united, unitedunite, enemy, country, defend, ...","[united, unitedunite, enemy, country, defend, ..."
2,1990 Berjaya,MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...,1990 successfully,Malaysia We have succeeded in prosperous Bahag...,Malaysia We have succeeded in prosperous Bahag...,"[malaysia, we, have, succeeded, in, prosperous...","[malaysia, have, succeeded, prosperous, bahaga...","[malaysia, succeeded, prosperous, bahagamalays...","[malaysia, succeeded, prosperous, bahagamalays..."
3,1991 Wawasan 2020,BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...,1991 Insight 2020.,Flags fluttered in Jayarakyat State of Jayarak...,Flags fluttered in Jayarakyat State of Jayarak...,"[flags, fluttered, in, jayarakyat, state, of, ...","[flags, fluttered, jayarakyat, state, jayaraky...","[flags, fluttered, jayarakyat, state, jayaraky...","[flag, fluttered, jayarakyat, state, jayarakya..."
4,1993 Bersatu Menuju Wawasan,MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...,1993 unite towards insight,Meanwise a vision of building a glorious cultu...,Meanwise a vision of building a glorious cultu...,"[meanwise, a, vision, of, building, a, gloriou...","[meanwise, vision, building, glorious, culture...","[meanwise, vision, building, glorious, culture...","[meanwise, vision, building, glorious, culture..."


In [20]:
# create sentences to get clean text as input for vectors
def return_sentences(tokens):
    return " ".join([word for word in tokens])

In [21]:
df['clean_text'] = df ['lemma_words'].apply(lambda x: return_sentences(x))
df.head()

Unnamed: 0,Title,Lyrics,Title_en,Lyrics_en,removed_punct,tokens,larger_tokens,clean_tokens,lemma_words,clean_text
0,1988 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1988 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,...","[united, unitedunite, enemy, country, defend, ...","[united, unitedunite, enemy, country, defend, ...",united unitedunite enemy country defend dignit...
1,1989 Bersatu,BERSATU.BERSATU BERSATU. BERSATU MENENTANG MUS...,1989 united,United. Be united.Unite against the enemy of t...,United Be unitedUnite against the enemy of the...,"[united, be, unitedunite, against, the, enemy,...","[united, unitedunite, against, enemy, country,...","[united, unitedunite, enemy, country, defend, ...","[united, unitedunite, enemy, country, defend, ...",united unitedunite enemy country defend dignit...
2,1990 Berjaya,MALAYSIA KITA SUDAH BERJAYAAMAN MAKMUR BAHAGIA...,1990 successfully,Malaysia We have succeeded in prosperous Bahag...,Malaysia We have succeeded in prosperous Bahag...,"[malaysia, we, have, succeeded, in, prosperous...","[malaysia, have, succeeded, prosperous, bahaga...","[malaysia, succeeded, prosperous, bahagamalays...","[malaysia, succeeded, prosperous, bahagamalays...",malaysia succeeded prosperous bahagamalaysia e...
3,1991 Wawasan 2020,BENDERA BERKIBAR DI ANGKASALAMBANG NEGARA JAYA...,1991 Insight 2020.,Flags fluttered in Jayarakyat State of Jayarak...,Flags fluttered in Jayarakyat State of Jayarak...,"[flags, fluttered, in, jayarakyat, state, of, ...","[flags, fluttered, jayarakyat, state, jayaraky...","[flags, fluttered, jayarakyat, state, jayaraky...","[flag, fluttered, jayarakyat, state, jayarakya...",flag fluttered jayarakyat state jayarakyat sek...
4,1993 Bersatu Menuju Wawasan,MURNINYA SEBUAH WAWASAN MEMBINA BUDAYA YANG GE...,1993 unite towards insight,Meanwise a vision of building a glorious cultu...,Meanwise a vision of building a glorious cultu...,"[meanwise, a, vision, of, building, a, gloriou...","[meanwise, vision, building, glorious, culture...","[meanwise, vision, building, glorious, culture...","[meanwise, vision, building, glorious, culture...",meanwise vision building glorious culture deve...


In [22]:
df2 = pd.DataFrame(df,columns =['Title_en','Lyrics_en','clean_text'] )
print(df2)

                      Title_en  \
0                  1988 united   
1                  1989 united   
2            1990 successfully   
3           1991 Insight 2020.   
4   1993 unite towards insight   
..                         ...   
59                 Merdeka ark   
60         Malaysia is beloved   
61              I'm my country   
62                      Rumble   
63                      Legacy   

                                            Lyrics_en  \
0   United. Be united.Unite against the enemy of t...   
1   United. Be united.Unite against the enemy of t...   
2   Malaysia We have succeeded in prosperous Bahag...   
3   Flags fluttered in Jayarakyat State of Jayarak...   
4   Meanwise a vision of building a glorious cultu...   
..                                                ...   
59  Bonda Senyum Riang Accepted BANDERA MERDEKA PU...   
60  Mother's beloved homeland PERTIWI Heritage Neg...   
61  I am aware of my homeland I'm lucky here my pl...   
62  If you fall, do n

In [23]:
df2.to_csv('clean_data.csv',index=False)

In [25]:
# Convert lemmatized words to Tf-Idf feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_vect = tfidf.fit_transform(df['clean_text'])
tfidf_vect.shape

(64, 761)

In [26]:
tfidf.get_feature_names()[-5::]

['yellow', 'youmalaysia', 'youre', 'youth', 'zamba']

In [27]:
print(tfidf_vect)

  (0, 370)	0.12601252829942933
  (0, 644)	0.2025766753289876
  (0, 716)	0.133674220213958
  (0, 95)	0.12969588831891515
  (0, 654)	0.3765562479928605
  (0, 140)	0.3765562479928605
  (0, 389)	0.21656970165455786
  (0, 372)	0.2025766753289876
  (0, 635)	0.13799891319812457
  (0, 640)	0.2025766753289876
  (0, 707)	0.1604637558569441
  (0, 394)	0.3209275117138882
  (0, 54)	0.13799891319812457
  (0, 432)	0.09719403277315716
  (0, 155)	0.3765562479928605
  (0, 138)	0.16812544777147279
  (0, 121)	0.11636246663465842
  (0, 178)	0.17718730594230853
  (0, 706)	0.2025766753289876
  (0, 705)	0.25202505659885865
  (1, 370)	0.12601252829942933
  (1, 644)	0.2025766753289876
  (1, 716)	0.133674220213958
  (1, 95)	0.12969588831891515
  (1, 654)	0.3765562479928605
  :	:
  (63, 753)	0.16442749690647523
  (63, 546)	0.16442749690647523
  (63, 701)	0.16442749690647523
  (63, 249)	0.16442749690647523
  (63, 285)	0.16442749690647523
  (63, 679)	0.14955000507830984
  (63, 211)	0.14955000507830984
  (63, 317)	0