# <em><u>Sentiment Analyzer</u></em>

## Import the necessary libraries 

In [52]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on. Each call is a no-op when
# the package is already installed and up to date.
nltk.download('punkt')
# Corpus read by stopwords.words(...) in the stopword-removal step.
# NOTE(review): confirm the installed stopwords corpus actually ships a
# 'swahili' word list; if it was added by hand, keep that file in place.
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Load our dataset 

In [53]:
# Read the sentence corpus from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='swh_sentences.csv')

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,text
0,Ninakupenda.
1,Nini?
2,"Watu wote wamezaliwa huru, hadhi na haki zao n..."
3,Hutusahau!
4,Sijui.
5,Swali nzuri.
6,Unakumbuka?
7,Kwenda kwa teksi hotelini.
8,Usisahau kupeleka barua
9,Nitarudi.


## Data Preprocessing

1. Remove URLS

In [54]:
# Remove the URLs
def remove_urls(text):
    """Return ``text`` with every URL deleted.

    Anything that starts with ``http://``, ``https://`` or ``www.`` is removed
    up to (but not including) the next whitespace character, so the whitespace
    around a URL is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Strip URLs from every sentence, element by element.
df['text'] = df['text'].map(remove_urls)

# Show ten random sentences as a quick sanity check.
print(df['text'].sample(n=10))

314                                Je, ana kompyuta npya?
2000                            Ni kitabu cha kihispania.
1555                  Kawaida yeye hulala kwa masaa nane.
7023    katika umalizio wangu sitajishughulisha na sin...
7739    Vicheko pia bila shaka ni vipengele vibaya zai...
3621                              Mwishowe tutafika hapo.
1768                usipoteze hata nafaka moja la mchele!
6380                    Pia hufanya iwe rahisi kushikilia
703     Tulimchagua Jeffrey kama Kapteni wa kikosi chetu.
3737    Nilikuwa nikifikiria kuhusu Anne, alielezea. "...
Name: text, dtype: object


2. Remove special characters

In [55]:
# Remove special characters
def remove_special_characters(text):
    """Return ``text`` stripped of everything except ASCII letters, digits and whitespace.

    Removed characters are deleted outright (not replaced by a space), so
    punctuation inside a word joins the two halves together.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

# Drop punctuation and other non-alphanumeric symbols from every sentence.
df['text'] = df['text'].map(remove_special_characters)

# Show ten random sentences as a quick sanity check.
print(df['text'].sample(n=10))

7564    Orodha ya vyakula inabadili ubora wa chakula n...
204                                          Jiwe ni zito
7440                        Utumishi pia ni wenye kuvutia
2800                          Inaonekana kuwa rahisi sana
7713     unatayarisha samaki wapya wa ajabu kwa uangalifu
6205                      Ni rahisi kuitumia na kukipenda
7424            honeslty haikuwa na ladha nzuri kama hiyo
4071    Tafadhali unaweza kubadilisha chupa zote za po...
2363               Sijawahi kuambia mtu hilo hapo mbeleni
4897                sinunui mkate mjini kwa sababu haifai
Name: text, dtype: object


3. Convert to lower case

In [56]:
# Convert every sentence to lower case using the vectorised string accessor,
# overwriting the 'text' column in place.
df['text'] = df['text'].str.lower()

# Print ten randomly chosen sentences to check the result; the sample is not
# seeded, so the rows shown differ from run to run.
print(df['text'].sample(10))

4445    kadi yangu ya mikopo ilikataliwa na mashine ya...
2002            pedro ana pikipiki na anapenda kuiendesha
4220                    mpenzi wangu ni mweledi wa upishi
3738    je unaniambia kuwa ni kawaida kabisa kumdangan...
4196                    wakati mwingine pia mimi huogelea
7452             kwa ujumla mimi hupenda mahali hapa sana
5163    ikiwa unapenda kifo na kuoza na nyimbo za shak...
339                                   anafundisha kiarabu
4281                        anakosa kiatu chake cha kulia
8102                      nyama nzuri iliyopondwa  pondwa
Name: text, dtype: object


4. Tokenize the text data

In [57]:
# Tokenize the text data.
# NOTE(review): TreebankWordTokenizer follows Penn Treebank (English)
# conventions — confirm that is acceptable for Swahili text.
tokenizer = nltk.tokenize.TreebankWordTokenizer()

# Build the whole 'tokens' column in a single pass. This replaces a row-by-row
# iterrows() loop that wrote each list through df.at into a column that did
# not exist yet, which is slow, depends on the index being unique, and is
# fragile when the cell value is itself a list.
df['tokens'] = df['text'].apply(tokenizer.tokenize)

# Show ten random token lists as a quick sanity check.
print(df['tokens'].sample(10))

6346                       [na, ninapo, maliza, sijitoma]
4683    [filamu, inaonekana, ya, bei, rahisi, na, isiy...
5262                                 [ana, moyo, mkujufu]
3035    [mabibi, na, mabwana, sasa, tumewasili, kwenye...
7720    [utaratibu, wa, viazi, ulisikitisha, na, labda...
4330                [walichezea, kikosi, cha, shule, yao]
6489    [mara, nyingi, mazungumzo, hayafuati, mstari, ...
3924    [kuhusu, mshumaa, na, damu, na, maandishi, uku...
3492                   [je, niliguza, hali, ya, wasiwasi]
6122                                  [basi, jihadharini]
Name: tokens, dtype: object


5. Remove stopwords

In [58]:
# Remove stopwords
# Stopword sets that have already been loaded, keyed by language name, so the
# corpus is read once per language instead of once per row.
_STOPWORD_CACHE = {}


def remove_stopwords(tokens, language='swahili'):
    """Return the tokens that are not stopwords, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.
    language : str, optional
        Name passed to ``stopwords.words``; defaults to ``'swahili'``, which
        is what this function always used before the parameter existed.

    Returns
    -------
    list of str
        The tokens whose lower-cased form is not in the stopword list.
    """
    stopword_set = _STOPWORD_CACHE.get(language)
    if stopword_set is None:
        # A frozenset gives constant-time membership tests; the original
        # rebuilt a list on every call and scanned it linearly per token.
        stopword_set = frozenset(stopwords.words(language))
        _STOPWORD_CACHE[language] = stopword_set
    return [token for token in tokens if token.lower() not in stopword_set]

# Filter the stopwords out of every token list.
df['tokens'] = df['tokens'].map(remove_stopwords)

# Show ten random token lists as a quick sanity check.
print(df['tokens'].sample(n=10))

440                                          [si, muhimu]
650                       [tunajua, hadithi, robin, hood]
7480       [vijia, vyote, vimetengenezwa, mikono, vitamu]
2647                     [je, mlango, ulikuwa, umefungwa]
946               [jane, lazima, aache, tamaa, chokoleti]
1517          [mwanamume, ambaye, anaongea, bwana, allen]
5534    [nilifikia, simu, hii, mwishoni, mwa, hali, ku...
2106             [jiji, langu, hali, joto, nyuzi, sifuri]
4434    [alishambuliwa, genge, walimpiga, kuchukua, po...
707         [aliipowasili, london, alinitumia, telegramu]
Name: tokens, dtype: object


## 