# <em><u>Sentiment Analyzer</u></em>

## Import the necessary libraries 

In [41]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources used by this notebook. Each package is requested
# once; nltk.download() reports packages that are already up to date.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
# Corpus read by stopwords.words(...) in the stopword-removal step.
# NOTE(review): confirm the installed stopwords corpus actually ships a
# 'swahili' word list; otherwise that file has to be provided separately.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Load our dataset 

In [36]:
# The first CSV column has no header and only holds the saved row index.
# Reading it as the index keeps it from appearing as a spurious
# "Unnamed: 0" data column next to 'text'.
df = pd.read_csv('swh_sentences.csv', index_col=0)

df.head(10)

Unnamed: 0,text
0,Ninakupenda.
1,Nini?
2,"Watu wote wamezaliwa huru, hadhi na haki zao n..."
3,Hutusahau!
4,Sijui.
5,Swali nzuri.
6,Unakumbuka?
7,Kwenda kwa teksi hotelini.
8,Usisahau kupeleka barua
9,Nitarudi.


## Data Preprocessing

1. Remove URLs

In [37]:
#Remove the urls
def remove_urls(text):
    """Return `text` with every http://, https:// or www. link removed.

    A link runs from its prefix up to the next whitespace character; the
    surrounding whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Strip links from every sentence in the 'text' column.
df['text'] = df['text'].map(remove_urls)

# Spot-check ten randomly drawn rows.
print(df['text'].sample(10))

2457    Lugha ya Kilatini haijanipotelea kabisa kwangu...
2778                               Hiyo haitakuwa rahisi.
1924    Alikuwa akitafuta funguo kwenye mfuko wake wal...
5718                                           Uirudishe.
2548                                    Sina malalamishi.
6905    Maisha hayayaweki vya kutosha ili yeyote ajue ...
8000    ukitafuta chakula halisi cha thai nenda mahali...
5788    nikiwa mgonjwa na kichwa changu kianzapo kulia...
5721                                            Bei nzuri
3416                                  Nitarekebisha hili.
Name: text, dtype: object


2. Remove special characters

In [38]:
#Remove special characters
def remove_special_characters(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Removed characters are deleted outright (not replaced by a space), so
    punctuation inside a word joins its two halves together.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

# Clean every sentence in the 'text' column of punctuation and other symbols.
df['text'] = df['text'].map(remove_special_characters)

# Spot-check ten randomly drawn rows.
print(df['text'].sample(10))

5626                           kaseti za kupitisha habari
1276                Mvule alikuwa na wasiwasi ya baiskeli
4819                                    mic haifanyi kazi
5630                                        Bei kubwa pia
5197    kitaalam filamu hiyo imetengenezwa vizuri na k...
526     Je Ni mwezi upi unaopenda zaidi kwa mwaka na k...
5912                          Sehemu maridadi ya takataka
174                                Kupigana kwa haki yako
270                                      Millie anampenda
6501    Kwa ujumla filamu hiyo inapendeza na inachoche...
Name: text, dtype: object


3. Convert to lower case

In [39]:
# Lower-case every sentence in the 'text' column, overwriting the column in place.
df['text'] = df['text'].str.lower()

#print out a sample
# sample(10) draws rows at random, so the rows shown differ between runs.
print(df['text'].sample(10))

4637                           lo filamu mbaya namna gani
1418     jaji alikuwa amechoka kutokana na tatizo la neva
6336                                      hunifanyia kazi
8320       migomo ya bellagio haikuwa kama nilivyotazamia
8123    mimi hufanya kazi katika biashara ya ukaribish...
3223    uchunguzi wa mwili wa wafu wa dan uligundua ma...
5515                    imepata bidhaa hii kuwa kubwa mno
1587                            malkia wa wachawi ameuawa
7509    kwa kweli ni kama karamu ya kimapenzi kinywani...
7614     muziki wa kawaida katika siku za mapigano makali
Name: text, dtype: object


4. Tokenize the text data

In [51]:
# Tokenize the text data
tokenizer = nltk.tokenize.TreebankWordTokenizer()

# Build the whole 'tokens' column in a single assignment. Writing one list per
# cell with df.at inside an iterrows() loop is slow and can fail on a fresh
# kernel, where the 'tokens' column does not exist yet and pandas has no
# object-typed column to store the list in.
df['tokens'] = df['text'].apply(tokenizer.tokenize)

# Spot-check ten randomly drawn rows.
print(df['tokens'].sample(10))

2934                   [mjomba, wangu, ni, dereva, mbovu]
2450         [ni, lazima, tuzuie, pigo, hili, ni, hatari]
5913    [uzi, wa, kuunganisha, vitu, ulikuwa, na, ukub...
1004    [usimimine, maji, ya, moto, kwenye, glasi, au,...
3815                [kitabu, hiki, ni, mzee, kiasi, gani]
6215    [bei, nzuri, nilinunua, hii, baada, ya, mimi, ...
2537    [michael, jackson, alikuwa, mwimbaji, mashuhur...
6377    [hata, vibonyezo, viweje, sana, hivi, kwamba, ...
7429                                          [hangerudi]
3992     [mbao, inapaswa, kushughulikiwa, kwa, uangalifu]
Name: tokens, dtype: object


5. Remove stopwords

In [50]:
# Remove stopwords
_STOPWORD_CACHE = {}


def _cached_stopwords(language):
    """Load the NLTK stopword list for `language` once and reuse it afterwards."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(tokens, stopwords_list=None):
    """Return the tokens that are not stopwords, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.
    stopwords_list : iterable of str, optional
        Words to filter out. When omitted, the NLTK 'swahili' stopword list
        is used; it is read from the corpus only on the first call instead
        of once per row.

    Returns
    -------
    list of str
        Tokens whose lower-cased form is not in the stopword collection.
    """
    if stopwords_list is None:
        stop_set = _cached_stopwords('swahili')
    else:
        stop_set = frozenset(stopwords_list)
    # Set membership replaces the linear scan over a list for every token.
    return [token for token in tokens if token.lower() not in stop_set]

# Filter the stopwords out of every token list, overwriting the 'tokens' column.
df['tokens'] = df['tokens'].map(remove_stopwords)

# Spot-check ten randomly drawn rows.
print(df['tokens'].sample(10))

5300                                       [ubora, nzuri]
7076    [helen, baxendendale, bibi, anayeaminika, macb...
811     [uwezekano, mbadala, ulikuwa, upinzani, kutoroka]
1634    [wapo, uhusiano, wapo, hali, ambayo, rahisi, k...
6899    [hatimaye, hayo, yote, twafikia, mwisho, ambao...
6016                                          [hii, simu]
5157    [nilifurahi, kumuona, inspekta, mwala, akitubu...
8274       [liliingia, saa, yenye, furaha, orodha, divai]
3411                     [nchi, bandia, nguvu, magharibi]
5773          [chaguo, kikuza, taa, hizo, zenye, kuvutia]
Name: tokens, dtype: object
