# <em><u>Sentiment Analyzer - Milestone 1</u></em>

## Import necessary libraries

In [20]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on; packages that are already
# installed are reported as up-to-date and left alone.
#   - 'punkt': tokenizer models used by word_tokenize
#   - 'stopwords': corpus read later through stopwords.words(...)
#   - 'averaged_perceptron_tagger' / 'tagsets': not used by the cells in this
#     notebook; kept so the installed resource set is unchanged
# NOTE(review): confirm that the installed stopwords corpus actually provides a
# 'swahili' word list - that cannot be verified from this notebook alone.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Load our dataset of labeled reviews

In [13]:
# Read the labeled Swahili reviews from the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a row
# index that was saved into the CSV - confirm whether later steps need it.
df = pd.read_csv(filepath_or_buffer='swahili.csv')

# Preview the first rows to sanity-check the columns
df.head(n=20)

Unnamed: 0,text,labels
0,kwa bahati mbaya fadhila yoyote katika kazi ya...,negative
1,huwa na sauti ya juu zaidi kuliko msemaji mwin...,positive
2,iligundua kwamba ni rahisi kutengeneza na kutu...,positive
3,ipurkated hii kwa ajili ya msimamizi wa gari n...,negative
4,sinema nzuri kwelikweli juu ya upendo mkubwa u...,positive
5,mwandiko huo ni wa kipumbavu sana,negative
6,kwa kushangaza mimi hupata sinema zake kupotez...,negative
7,Sasa unajua ni kwa nini niliitoa 10,positive
8,Sehemu ya pili ya spidrocock iliboresha sana m...,positive
9,kufanikiwa kwa filamu hiyo kwategemea kubebwa ...,positive


## Data Preprocessing

### 1. Remove URLs

In [14]:
# Strip URLs from a piece of text
def remove_urls(text):
    """Return ``text`` with every ``http(s)://`` or ``www.`` link deleted.

    A link is matched up to the next whitespace character. The whitespace
    around a removed link is left untouched, so a link in the middle of a
    sentence leaves two adjacent spaces behind.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Run URL removal over every review in the 'text' column
df['text'] = df['text'].map(remove_urls)

# Show 10 randomly chosen rows as a spot check
print(df['text'].sample(n=10))

3207    Ningeupa mfululizo huu wa televisheni mfululiz...
3902    simu pia zinaweza kupiga picha kubwa na hata v...
940     jambo lenye kuvunja moyo sana ni kwamba hakuku...
1931                        filamu fupi iliyo bora kabisa
773                   Huduma za wateja zilizo mbaya zaidi
1822            hufanya kila kitu kilichosemwa kingefanya
897              Bila shaka ningependekeza mabawa na piza
1255    ukubwa wa kiwambo ni wa sehemu ya juu iliyo pa...
549                             Uzoea wa kwenda hapa tena
2352           kwa hivyo hadithi nzima haina nguvu fulani
Name: text, dtype: object


### 2. Remove special characters

In [15]:
# Remove special characters
def remove_special_characters(text):
    """Delete every character that is not a letter, digit or whitespace.

    Letters and digits are matched Unicode-aware, so accented or non-Latin
    alphanumeric characters are kept rather than being cut out of the middle
    of a word. Punctuation, symbols and underscores are removed without
    inserting a replacement, so "9/10" becomes "910".

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # For str patterns \w is Unicode-aware, but it also matches '_', which is
    # not alphanumeric - hence the explicit alternative for underscores.
    pattern = r'[^\w\s]|_'
    return re.sub(pattern, '', text)

# Run special-character removal over every review in the 'text' column
df['text'] = df['text'].map(remove_special_characters)

# Show 10 randomly chosen rows as a spot check
print(df['text'].sample(n=10))

2590                                  Wala hatarejea tena
1726                       lililokuwa jambo la kuchekesha
2932                                  sinema hii ni mbaya
3618       ililazimishwa kama kila kitu kwenye sinema hii
2517    hujui kamwe ikiwa uliisukuma kwa nguvu vya kut...
2527                                          Simu baridi
991                                    Sipsi yenye kasoro
1474                                      furaha ilitanda
234     sinema hiyo ilikufanya uwe kwenye ukingo wa ki...
3005    sana ujinga katika kila nyanja kuwa inakuwa fu...
Name: text, dtype: object


### 3. Convert to lowercase

In [16]:
# Normalize casing: lower-case every review in the 'text' column
df['text'] = df['text'].str.lower()

# Show 10 randomly chosen rows as a spot check
print(df['text'].sample(n=10))

2416    inapendeza na inasisimua sana kutazama na nina...
3184                   kwa ujumla nilivutiwa sana na noca
1732                            hakika huo utafungiwa nao
3179    matokeo yake ni filamu ambayo haionekani kuwa ...
421     hii ni haki kabisa kwa filamu inayowasilisha k...
1679         ndiyo mngao wake upande wa mbele na kuupenda
453     sikuzote wafanyakazi ni wenye urafiki na wenye...
953     ikiwa wakati wowote kulikuwa na wonyesho wa mw...
2412    lakini nilifikiria uigizaji wake ulikuwa na us...
760     mandhari nzuri sana na yenye kuburudisha ya ku...
Name: text, dtype: object


### 4. Remove stopwords

In [21]:
# Load the Kiswahili stop-word list from the NLTK stopwords corpus
swahili_stopwords = stopwords.words('swahili')
# Set copy of the list so each membership test is a hash lookup instead of a
# scan over the whole list. It is built once here; rebuild it if
# swahili_stopwords is modified afterwards.
_swahili_stopword_set = frozenset(swahili_stopwords)

# Define a function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` with Kiswahili stop words removed.

    The text is split with ``word_tokenize``; tokens whose lower-cased form
    appears in the stop-word list are dropped and the remaining tokens are
    re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The kept tokens joined by single spaces (an empty string when no
        token is kept).
    """
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in _swahili_stopword_set
    ]
    return ' '.join(kept_tokens)

# Filter stop words out of every review in the 'text' column
df['text'] = df['text'].map(remove_stopwords)

# Show 10 randomly chosen rows as a spot check
print(df['text'].sample(n=10))

1184                                nilifurahia kula hapa
858     kushangaza nyingi huona sinema kupoteza kabisa...
3535    imefikiriwa kirukanjia akili nyingi tangu amek...
816     ongeza betty white jean smart una waigizaji wa...
2606    kule kubadilika badilika hisia mwenzi mtu kati...
2082                                              itizame
995       duris ana muonekano mzuri anatoa utendaji mzuri
1570                                          bidhaa duni
2594                         simu huchukua saa 2 5 halisi
1599                         napiga hesabu sinema hii 910
Name: text, dtype: object


## Save the cleaned data to use in Milestone 2

In [24]:
# Persist the preprocessed reviews; index=False keeps the row index out of the file
df.to_csv(path_or_buf='cleaned_data.csv', index=False)

# <em><u>THE END</u></em> 