# <em><u>Sentiment Analyzer</u></em>

## Import the necessary libraries 

In [25]:
import re
import pandas as pd
import nltk
import joblib
import ipywidgets as widgets
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display

# Initialize the NLTK tokenizer for Kiswahili
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Load our dataset 

In [26]:
df = pd.read_csv('swahili.csv')

df.head(10)

Unnamed: 0,text,labels
0,team 2019merimera alikuwa takataka,negative
1,sijafurahishwa,negative
2,kubuni dosari,negative
3,bila kusema nilipoteza pesa zangu,negative
4,sema kupoteza pesa na wakati,negative
5,ubunifu ni isiyo ya kawaida sana kwani kipande...,negative
6,akili yako imeoza,negative
7,aki si maisha ni magumu,negative
8,enyewe safaricom ni wezi,negative
9,mtandao duni hata line yao niliweka nyuma ya s...,negative


## Data Preprocessing

1. Remove URLS

In [27]:
#Remove the urls
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

df['text'] = df['text'].apply(remove_urls)

#print out a sample
print(df['text'].sample(10))

3920    Nafikiri chakula chapasa kuwa na ladha na umbi...
191     mojawapo ya mambo yanayokatisha tamaa zaidi ni...
109     lakini wakati mtu anajitahidi kwa ukuu na usha...
3367    Hata hivyo mkahawa huu una kifungua - kinywa k...
447                                        krismasi njema
2918                 lilionekana kama hadithi nzuri ajabu
2090    Hakuna mtu atambulishaye herufi hizi kwa sabab...
793     sinema hii inasawazishwa vizuri na vichekesho ...
176     tukiacha ubaguzi wa rangi wacha tuangalie upun...
313                                         haiweki shati
Name: text, dtype: object


2. Remove Special Characters

In [28]:
#Remove special characters
def remove_special_characters(text):
    pattern = r'[^a-zA-Z0-9\s]' # keep only alphanumeric and whitespace characters
    text = re.sub(pattern, '', text)
    return text

df['text'] = df['text'].apply(remove_special_characters)

#print out a sample
print(df['text'].sample(10))

2786    Tokeo ni mafanikio ya ujenzi wa nguvu za umeme...
1824    vifaa hivyo viwili vilikuwa rahisi sana hivi k...
619     sinema hii ni maoni mazuri na ya kuchekesha ju...
1581    Hatimaye baada ya mara tatu au nne masika ya n...
3536    Nilichunguza mahali hapa miaka kadhaa iliyopit...
1976    Mbali na kuwa na moja ya nyimbo zenye kupendez...
3440    Chakula ni kizuri sana kwa chakula chako cha k...
2647          Ni kama sinema nzuri au nzuri isiyotabirika
1186    mwezi mmoja tu ulikuwa na kazi yake lakini bil...
2040    mandhari za majirani wa ohsovierl zilizokomaa ...
Name: text, dtype: object


3. Convert to lower case

In [29]:
#convert to lower case
df['text'] = df['text'].str.lower()

#print out a sample
print(df['text'].sample(10))

110     jambo moja zaidi naweza kuvumilia usahihi wa k...
470                                         shukrani sana
1150       hakika hii bila ya shaka ni haki ya kupambanua
1877                     zikapangwa vizuri pamoja na hilo
1622                   haikufanyi uonekane mwenye ubaridi
696     nadhani nilipenda maelezo ya dysfunction yake ...
3207                                i alingoja na kungoja
1511                                betri inashika vizuri
2576    ni ajabu kwamba hadithi hizo zimefumwa pamoja ...
2181                       lililokuwa jambo la kuchekesha
Name: text, dtype: object


4. Remove stopwords

In [30]:
# Download the stopwords for Kiswahili language
swahili_stopwords = stopwords.words('swahili')

# Define a function to remove stopwords
def remove_stopwords(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove the stopwords
    filtered_tokens = [word for word in tokens if word.lower() not in swahili_stopwords]
    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

# Apply the remove_stopwords function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(remove_stopwords)

print(df['text'].sample(10))

3379       tatu mwisho chakula mchana hapa zimekuwa mbaya
2499    mbali risasi mbaya filamu ina mizigo mingi vit...
3921                                  hamu kula ilitoweka
1253                                         kiruu amazon
3043                 namna ugumu kuharibu nyama walifanya
304                                             neno aibu
1904                                      si buku kutosha
1209               niliipata sababu ndogo yenye kupendeza
1895    imeiunganisha kujua si kitu kilichotengenezwa ...
933               kusema sababu natumia vibaya pesa zangu
Name: text, dtype: object


### Naive Bayes classifier using scikit-learn library

1. Here we train a model with our cleaned dataset
2. Then we get the accuracy of the model using a test dataset

In [31]:
# split the data into training and testing sets
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

# extract the features and labels from the training and testing data
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])
train_labels = train_data['labels']
test_labels = test_data['labels']

# train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(train_features, train_labels)

# evaluate the classifier on the testing data
predictions = classifier.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

# print the evaluation results
print('Accuracy: {:.2f}%'.format(accuracy*100))
print('Classification Report:\n', report)

Accuracy: 76.05%
Classification Report:
               precision    recall  f1-score   support

    negative       0.79      0.70      0.74       387
    positive       0.74      0.82      0.78       398

    accuracy                           0.76       785
   macro avg       0.76      0.76      0.76       785
weighted avg       0.76      0.76      0.76       785



### Deploy the model

Save the trained model to my local machine

In [34]:
joblib.dump(classifier, 'swahili_naive_bayes_model.joblib')

['swahili_naive_bayes_model.joblib']

#### Classify new text data

In [40]:
text = input("Enter your swahili text: ")
new_text = [text]
new_features = vectorizer.transform(new_text)
predicted_label = classifier.predict(new_features)
print(predicted_label)

Enter your swahili text: Mimi napenda huyo mwalimu
['positive']
