# <em><u>Sentiment Analyzer - Milestone 2</u></em>

## Import necessary libraries

In [1]:
import re
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on (each call is a no-op when the
# package is already cached locally).
#   - 'punkt' backs word_tokenize()
#   - 'stopwords' backs stopwords.words(), which preprocess() calls
# NOTE(review): preprocess() asks for the 'swahili' stop-word list; confirm that the
# installed stopwords corpus actually provides that language.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Load our cleaned dataset

In [2]:
# Load the cleaned dataset. The CSV also carries a saved index column that pandas
# would expose as 'Unnamed: 0'; read only the two columns the model actually uses.
df = pd.read_csv('cleaned_data.csv', usecols=['text', 'labels'])

df.head()

Unnamed: 0,text,labels
0,bahati mbaya fadhila yoyote kazi utengenezaji ...,negative
1,huwa kuliko msemaji mwingine yeyote ukubwa huu...,positive
2,iligundua rahisi kutengeneza kutumia bidhaa,positive
3,ipurkated hii ajili msimamizi gari haifaulu,negative
4,sinema nzuri kwelikweli upendo usio masharti,positive


## Naive Bayes classifier using scikit-learn library

1. Here we train a model with our cleaned dataset
2. Then we get the accuracy of the model using a test dataset

In [3]:
# Split the data into training (80%) and testing (20%) sets.
# The fixed random_state makes the split reproducible. drop() removes rows by index
# label, so this relies on df having a unique index.
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

# Missing text values (NaN) must not be cast straight to strings: that would turn
# them into the literal word "nan", which would then be counted as a real token.
# Treat them as empty documents instead; the row count (and the split) is unchanged.
train_text = train_data['text'].fillna('').astype(str)
test_text = test_data['text'].fillna('').astype(str)

# Extract bag-of-words features. The vocabulary is learned from the training text
# only and then reused, unchanged, for the test text.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_text)
test_features = vectorizer.transform(test_text)
train_labels = train_data['labels']
test_labels = test_data['labels']

# Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(train_features, train_labels)

# Evaluate the classifier on the held-out testing data
predictions = classifier.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

# Print the evaluation results
print('Accuracy: {:.2f}%'.format(accuracy*100))
print('Classification Report:\n', report)

Accuracy: 75.80%
Classification Report:
               precision    recall  f1-score   support

    negative       0.75      0.75      0.75       382
    positive       0.76      0.77      0.77       403

    accuracy                           0.76       785
   macro avg       0.76      0.76      0.76       785
weighted avg       0.76      0.76      0.76       785



## Save the trained model and vectorizer as joblib files

1. We will use these joblib files to build a web interface

In [4]:
# Persist the fitted classifier to disk; the web interface uses this file to make predictions.
joblib.dump(classifier, 'swahili_naive_bayes.joblib')

['swahili_naive_bayes.joblib']

In [5]:
# Persist the fitted vectorizer as well: new text has to be transformed with this same
# vectorizer before it is passed to the classifier.
joblib.dump(vectorizer, 'swahili_vectorizer.joblib')

['swahili_vectorizer.joblib']

## Classify new text data

### Function that allows us to preprocess new text data

In [7]:
def preprocess(text, stop_words=None):
    """Clean raw text into a space-separated string of lowercase tokens.

    Steps, in order: drop digits, drop every character that is not an ASCII
    letter or whitespace, lowercase, tokenize with NLTK, remove stop words.

    Args:
        text (str): Raw input text.
        stop_words (iterable of str, optional): Lowercase words to discard.
            Defaults to NLTK's 'swahili' stop-word list, which must be
            available in the installed stopwords corpus.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        stop_words = stopwords.words('swahili')
    # A set gives constant-time membership tests instead of scanning a list per token.
    stop_words = set(stop_words)

    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Remove special characters. They are deleted, not replaced by a space, so
    # "a-b" becomes "ab".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    # The text is already lowercase, so tokens can be compared to the stop words directly.
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Join the filtered tokens back into a string
    return ' '.join(filtered_tokens)

# Manual check: read one line from the user and display its preprocessed form.
preprocess(input())

TULIENDA HUKO LAKINI HUYO MAMA ALIKUWA AMEENDA NA KILA KITU


'tulienda huko huyo mama ameenda kitu'

### Asking a user for new text data with an input box and displaying the sentiment

In [12]:
# Get the user input
text = input("Enter your swahili text: ")

# Preprocess the text with our function (wrapped in a list because the
# vectorizer expects a collection of documents)
new_text = [preprocess(text)]

# Extract the features with the fitted vectorizer and predict the label with our model
new_features = vectorizer.transform(new_text)
predicted_label = classifier.predict(new_features)
print(predicted_label)

Enter your swahili text: Sitaki ujinga
['negative']


## Deployment and hosting

1. I have created a Flask application that consists of HTML, CSS, Python and JavaScript files. The application lives in the Flask_App folder, which can be found in the GitHub repository.
2. We use the joblib files for the model and vectorizer to predict sentiment of user input on the site. 
3. The site is hosted on <a href="https://www.pythonanywhere.com/">pythonanywhere.com</a>
4. The link to my site is <a href="https://openda.pythonanywhere.com/">openda.pythonanywhere.com</a>

# <em><u>THE END</u></em>