We First feed raw data to the model which cleans the data by;
1. Cleaning 
2. preparing the text data 
3. removing stopwords, punctuation, and other noise
The data is then saved as a pdf, which contains clean data without any sentiment label

In [2]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.mwananchi.co.tz/'  # Replace with the URL of the website you want to scrape
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all the <p> tags on the page
paragraphs = soup.find_all('p')

# Print the text of each paragraph
for p in paragraphs:
    print(p.text)
    print("hello")


In [None]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

data = pd.read_csv('Sentiment_Analyzer/dataset/Dataset_1.csv')

# Remove URLs
data['text'] = data['text'].apply(lambda x: re.sub(r'http\S+', '', x))

# Remove special characters
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]+', ' ', x))

# Convert text to lowercase
data['text'] = data['text'].apply(lambda x: x.lower())

# Tokenize the text data
data['tokenized_text'] = data['text'].apply(lambda x: word_tokenize(x))

# Remove stopwords
stop_words = set(stopwords.words('swahili'))

data['filtered_tokens'] = data['tokenized_text'].apply(lambda x: [token for token in x if token not in stop_words])

stemmer = SnowballStemmer('english')

data['stemmed_tokens'] = data['filtered_tokens'].apply(lambda x: [stemmer.stem(token) for token in x])

# Display a sample of the stemmed data
print(data['stemmed_tokens'].sample(min(10, len(data)), replace=True))

lemmatizer = WordNetLemmatizer()

data['lemmatized_tokens'] = data['filtered_tokens'].apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

# Display a sample of the lemmatized data
print(data['lemmatized_tokens'].sample(min(10, len(data)), replace=True))

all_words = ' '.join([word for tokens in data['lemmatized_tokens'] for word in tokens])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110, stopwords=STOPWORDS).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

data.to_csv('Sentiment_Analyzer/dataset/Cleaned_Dataset_1.csv', index=False)

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from textblob import TextBlob
import gradio as gr

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Load the dataset
df = pd.read_csv("/workspaces/Sentiment_Analyzer/Sentiment_Analyzer/dataset/swahili.csv")
df.head

<bound method NDFrame.head of       Unnamed: 0                                             maneno     lugha
0              0                 team 2019merimera alikuwa takataka  negative
1              1                                     sijafurahishwa  negative
2              2                                      kubuni dosari  negative
3              3                  bila kusema nilipoteza pesa zangu  negative
4              4                       sema kupoteza pesa na wakati  negative
...          ...                                                ...       ...
3920        2995  Nafikiri chakula chapasa kuwa na ladha na umbi...  negative
3921        2996                   hamu ya kula ilitoweka mara moja  negative
3922        2997            Kwa ujumla sikuvutiwa na nisirudi nyuma  negative
3923        2998  Mambo yote yaliyoonwa yalikuwa chini ya kiwang...  negative
3924        2999  Basi ni kana kwamba nilipoteza maisha yangu ya...  negative

[3925 rows x 3 columns]>

In [4]:
# Text preprocessing
stop_words = stopwords.words("swahili")
df["maneno"] = df["maneno"].apply(lambda x: " ".join([word for word in re.sub('[^a-zA-Z0-9\s]', '', x).split() if word not in stop_words]))


In [5]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["maneno"], df["lugha"], test_size=0.3, random_state=42)


In [6]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [7]:
# Train an SVM classifier
svm = SVC(kernel='linear', C=1.0)
svm.fit(X_train_vec, y_train)

In [8]:
# Make predictions and print results
y_pred = svm.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print(f"Results for swahili.csv")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1_score:.2f}")
print(f"Polarity: {TextBlob(' '.join(df['maneno'])).sentiment.polarity:.2f}")


Results for swahili.csv
Accuracy: 0.78
F1 Score: 0.78
Polarity: -0.09


In [10]:
def predict_sentiment(text):
    # Preprocess the input text
    text = " ".join([word for word in re.sub('[^a-zA-Z0-9\s]', '', text).split() if word not in stop_words])
    # Vectorize the input text
    text_vec = vectorizer.transform([text])
    # Predict the sentiment of the input text
    sentiment = svm.predict(text_vec)[0]
    return sentiment


In [11]:
def predict_sentimentcsv(data):
    # Preprocess the new data
    data["maneno"] = data["maneno"].apply(lambda x: " ".join([word for word in re.sub('[^a-zA-Z0-9\s]', '', x).split() if word not in stop_words]))

    # Vectorize the new data
    new_data_vec = vectorizer.transform(data["maneno"])

    # Predict the sentiment of the new data
    sentiment = svm.predict(new_data_vec)

    # Calculate the polarity scores of the new data
    new_data_polarity = [TextBlob(text).sentiment.polarity for text in data["maneno"]]

    # Calculate the precision, recall, F1 score, and support for the new data
    precision_new, recall_new, f1_score_new, support_new = precision_recall_fscore_support(sentiment, data["lugha"], average='weighted')

    # Calculate the accuracy for the new data
    accuracy_new = accuracy_score(data["lugha"], sentiment)*100

    return sentiment, new_data_polarity, precision_new, recall_new, f1_score_new, support_new, accuracy_new


In [None]:
# Create the Gradio interface
#input_text = gr.inputs.Textbox(label="Input Text")
#output_sentiment = gr.outputs.Label(label="Sentiment Prediction")
#gr.Interface(fn=predict_sentiment, inputs=input_text, outputs=output_sentiment, 
  #           title="Swahili Sentiment Analyzer", description="Predict the sentiment of Swahili text using an SVM classifier trained on a dataset of Swahili text.").launch()

In [None]:
#def load_csv(file_path):
    # Load the CSV data
 #   df = pd.read_csv(file_path)

    # Text preprocessing
  #  df["maneno"] = df["maneno"].apply(lambda x: " ".join([word for word in re.sub('[^a-zA-Z0-9\s]', '', x).split() if word not in stop_words]))

   # return df

In [None]:
#def get_sentiment_metrics(df):
    # Vectorize the text data using TF-IDF
 #   X_vec = vectorizer.transform(df["maneno"])

    # Make predictions and print results
  #  y_pred = svm.predict(X_vec)
   # accuracy = accuracy_score(df["lugha"], y_pred)
    #precision, recall, f1_score, _ = precision_recall_fscore_support(df["lugha"], y_pred, average='weighted')
    
    #print(f"Results for input CSV data")
    #print(f"Accuracy: {accuracy:.2f}")
    #print(f"F1 Score: {f1_score:.2f}")


In [22]:
import gradio as gr
import pandas as pd
#from swahili_sentiment import test_sentiment, predict_sentiment

# Define input and output interfaces
csv_input = gr.inputs.File(label="Upload CSV file")


# Define input and output interfaces
input_text = gr.inputs.Textbox(label="Input Text")
output_sentiment = gr.outputs.Textbox(label="Sentiment")

# Define function to load CSV data and predict sentiment
def predict_csv_sentiment(data):
    df = pd.read_csv(data["csv"])
    sentiment, _, precision, recall, f1_score, support, accuracy = predict_sentimentcsv(df)

    # Assign predicted sentiment to a new column in the DataFrame
    df["Sentiment"] = sentiment

    # Return the DataFrame as HTML table with summary statistics
    summary_stats = f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1_score:.2f}, Support: {support:.2f}, Accuracy: {accuracy:.2f}%"
    return f"{df.to_html()}<br>{summary_stats}"

# Define interfaces for CSV and Text input
iface_csv = gr.Interface(fn=predict_csv_sentiment, inputs=csv_input, outputs="html", 
                         title="Swahili Sentiment Analyzer for CSV", 
                         description="Predict the sentiment and polarity score of Swahili text from a CSV file using an SVM classifier trained on a dataset of Swahili text.")

iface_text = gr.Interface(fn=predict_sentiment, inputs=input_text, outputs=output_sentiment, 
             title="Swahili Sentiment Analyzer for Text", 
             description="Predict the sentiment of Swahili text using an SVM classifier trained on a dataset of Swahili text.")


# Launch the interfaces
iface_text.launch()
iface_csv.launch()





Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


