In [49]:
import pandas as pd
import numpy as np
import collections

In [50]:
#Load and inspect data
df = pd.read_csv(r"C:\Users\User\Downloads\sentimentdataset.csv", encoding='latin-1', header=None, engine='python', quotechar='"')
display(df.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
1,0.0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
2,1.0,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
3,2.0,2,Just finished an amazing workout! ðª ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
4,3.0,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18


In [51]:
#clean data
import nltk
import string
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    text = ' '.join([word for word in words if word not in stop_words])
    return text

# Apply preprocessing to the text column
df['cleaned_text'] = df[2].apply(preprocess_text)

display(df[['cleaned_text']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,cleaned_text
0,text
1,enjoying beautiful day park
2,traffic terrible morning
3,finished amazing workout ðª
4,excited upcoming weekend getaway


In [52]:
# Define x and y,split data
assert 'X_train' in globals(), "X_train is not defined. Please run the cell that splits the data first."

# Create a vocabulary of all unique words
all_words = [word for text in df['cleaned_text'] for word in text.split()]
vocabulary = collections.Counter(all_words)
print("Vocabulary size:", len(vocabulary))

# Tokenize the training and testing data
X_train_tokens = X_train.apply(lambda x: x.split())
X_test_tokens = X_test.apply(lambda x: x.split())

# Function to convert text to Bag-of-Words vector
def text_to_bow(text_tokens, vocab):
    bow_vector = np.zeros(len(vocab))
    for word in text_tokens:
        if word in vocab:
            bow_vector[list(vocab.keys()).index(word)] += 1
    return bow_vector

# Apply Bag-of-Words transformation to training and testing data
X_train_bow = np.array([text_to_bow(tokens, vocabulary) for tokens in X_train_tokens])
X_test_bow = np.array([text_to_bow(tokens, vocabulary) for tokens in X_test_tokens])

print("Training data shape after BoW:", X_train_bow.shape)
print("Testing data shape after BoW:", X_test_bow.shape)

Vocabulary size: 2479
Training data shape after BoW: (586, 2479)
Testing data shape after BoW: (147, 2479)


In [53]:
#Implement Naive Bayes class
class NaiveBayesClassifier:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Smoothing parameter
        self.prior_probs = {}
        self.conditional_probs = {}
        self.vocabulary = None

    def fit(self, X_bow, y):
        n_samples, n_features = X_bow.shape
        self.vocabulary = range(n_features)
        classes = np.unique(y)

        for cls in classes:
            # Calculate prior probabilities
            class_samples = X_bow[y == cls]
            self.prior_probs[cls] = np.log((len(class_samples) + self.alpha) / (n_samples + len(classes) * self.alpha))

            # Calculate conditional probabilities
            word_counts = np.sum(class_samples, axis=0)
            total_words_in_class = np.sum(word_counts)
            self.conditional_probs[cls] = np.log((word_counts + self.alpha) / (total_words_in_class + n_features * self.alpha))

    def predict(self, X_bow):
        predictions = []
        for sample_bow in X_bow:
            scores = {}
            for cls in self.prior_probs:
                # Calculate posterior probability (in log scale)
                scores[cls] = self.prior_probs[cls] + np.sum(sample_bow * self.conditional_probs[cls])
            predictions.append(max(scores, key=scores.get))
        return np.array(predictions)

# train the Naive Bayes classifier
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train_bow, y_train)

print("Naive Bayes classifier trained successfully!")

Naive Bayes classifier trained successfully!


In [54]:
from sklearn.model_selection import train_test_split


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 586
Testing set size: 147


In [55]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions 
y_pred = nb_classifier.predict(X_test_bow)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.2721
Precision: 0.2103
Recall: 0.2721
F1-score: 0.2040


In [56]:
# Identify misclassified examples
misclassified_indices = np.where(y_test != y_pred)[0]
misclassified_texts = X_test.iloc[misclassified_indices]
actual_labels = y_test.iloc[misclassified_indices]
predicted_labels = y_pred[misclassified_indices]

# Display some of the misclassified examples
print("Misclassified Examples:")
for i in range(min(10, len(misclassified_indices))):
    print(f"Text: {misclassified_texts.iloc[i]}")
    print(f"Actual Label: {actual_labels.iloc[i]}")
    print(f"Predicted Label: {predicted_labels[i]}")
    print("-" * 20)

Misclassified Examples:
Text: planning surprise scavenger hunt friends anticipating thrill excitement
Actual Label:  Excitement 
Predicted Label:  Joy 
--------------------
Text: coding new project enthusiasm
Actual Label:  Positive  
Predicted Label:  Excitement 
--------------------
Text: boredom lingers stagnant pool indifference
Actual Label:  Boredom         
Predicted Label:  Positive  
--------------------
Text: radiant joy akin blooming flowers sunkissed spring morning
Actual Label:  Radiance    
Predicted Label:  Radiance      
--------------------
Text: hopeful potential personal growth
Actual Label:  Hope          
Predicted Label:  Positive  
--------------------
Text: trying new dessert recipe
Actual Label:  Positive  
Predicted Label:  Excitement 
--------------------
Text: draped warmth kindness quilt compassion stitched love
Actual Label:  Kindness 
Predicted Label:  Joy 
--------------------
Text: lost headphones vanish thin air headphonemystery teenlife
Actual Label: 

The first step was loading and cleaning the raw data ,for feature extraction ,bag of words method  was used to convert naive bayes algorithims to numerical features which involved creating vocabulary of all unique words.naive bayes algorithim  was implemented as a python class. The Data was then split,model was trained and the trained model was used to make predictions.
