https://medium.com/@zubairashfaque/sentiment-analysis-with-naive-bayes-algorithm-a31021764fb4

In [2]:
import csv

texts = []
labels = []

with open('../../data/clean/reddit_sentiments.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    has_header = csv.Sniffer().has_header(csvfile.read(1024))
    csvfile.seek(0) 
    if has_header:
        next(reader)  # just skip the header
    for row in reader:
        texts.append(row[0])
        labels.append(row[1])

print("texts:", texts[0])
print("labels:", labels[0])

texts: well there are thousands of international students here illegally so we gotta ramp it up
labels: NEG


In [13]:
import pandas as pd

df = pd.read_csv('../../data/clean/reddit_sentiments.csv')

print(df.head())

train_df = df.sample(frac=0.8, random_state=42)

                                             comment sentiment
0  well there are thousands of international stud...       NEG
1  the article said dude needed a translator lol ...       NEG
2           for those convicted of crimes thats good       POS
3            good gotta bump up those rookie numbers       POS
4                                               good       POS


In [14]:
def split_data_by_sentiment(data, sentiment):
    """
    Split the data DataFrame into separate lists based on sentiment.

    Parameters:
       data (DataFrame): The input DataFrame containing 'text' and 'sentiment' columns.
       sentiment (str): The sentiment label to filter the data.

    Returns:
        list: A list of text corresponding to the specified sentiment.
    """
    return data[data['sentiment'] == sentiment]['comment'].tolist()

# Assuming df is your DataFrame containing 'text' and 'sentiment' columns
positive_data = split_data_by_sentiment(df, 'POS')
negative_data = split_data_by_sentiment(df, 'NEG')
neutral_data = split_data_by_sentiment(df, 'NEU')


In [15]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
    # Remove punctuation from the text using translation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenize the text into individual words
    tokens = nltk.word_tokenize(text)
    
    # Initialize a Porter stemmer for word stemming
    stemmer = PorterStemmer()
    
    # Get a set of English stopwords from NLTK
    stopwords_set = set(stopwords.words("english"))
    
    # Apply stemming to each token and filter out stopwords
    tokens = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    
    # Return the preprocessed tokens
    return tokens

In [16]:
from collections import defaultdict

def calculate_word_counts(textes):
    # Initialize a defaultdict to store word counts, defaulting to 0 for unseen words
    word_count = defaultdict(int)
    
    # Iterate through each text in the given list of textes
    for text in textes:
        # Tokenize and preprocess the text using the preprocess_text function
        tokens = preprocess_text(text)
        
        # Iterate through each token in the preprocessed tokens
        for token in tokens:
            # Increment the count for the current token in the word_count dictionary
            word_count[token] += 1
    
    # Return the word_count dictionary containing word frequencies
    return word_count

# Calculate word counts for textes with positive sentiment
word_count_positive = calculate_word_counts(train_df[train_df['sentiment'] == 'POS']['comment'])

# Calculate word counts for textes with negative sentiment
word_count_negative = calculate_word_counts(train_df[train_df['sentiment'] == 'NEG']['comment'])

# Calculate word counts for textes with neutral sentiment
word_count_neutral = calculate_word_counts(train_df[train_df['sentiment'] == 'NEU']['comment'])

In [17]:
def calculate_likelihood(word_count, total_words, laplacian_smoothing=1):
    # Create an empty dictionary to store the likelihood values
    likelihood = {}
    
    # Get the number of unique words in the vocabulary
    vocabulary_size = len(word_count)

    # Iterate through each word and its corresponding count in the word_count dictionary
    for word, count in word_count.items():
        # Calculate the likelihood using Laplacian smoothing formula
        # Laplacian smoothing is used to handle unseen words in training data
        # The formula is (count + smoothing) / (total_words + smoothing * vocabulary_size)
        likelihood[word] = (count + laplacian_smoothing) / (total_words + laplacian_smoothing * vocabulary_size)

    # Return the calculated likelihood dictionary
    return likelihood

In [18]:
likelihood_positive = calculate_likelihood(word_count_positive, len(train_df[train_df['sentiment'] == 'POS']), 1)
likelihood_negative = calculate_likelihood(word_count_negative, len(train_df[train_df['sentiment'] == 'NEG']), 1)
likelihood_neutral = calculate_likelihood(word_count_neutral, len(train_df[train_df['sentiment'] == 'NEU']), 1)

In [19]:
import math

def calculate_log_prior(sentiment, data):
    # Calculate the natural logarithm of the ratio of textes with the specified sentiment to the total number of textes
    log_prior = math.log(len(data[data['sentiment'] == sentiment]) / len(data))
    
    # Return the calculated log prior
    return log_prior

# Calculate the log prior for textes with positive sentiment
log_prior_positive = calculate_log_prior('POS', df)

# Calculate the log prior for textes with negative sentiment
log_prior_negative = calculate_log_prior('NEG', df)

# Calculate the log prior for textes with neutral sentiment
log_prior_neutral = calculate_log_prior('NEU', df)

In [20]:
# Create a dictionary of log-likelihood values for positive sentiment
log_likelihood_positive = {word: math.log(prob) for word, prob in likelihood_positive.items()}

# Create a dictionary of log-likelihood values for negative sentiment
log_likelihood_negative = {word: math.log(prob) for word, prob in likelihood_negative.items()}

# Create a dictionary of log-likelihood values for neutral sentiment
log_likelihood_neutral = {word: math.log(prob) for word, prob in likelihood_neutral.items()}

In [21]:
def classify_text_with_scores(text, log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral,
                               log_prior_positive, log_prior_negative, log_prior_neutral):
    # Tokenize and preprocess the input text
    tokens = preprocess_text(text)

    # Calculate the log scores for each sentiment category
    log_score_positive = log_prior_positive + sum([log_likelihood_positive.get(token, 0) for token in tokens])
    log_score_negative = log_prior_negative + sum([log_likelihood_negative.get(token, 0) for token in tokens])
    log_score_neutral = log_prior_neutral + sum([log_likelihood_neutral.get(token, 0) for token in tokens])

    # Store the sentiment scores in a dictionary
    sentiment_scores = {
        'positive': log_score_positive,
        'negative': log_score_negative,
        'neutral': log_score_neutral
    }

    # Determine the predicted sentiment based on the highest sentiment score
    predicted_sentiment = max(sentiment_scores, key=sentiment_scores.get)
    
    # Return the predicted sentiment and the sentiment scores
    return predicted_sentiment, sentiment_scores


In [25]:

# Classify a sample text using the trained model
text = "I love you so much!"
predicted_sentiment, sentiment_scores = classify_text_with_scores(text, log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral,
                                                                    log_prior_positive, log_prior_negative, log_prior_neutral)

print("Sample sentence:", text)
print("Predicted Sentiment:", predicted_sentiment)
print("Sentiment Scores:", sentiment_scores)


Sample sentence: I love you so much!
Predicted Sentiment: positive
Sentiment Scores: {'positive': -11.383843163704702, 'negative': -11.663342264697556, 'neutral': -13.402463765735227}
