# Data module

## dataloader.py

In [None]:
import os
import json
import pandas as pd
from icecream import ic
from data import data_dir_path

from sklearn.model_selection import train_test_split
import pandas as pd

def split_dataframe(df, train_size=0.7, test_size=0.2, val_size=0.1, random_state=None):
    """
    Split a dataframe into train, test, and validation sets.

    Parameters:
    df (pandas.DataFrame): The dataframe to be split.
    train_size (float): Proportion of the dataset to include in the train split.
    test_size (float): Proportion of the dataset to include in the test split.
    val_size (float): Proportion of the dataset to include in the validation split.
    random_state (int | None): Seed forwarded to both `train_test_split` calls.
        The default (None) keeps the split non-deterministic.

    Returns:
    tuple: Tuple containing three dataframes, in the order (train, test, validation).

    Raises:
    ValueError: If any proportion is not strictly between 0 and 1, or if the
        three proportions do not sum to 1.
    """
    sizes = (train_size, test_size, val_size)
    if any(size <= 0 or size >= 1 for size in sizes):
        raise ValueError("train_size, test_size and val_size must each be in (0, 1)")
    if abs(sum(sizes) - 1.0) > 1e-9:
        raise ValueError("train_size, test_size and val_size must sum to 1")

    # First, split into train and temp (test + validation)
    train_df, temp_df = train_test_split(
        df, train_size=train_size, random_state=random_state
    )

    # Share of temp_df that goes to test so that the overall test proportion
    # still equals test_size
    proportion_of_temp_for_test = test_size / (test_size + val_size)

    # Split temp into test and validation
    test_df, val_df = train_test_split(
        temp_df, train_size=proportion_of_temp_for_test, random_state=random_state
    )

    return train_df, test_df, val_df

def balance_data(df:pd.DataFrame)->pd.DataFrame:
    """
    Down-sample every sentiment category to the size of the smallest one.

    Parameters:
    df (pandas.DataFrame): Data with a 'sentiment' column.

    Returns:
    pandas.DataFrame: A new frame with a fresh 0..n-1 index holding the same
    number of rows for every sentiment category. Categories appear in order
    of first appearance in `df`. If `df` has no non-null sentiment values, an
    empty frame with the same columns is returned.
    """
    # Size of the rarest sentiment category (null sentiments are not counted)
    category_counts = df['sentiment'].value_counts()
    if category_counts.empty:
        return df.iloc[0:0].reset_index(drop=True)
    min_count = int(category_counts.min())

    # Sample each category with a fixed seed so the result is reproducible.
    # Null categories are skipped: they match no row and cannot be sampled.
    sampled_groups = [
        df[df['sentiment'] == category].sample(min_count, random_state=42)
        for category in df['sentiment'].dropna().unique()
    ]

    # One concat at the end instead of re-copying the growing frame each pass
    return pd.concat(sampled_groups, ignore_index=True)

def get_sentiment_from_rating(rating:float)->str:
    """
    Map a numeric rating onto a sentiment label.

    The rating is truncated to an integer first: below 3 is "negative",
    above 3 is "positive", and exactly 3 is "neutral".
    """
    stars = int(rating)
    if stars == 3:
        return "neutral"
    return "positive" if stars > 3 else "negative"

# Path of json file of fashion product reviews.
# Each path component is passed separately so the separator is chosen by the
# OS; a literal "\A" inside a normal string is also an invalid escape sequence.
data_path = os.path.join(
    data_dir_path,
    "fashion_data",
    "AMAZON_FASHION.json"
)

# The file is read as one JSON document per line. The encoding is set
# explicitly so decoding does not depend on the platform default.
with open(data_path, 'r', encoding='utf-8') as file:
    reviews = [json.loads(row) for row in file]

# Keep only the star rating and the review text
review_data = pd.DataFrame(reviews)[['overall','reviewText']]

ic(review_data.isna().sum())

# Drop null values
review_data.dropna(inplace=True)

# Reset index
review_data.reset_index(inplace=True, drop=True)

review_data['sentiment'] = review_data['overall'].apply(get_sentiment_from_rating)
ic(len(review_data), "reviews loaded.")

balanced_review_data = balance_data(review_data)
ic(len(balanced_review_data), "reviews available.")

# split_dataframe returns (train, test, validation) in that order
train_data, test_data, val_data = split_dataframe(balanced_review_data)

# EDA module

## data_exploration.py

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_review_sentiment_count_bars(
        value_counts:pd.Series,
        save_folder: str, 
        save_name:str,
        chart_title:str
    ):
    """
    Create a bar chart from value counts and save it to a given folder.

    The figure is closed after saving (also when saving fails), so repeated
    calls do not accumulate open matplotlib figures.

    Parameters:
        value_counts (pd.Series): Value counts data; the index supplies the
            bar labels and the values supply the bar heights.
        save_folder (str): Folder path where the chart will be saved.
        save_name (str): File name of the chart, without the ".png" extension.
        chart_title (str): Title for the bar chart.
    """
    # Create a DataFrame from the value counts series
    df = pd.DataFrame({'Values': value_counts.index, 'Counts': value_counts.values})

    # Create the bar chart
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.bar(df['Values'], df['Counts'])
        plt.xlabel('Values')
        plt.ylabel('Counts')
        plt.title(chart_title)
        plt.xticks(rotation=0)

        # Add count labels at the top of each bar
        for value, count in zip(df['Values'], df['Counts']):
            plt.text(value, count, str(count), ha='center', va='bottom', fontsize=12)

        # Save the chart to the specified folder
        save_path = f"{save_folder}/{save_name}.png"
        plt.savefig(save_path, bbox_inches='tight')
    finally:
        # Release the figure whether or not saving succeeded
        plt.close(fig)

## main.py

In [None]:
import os
from icecream import ic
from eda.data_exploration import plot_review_sentiment_count_bars
from data.dataloader import review_data, balanced_review_data
from results import results_dir_path
from utils.folder_utils import create_path


# Draw one bar chart for the raw reviews and one for the balanced reviews.
# Both are saved into the "eda" sub-folder of the results directory.
for frame, save_name, chart_title in (
    (review_data, "sentiments_counts", "Sentiments count bar plot"),
    (
        balanced_review_data,
        "balanced_sentiments_counts",
        "Balanced data sentiments count bar plot",
    ),
):
    save_folder = os.path.join(results_dir_path,"eda")
    create_path(save_folder)
    plot_review_sentiment_count_bars(
        value_counts=frame.sentiment.value_counts(),
        save_folder=save_folder,
        save_name=save_name,
        chart_title=chart_title
    )

# Evaluate module

## metrics.py

In [None]:
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    accuracy_score, 
    f1_score,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from icecream import ic
from utils.folder_utils import create_path

def _percentage_of_actual(actual, predicted, actual_label, predicted_label):
    """
    Return the percentage of `actual_label` samples predicted as `predicted_label`.

    The denominator counts every occurrence of `actual_label` in `actual`.
    The numerator counts position-wise pairs of (actual, predicted) that match
    the two labels. The result is rounded to 2 decimals, and 0.00 is returned
    when `actual_label` never occurs.
    """
    total_actual = sum(1 for label in actual if label == actual_label)

    if total_actual == 0:
        return 0.00

    matched = sum(
        1 for a, p in zip(actual, predicted)
        if a == actual_label and p == predicted_label
    )
    return round((matched / total_actual) * 100, 2)


def calculate_correct_positive(actual, predicted):
    """Percentage of actual 'positive' samples that were predicted 'positive'."""
    return _percentage_of_actual(actual, predicted, 'positive', 'positive')


def calculate_correct_negative(actual, predicted):
    """Percentage of actual 'negative' samples that were predicted 'negative'."""
    return _percentage_of_actual(actual, predicted, 'negative', 'negative')


def calculate_positive_classified_as_negative(actual, predicted):
    """Percentage of actual 'positive' samples that were predicted 'negative'."""
    return _percentage_of_actual(actual, predicted, 'positive', 'negative')


def calculate_negative_classified_as_positive(actual, predicted):
    """Percentage of actual 'negative' samples that were predicted 'positive'."""
    return _percentage_of_actual(actual, predicted, 'negative', 'positive')


def evaluate_model(
        actual:list[str], 
        prediction:list[str], 
        save_file_path:str, 
        file_name:str
    )->None:
    """
    Score sentiment predictions and store the scores as a text table.

    Macro-averaged precision, recall and F1, plus accuracy, are each rounded
    to 2 decimals. The four positive/negative percentage breakdowns are
    computed as well. The table is logged and written to
    "<save_file_path>/<file_name>.txt".

    Parameters:
    actual (list): List of actual sentiment labels.
    prediction (list): List of predicted sentiment labels.
    save_file_path (str): Folder to save the file in; created if missing.
    file_name (str): Name of the file, without the ".txt" extension.
    """

    # (metric name, value) pairs, in the order they appear in the report
    scored_metrics = [
        ('Precision', round(precision_score(actual, prediction, average='macro'), 2)),
        ('Recall', round(recall_score(actual, prediction, average='macro'), 2)),
        ('Accuracy', round(accuracy_score(actual, prediction), 2)),
        ('F1 Score', round(f1_score(actual, prediction, average='macro'), 2)),
        ('correct_positive_percentage',
         calculate_correct_positive(actual, prediction)),
        ('correct_negative_percentage',
         calculate_correct_negative(actual, prediction)),
        ('positive_classified_as_negative_percentage',
         calculate_positive_classified_as_negative(actual, prediction)),
        ('negative_classified_as_positive_percentage',
         calculate_negative_classified_as_positive(actual, prediction)),
    ]

    # Two-column results table
    results_df = pd.DataFrame({
        'Metric': [name for name, _ in scored_metrics],
        'Value': [value for _, value in scored_metrics]
    })

    ic(results_df)

    create_path(save_file_path)

    # Write the table, without the index column, to the text file
    report_text = results_df.to_string(index=False)
    full_path = f"{save_file_path}/{file_name}.txt"
    with open(full_path, 'w') as file:
        file.write(report_text)

    return None

def plot_and_save_confusion_matrix(actual, predicted, save_file_path, file_name):
    """
    Calculate the confusion matrix, plot it as a heatmap, and save the plot.

    The output folder is created if it does not exist (as `evaluate_model`
    does), and the figure is closed even when plotting or saving fails.

    Parameters:
    actual (list): List of actual labels.
    predicted (list): List of predicted labels.
    save_file_path (str): Folder where the plot should be saved.
    file_name (str): Name of the file to save the plot, without ".png".
    """

    # Fixed label order shared by the matrix rows/columns and the axis ticks
    labels = ["negative", "positive", "neutral"]

    # Calculate confusion matrix
    cm = confusion_matrix(actual, predicted, labels=labels)

    # Make sure the destination exists before trying to write into it
    create_path(save_file_path)

    # Plot using seaborn for a nicer looking heatmap
    fig = plt.figure(figsize=(10, 7))
    try:
        sns.heatmap(cm, annot=True, fmt='g', xticklabels=labels, yticklabels=labels)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted labels')
        plt.ylabel('True labels')

        # Save the plot
        plt.savefig(f"{save_file_path}/{file_name}.png")
    finally:
        # Close the plot
        plt.close(fig)

# Utils module

## folder_utils.py

In [None]:
import os

def create_path(folder_path:str)->None:
    """
    Make sure `folder_path` exists as a directory.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op.

    Raises:
    FileExistsError: If `folder_path` exists but is not a directory.
    """
    # makedirs handles nested paths and avoids the check-then-create race
    os.makedirs(folder_path, exist_ok=True)
    return None

# BERT module

## training.py

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import torch
import numpy as np

from data.dataloader import train_data,test_data

# Wrap the pandas train/test splits as `datasets.Dataset` objects.
# NOTE(review): the splits may carry a shuffled index; confirm whether
# from_pandas should keep it as an extra column.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_data, test_data)
)

## main.py

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
from tqdm import tqdm
from icecream import ic

from data.dataloader import (
    get_sentiment_from_rating, 
    test_data
)
from evaluate.metrics import (
    plot_and_save_confusion_matrix,
    evaluate_model
)
from results import results_dir_path

# Register `progress_apply` on pandas objects
tqdm.pandas()

# Tokenizer and classifier are loaded from the same pretrained checkpoint
_CHECKPOINT_NAME = "LiYuan/amazon-review-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT_NAME)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT_NAME)

def predict_sentiment(review:str)->str:
    """
    Predict the sentiment label of a single review with the loaded classifier.

    The review is tokenized (truncated to the model's limit) and classified,
    and the arg-max class is converted to a sentiment label through
    `get_sentiment_from_rating`.

    Parameters:
    review (str): Raw review text.

    Returns:
    str: "positive", "negative" or "neutral". Any failure while predicting is
    logged and reported as "neutral" so that a batch run is not aborted by a
    single bad row.
    """

    try:
        inputs = tokenizer(review, return_tensors="pt",truncation=True)

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        prediction_index = torch.argmax(logits, dim=1)

        # The class index is 0-based while get_sentiment_from_rating expects
        # a 1-5 star rating. NOTE(review): assumes the checkpoint's classes
        # are ordered "1 star" .. "5 stars" - confirm against
        # model.config.id2label.
        star_rating = prediction_index.item() + 1
        sentiment = get_sentiment_from_rating(star_rating)

        return sentiment

    except Exception as e:
        # Deliberate best-effort: log the offending review and the error
        ic(review)
        ic(e)

        return "neutral"

save_file_path = os.path.join(
    results_dir_path,
    "bert"
)
# The folder has to exist before the first CSV below is written. Otherwise a
# first run fails only after the whole prediction pass has finished.
os.makedirs(save_file_path, exist_ok=True)

test_data['prediction'] = test_data['reviewText'].progress_apply(predict_sentiment)

# Intermediate dump of the predictions, written before evaluation starts
csv_file_path = os.path.join(save_file_path,"_data.csv")
test_data.to_csv(csv_file_path, index=False)

prediction = list(test_data['prediction'])
actual = list(test_data['sentiment'])

evaluate_model(
    actual=actual,
    prediction=prediction,
    save_file_path=save_file_path,
    file_name="evaluation_matrix"
)

plot_and_save_confusion_matrix(
    actual=actual,
    predicted=prediction,
    save_file_path=save_file_path,
    file_name="confusion_matrix"
)

# Final copy of the data together with the predictions
csv_file_path = os.path.join(save_file_path,"data.csv")
test_data.to_csv(csv_file_path, index=False)

# Vader module

## main.py

In [None]:
import nltk
import os
from nltk.sentiment import SentimentIntensityAnalyzer
from icecream import ic
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from data.dataloader import test_data
from evaluate.metrics import evaluate_model, plot_and_save_confusion_matrix
from results import results_dir_path

# Register `progress_apply` on pandas objects
tqdm.pandas()

# Fetch the NLTK resources used below, in order: stop-word lists, WordNet
# lemmatizer data, Punkt tokenizer models and the VADER lexicon.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Stop-word sets keyed by language. The corpus lookup is done once per
# language instead of once per token, and membership tests on a set are O(1).
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop words for `language` as a cached frozenset."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """
    Lower-case, tokenize, drop English stop words and lemmatize `text`.

    Parameters:
    text (str): Raw review text.

    Returns:
    str: The remaining lemmatized tokens joined by single spaces.
    """

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words (the set is fetched once, not once per token)
    stop_words = _get_stop_words('english')
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# One shared analyzer instance, reused for every review
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(review_text:str):
    """
    Classify `review_text` from its VADER compound score.

    A compound score of 0.05 or more is 'positive', -0.05 or less is
    'negative', and anything in between is 'neutral'.
    """
    compound = sia.polarity_scores(review_text)['compound']

    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

# --- Pass 1: VADER on the raw review text ---------------------------------
test_data['prediction'] = test_data['reviewText'].progress_apply(analyze_sentiment)

actual = list(test_data['sentiment'])
prediction = list(test_data['prediction'])

# Every VADER artefact goes to the same results sub-folder
save_file_path = os.path.join(results_dir_path, "vader")

evaluate_model(
    actual=actual,
    prediction=prediction,
    save_file_path=save_file_path,
    file_name="evaluation_matrix"
)
plot_and_save_confusion_matrix(
    actual=actual,
    predicted=prediction,
    save_file_path=save_file_path,
    file_name="confusion_matrix"
)

# --- Pass 2: VADER on the preprocessed review text ------------------------
test_data['processedReviewText'] = test_data['reviewText'].progress_apply(preprocess_text)
test_data['predictionProcessed'] = test_data['processedReviewText'].progress_apply(analyze_sentiment)

prediction = list(test_data['predictionProcessed'])

evaluate_model(
    actual=actual,
    prediction=prediction,
    save_file_path=save_file_path,
    file_name="evaluation_matrix_processed"
)
plot_and_save_confusion_matrix(
    actual=actual,
    predicted=prediction,
    save_file_path=save_file_path,
    file_name="confusion_matrix_processed"
)

# Persist the test data together with both prediction columns
csv_file_path = os.path.join(save_file_path,"data.csv")
test_data.to_csv(csv_file_path, index=False)

# LLM module

## embedding.py

In [None]:
from langchain.embeddings import GooglePalmEmbeddings
from dotenv import load_dotenv

# Load variables from a .env file into the environment before any client is built
load_dotenv()

def embedding_function():
    """Build and return a new `GooglePalmEmbeddings` instance."""
    return GooglePalmEmbeddings()

## prompts.py

In [None]:
# Prompt template for sentiment classification of a fashion product review.
# It contains a single `{review}` placeholder that the caller fills in, and
# asks the model to answer with the sentiment label only.
sentiment_analysis_prompt = """
    Analyze the sentiment of the following customer review. 
    Note that the review may contain words that typically have a sensitive connotation, 
    but here they are used in the context of describing clothing or fashion items. 
    Your task is to interpret these words correctly within this context and 
    determine the overall sentiment of the review - 
    whether it is positive, negative, or neutral. 
    Please provide a clear sentiment label (positive/negative/neutral), focusing solely on the customer's 
    satisfaction or dissatisfaction with the clothing item. Provide the sentiment label only.
    Review: {review}
"""

## prompt_analyser.py

In [None]:
from langchain.prompts import PromptTemplate
from langchain.llms import GooglePalm
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
from icecream import ic
from langchain.llms import OpenAI
import json
from pydantic import BaseModel, validator

from llm.prompts import sentiment_analysis_prompt

class SentimentModel(BaseModel):
    """
    Pydantic model that reduces free-form text to a sentiment label.

    The `sentiment` field is replaced by the first of "positive", "negative",
    "neutral" (checked in that order) that occurs as a substring of the
    lower-cased input. It becomes None when none of them occurs.
    """
    sentiment: str

    @validator('sentiment')
    def match_sentiment(cls, v):
        """Return the first known label contained in `v`, or None."""
        lowered = v.lower()
        for candidate in ("positive", "negative", "neutral"):
            if candidate in lowered:
                return candidate
        return None

# Example Usage
# NOTE(review): this runs at import time, so every importer of this module
# prints one line and gets a module-level name `model` bound to the example.
model = SentimentModel(sentiment="I feel very positive about this!")
print(model.sentiment)  # Output will be "positive" if it's found in the input string


# Load variables from a .env file before the LLM clients below are constructed
# (presumably the API keys - confirm against the deployment setup)
load_dotenv()

# Primary model used by predict_sentiment
llm = GooglePalm(temperature=0.0)
# Chat model used by predict_sentiment when the primary call raises
chat_llm = ChatGoogleGenerativeAI(model="gemini-pro")

# gpt_llm = OpenAI(model_name="text-davinci-003", temperature=0.0)

def predict_sentiment(review:str):
    """
    Classify the sentiment of a review with the LLM, falling back to the chat model.

    The review is inserted into `sentiment_analysis_prompt` and sent to `llm`.
    If that call raises, the same filled-in prompt is sent to `chat_llm`.
    In both cases the model's text is reduced to a label via `SentimentModel`.

    Parameters:
    review (str): Raw review text.

    Returns:
    "positive", "negative" or "neutral" when the model's answer contains one
    of those words. Otherwise the primary path returns the model's raw answer
    unchanged and the fallback path returns None.
    """

    try:

        prompt = PromptTemplate(
            template=sentiment_analysis_prompt,
            input_variables=["review"],
        )

        chain = prompt | llm
        raw_output = chain.invoke({
                "review" : review,
        })

    except Exception as e:
        # Deliberate best-effort: any failure of the primary model is logged
        # and the request is retried on the chat model.
        ic("In exception")
        ic(e)

        # The review must actually be substituted into the prompt; a plain
        # string containing "{review}" would send the placeholder itself.
        prompt = sentiment_analysis_prompt.format(review=review)
        output = chat_llm.invoke(prompt).content.lower()
        result = SentimentModel(sentiment=output).sentiment
        ic(output)
        ic(result)

        return result

    # Normalise the primary answer the same way as the fallback answer so that
    # casing or stray whitespace cannot produce labels outside the three
    # classes. If no known label is present, hand the raw text back unchanged.
    if isinstance(raw_output, str):
        normalized = SentimentModel(sentiment=raw_output).sentiment
        if normalized is not None:
            return normalized

    return raw_output

if __name__ == "__main__":
    # Manual smoke test: query the primary LLM directly with an ad-hoc prompt,
    # then through the shared prompt template.

    review = """
        'Size ordered fits as expected. Looked real nice when received.  After a week '
             'of wearing it its pretty scratched up. Scratches real easy. Very light an '
             'almost plastic feeling. But hey its a 14 dollar ring. Over all i like it' 
    """
    prompt = f"""
        Please analyze the sentiment of the given fashion product review and 
        classify it as either positive, negative, or neutral. 
        Please provide a detailed response that accurately represents the user's sentiment. 
        Provide the answer in a single word.

        Product Review: {review}

        Sentiment:

    """
    # The module-level client is named `llm`; `google_llm` is not defined here
    ic(llm.invoke(prompt))

    prompt = PromptTemplate(
        template=sentiment_analysis_prompt,
        input_variables=["review"],
    )

    chain = prompt | llm
    result = chain.invoke({
        "review" : review,
    })
    ic(result)

## main.py

In [None]:
from tqdm import tqdm
import os

from data.dataloader import test_data
from evaluate.metrics import evaluate_model, plot_and_save_confusion_matrix
from results import results_dir_path
from llm.prompt_analyser import predict_sentiment

# Register `progress_apply` on pandas objects
tqdm.pandas()

test_data['prediction'] = test_data['reviewText'].progress_apply(predict_sentiment)


prediction = list(test_data['prediction'])
actual = list(test_data['sentiment'])
save_file_path = os.path.join(
    results_dir_path,
    "llm"
)
# The folder has to exist before the first CSV below is written. Otherwise a
# first run fails only after the whole prediction pass has finished.
os.makedirs(save_file_path, exist_ok=True)

# Intermediate dump of the predictions, written before evaluation starts
csv_file_path = os.path.join(save_file_path,"_data.csv")
test_data.to_csv(csv_file_path, index=False)

evaluate_model(
    actual=actual,
    prediction=prediction,
    save_file_path=save_file_path,
    file_name="evaluation_matrix"
)

plot_and_save_confusion_matrix(
    actual=actual,
    predicted=prediction,
    save_file_path=save_file_path,
    file_name="confusion_matrix"
)

# Final copy of the data together with the predictions
csv_file_path = os.path.join(save_file_path,"data.csv")
test_data.to_csv(csv_file_path, index=False)

# Webapp module

## main.py

In [None]:
import streamlit as st
from llm.prompt_analyser import predict_sentiment

# Page layout: title, free-text review box and the trigger button
st.title("Product Review Sentiment Analysis")
review_text = st.text_area("Enter your product review here:")
predict_button = st.button("Predict Sentiment")

# Run the prediction only on a button click and only for non-blank input
if predict_button:
    if review_text.strip():
        sentiment = predict_sentiment(review_text)
        st.header(f"Sentiment: {sentiment}")
    else:
        st.error("Please enter a review before predicting sentiment.")
