In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import pipeline
from tqdm import tqdm
import torch


In [None]:

# Report whether PyTorch can see a CUDA (NVIDIA) device and, if so, which one
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # Index 0 is the first visible CUDA device
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU detected")


In [None]:
# Load the raw reviews CSV into a DataFrame (path is relative to the notebook's working directory)
df = pd.read_csv("../data/raw/reviews.csv")

In [None]:
# Display the column labels of the loaded DataFrame
df.columns

In [None]:
# Keep only rows whose review_text is present and is not blank after stripping whitespace
df = df.loc[
    df["review_text"].notna() & df["review_text"].str.strip().ne("")
]

print(f"Remaining rows after dropping empty reviews: {len(df)}")



In [None]:
# Count rows whose review_text repeats an earlier row (the first occurrence itself is not counted)
duplicate_count = df["review_text"].duplicated().sum()
print(f"Number of duplicate reviews: {duplicate_count}")

In [None]:
# De-duplicate on review_text: the first copy of each review is kept (the default)
# and the surviving rows are renumbered from 0
df = df.drop_duplicates(subset="review_text", ignore_index=True)

# Verify: report the row count alongside the number of distinct review texts
print(f"Remaining rows after dropping duplicates: {len(df)}")
print(f"Unique reviews: {df['review_text'].nunique()}")



In [None]:
# Shared resources for text preprocessing.
# English stop words (e.g., "the", "is", "in"): common words that carry little meaning on
# their own and are filtered out of the text; held in a set for fast membership tests
stop_words = {*stopwords.words("english")}
# Porter stemmer: reduces words to their root form (e.g., "running" -> "run")
stemmer = PorterStemmer()

# Function to clean and preprocess text data
def clean_text(text):
    """Lowercase, strip non-letters, drop stop words and stem what remains.

    Args:
        text: Raw review text. It is coerced with ``str()``, so non-string
            values are processed as their string representation.

    Returns:
        str: The surviving stemmed words joined by single spaces
        (an empty string when nothing survives).
    """
    # Replace every non-letter with a SPACE rather than deleting it. Deleting fuses
    # words that are separated only by punctuation ("great,but" -> "greatbut") and
    # turns contractions into tokens such as "dont" that are not in the stop list;
    # with a space, "don't" splits into "don" and "t", which can be matched instead.
    letters_only = re.sub(r"[^a-z\s]", " ", str(text).lower())
    # Tokenize on whitespace, remove stop words, and stem the remaining words
    words = [stemmer.stem(w) for w in letters_only.split() if w not in stop_words]
    # Join the processed words back into a single string
    return " ".join(words)

# Run every review_text value through clean_text and store the returned string
# in a new 'clean_review' column of the DataFrame
df["clean_review"] = df["review_text"].map(clean_text)

In [None]:
# Persist the current DataFrame to disk as a pickle file
df.to_pickle("../data/processed/cleaned_reviews.pkl")

In [None]:
# Reload the DataFrame from the pickle file, replacing the in-memory df.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df = pd.read_pickle("../data/processed/cleaned_reviews.pkl")

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Build the sentiment classifier (DistilBERT fine-tuned on SST-2).
# Without an explicit `device` the pipeline runs on CPU, so place it on the first
# CUDA GPU when one is visible and fall back to CPU (device=-1) otherwise.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Register tqdm with pandas; this adds the `progress_apply` method, which behaves like
# `apply` but shows a progress bar (used for the sentiment analysis below)
tqdm.pandas()

# Function to get sentiment of a given text
def get_sentiment(text):
    """Classify a single text with the notebook-level ``sentiment_pipeline``.

    Args:
        text: Text to classify. Anything that is not a non-blank string
            (None, NaN, "", whitespace only) is treated as empty.

    Returns:
        dict: ``{"label": ..., "score": ...}``. For empty input this is the
        placeholder ``{"label": "NEUTRAL", "score": 0.0}``; otherwise it is the
        first result returned by the pipeline.
    """
    # Empty / missing input: return a neutral placeholder without calling the model
    if not isinstance(text, str) or not text.strip():
        return {"label": "NEUTRAL", "score": 0.0}
    # Let the tokenizer truncate to 512 *tokens*. Slicing the string (text[:512])
    # cuts at 512 characters instead, which throws away most of a long review.
    return sentiment_pipeline(text, truncation=True, max_length=512)[0]

# Score the original review text rather than `clean_review`. The cleaned column has
# stop words removed (NLTK's English list includes negations such as "not" and "no")
# and is stemmed, so "not good" can reach the model as "good"; the SST-2 DistilBERT
# model is meant to read natural sentences.
df["sentiment_result"] = df["review_text"].progress_apply(get_sentiment)