## Data Collection and Preprocessing

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the dataset (adjust the path to your file)
df = pd.read_csv('../data/raw/IMDB Dataset.csv')  # e.g., "IMDB Dataset.csv"
print(df.head())  # Check the first few rows
print(df['sentiment'].value_counts())  # Check distribution: should be ~25k positive, ~25k negative

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RIJU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\RIJU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RIJU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [5]:
# Preprocessing function
stop_words = set(stopwords.words('english'))
def preprocess_review(review):
    # Split into sentences and clean
    sentences = sent_tokenize(review.lower())
    cleaned_sentences = []
    for sent in sentences:
        # Tokenize and clean
        words = word_tokenize(sent)
        words = [word for word in words if word.isalnum() and word not in stop_words]
        if words:  # Only keep non-empty sentences
            cleaned_sentences.append(" ".join(words))
    return cleaned_sentences

# Apply preprocessing to the 'review' column
df['cleaned_sentences'] = df['review'].apply(preprocess_review)

# Check a sample
print(df[['review', 'cleaned_sentences']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                   cleaned_sentences  
0  [one reviewers mentioned watching 1 oz episode...  
1  [wonderful little production, br br filming te...  
2  [thought wonderful way spend time hot summer w...  
3  [basically family little boy jake thinks zombi...  
4  [petter mattei love time money visually stunni...  


In [6]:
sentiment_mapping = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(sentiment_mapping)

In [7]:
df.head()

Unnamed: 0,review,sentiment,cleaned_sentences
0,One of the other reviewers has mentioned that ...,1,[one reviewers mentioned watching 1 oz episode...
1,A wonderful little production. <br /><br />The...,1,"[wonderful little production, br br filming te..."
2,I thought this was a wonderful way to spend ti...,1,[thought wonderful way spend time hot summer w...
3,Basically there's a family where a little boy ...,0,[basically family little boy jake thinks zombi...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,[petter mattei love time money visually stunni...


## Topic Modeling wiht LDA
### Aspect Extraction

In [8]:
# imports
from gensim import corpora
from gensim.models import LdaModel

In [9]:
# Flatten the list of sentences into a single list for LDA
all_sentences = [sent for sublist in df['cleaned_sentences'] for sent in sublist if sent]  # Remove empty strings

In [10]:
# Tokenize sentences into words for LDA
tokenized_sentences = [sent.split() for sent in all_sentences]

In [11]:
# Create a dictionary and corpus
dictionary = corpora.Dictionary(tokenized_sentences)
corpus = [dictionary.doc2bow(text) for text in tokenized_sentences]

# Train LDA model (e.g., 5 topics, adjust as needed)
num_topics = 5  # Start with 5, you can experiment later
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42)

In [12]:
# Print the topics to inspect
print("Topics discovered by LDA:")
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topics discovered by LDA:
Topic 0: 0.029*"film" + 0.015*"story" + 0.012*"movie" + 0.012*"characters" + 0.011*"good" + 0.010*"acting" + 0.009*"plot" + 0.008*"scenes" + 0.008*"well" + 0.008*"great"
Topic 1: 0.081*"br" + 0.035*"movie" + 0.019*"film" + 0.016*"one" + 0.014*"like" + 0.013*"would" + 0.010*"see" + 0.009*"time" + 0.009*"really" + 0.008*"could"
Topic 2: 0.006*"one" + 0.005*"war" + 0.005*"new" + 0.005*"years" + 0.005*"world" + 0.004*"friends" + 0.004*"first" + 0.004*"two" + 0.003*"american" + 0.003*"old"
Topic 3: 0.008*"one" + 0.008*"get" + 0.007*"life" + 0.007*"like" + 0.007*"man" + 0.007*"people" + 0.006*"around" + 0.005*"scene" + 0.005*"way" + 0.005*"story"
Topic 4: 0.048*"br" + 0.011*"character" + 0.009*"role" + 0.007*"played" + 0.007*"performance" + 0.006*"love" + 0.006*"plays" + 0.006*"also" + 0.005*"best" + 0.005*"actor"


In [13]:
# Function to get the dominant topic for a sentence
def get_dominant_topic(sentence):
    if not sentence:  # Handle empty sentences
        return -1  # Placeholder for empty
    bow = dictionary.doc2bow(sentence.split())
    topics = lda_model[bow]
    if not topics:  # If no topics assigned
        return -1
    return max(topics, key=lambda x: x[1])[0]  # Return dominant topic ID

In [14]:
# Assign dominant topics to each sentence in each review
df['dominant_topic'] = df['cleaned_sentences'].apply(lambda x: [get_dominant_topic(sent) for sent in x])

In [15]:
# Check a sample
print("\nSample reviews with assigned topics:")
print(df[['cleaned_sentences', 'dominant_topic']].head())


Sample reviews with assigned topics:
                                   cleaned_sentences  \
0  [one reviewers mentioned watching 1 oz episode...   
1  [wonderful little production, br br filming te...   
2  [thought wonderful way spend time hot summer w...   
3  [basically family little boy jake thinks zombi...   
4  [petter mattei love time money visually stunni...   

                   dominant_topic  
0  [1, 1, 3, 3, 3, 3, 3, 1, 1, 2]  
1           [0, 0, 1, 0, 0, 0, 0]  
2                    [1, 0, 1, 0]  
3              [3, 1, 3, 0, 1, 0]  
4     [1, 2, 3, 1, 1, 4, 1, 1, 4]  


In [16]:
df.head()

Unnamed: 0,review,sentiment,cleaned_sentences,dominant_topic
0,One of the other reviewers has mentioned that ...,1,[one reviewers mentioned watching 1 oz episode...,"[1, 1, 3, 3, 3, 3, 3, 1, 1, 2]"
1,A wonderful little production. <br /><br />The...,1,"[wonderful little production, br br filming te...","[0, 0, 1, 0, 0, 0, 0]"
2,I thought this was a wonderful way to spend ti...,1,[thought wonderful way spend time hot summer w...,"[1, 0, 1, 0]"
3,Basically there's a family where a little boy ...,0,[basically family little boy jake thinks zombi...,"[3, 1, 3, 0, 1, 0]"
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,[petter mattei love time money visually stunni...,"[1, 2, 3, 1, 1, 4, 1, 1, 4]"


### Aspect Extraction Refinement

In [17]:
import pandas as pd

# Assuming df has 'cleaned_sentences' and 'dominant_topic' from Step 3
# Step 3's LDA model and dictionary are also available

# Step 1: Inspect LDA topics (re-print for reference)
print("LDA Topics for Reference:")
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

# Step 2: Define keyword-based aspect mapping
# Adjust these based on your LDA topics' output
aspect_keywords = {
    "acting": ["acting", "actor", "actress", "performance", "cast"],
    "plot": ["plot", "story", "narrative", "script", "ending"],
    "cinematography": ["cinematography", "visual", "camera", "shot", "scene"],
    "soundtrack": ["music", "sound", "score", "soundtrack", "audio"],
    "direction": ["director", "direction", "paced", "style", "vision"]
}

# Reverse mapping: topic ID to aspect name (manual step after inspection)
# Example: Adjust this based on your topics
topic_to_aspect = {
    0: "acting",      # If Topic 0 has "acting", "performance", etc.
    1: "plot",        # If Topic 1 has "story", "plot", etc.
    2: "cinematography",
    3: "soundtrack",
    4: "direction",
    -1: "unknown"     # For empty or unassigned sentences
}

# Step 3: Refine aspects using keywords and topic IDs
def refine_aspect(sentence, topic_id):
    # First, check keywords in the sentence for a direct match
    sentence_lower = sentence.lower()
    for aspect, keywords in aspect_keywords.items():
        if any(kw in sentence_lower for kw in keywords):
            return aspect
    # Fallback to topic-based mapping if no keyword match
    return topic_to_aspect.get(topic_id, "unknown")

# Apply refinement to each sentence in each review
df['aspects'] = df.apply(
    lambda row: [refine_aspect(sent, topic) for sent, topic in zip(row['cleaned_sentences'], row['dominant_topic'])],
    axis=1
)

# Check a sample
print("\nSample reviews with refined aspects:")
print(df[['cleaned_sentences', 'dominant_topic', 'aspects']].head())

LDA Topics for Reference:
Topic 0: 0.029*"film" + 0.015*"story" + 0.012*"movie" + 0.012*"characters" + 0.011*"good" + 0.010*"acting" + 0.009*"plot" + 0.008*"scenes" + 0.008*"well" + 0.008*"great"
Topic 1: 0.081*"br" + 0.035*"movie" + 0.019*"film" + 0.016*"one" + 0.014*"like" + 0.013*"would" + 0.010*"see" + 0.009*"time" + 0.009*"really" + 0.008*"could"
Topic 2: 0.006*"one" + 0.005*"war" + 0.005*"new" + 0.005*"years" + 0.005*"world" + 0.004*"friends" + 0.004*"first" + 0.004*"two" + 0.003*"american" + 0.003*"old"
Topic 3: 0.008*"one" + 0.008*"get" + 0.007*"life" + 0.007*"like" + 0.007*"man" + 0.007*"people" + 0.006*"around" + 0.005*"scene" + 0.005*"way" + 0.005*"story"
Topic 4: 0.048*"br" + 0.011*"character" + 0.009*"role" + 0.007*"played" + 0.007*"performance" + 0.006*"love" + 0.006*"plays" + 0.006*"also" + 0.005*"best" + 0.005*"actor"

Sample reviews with refined aspects:
                                   cleaned_sentences  \
0  [one reviewers mentioned watching 1 oz episode...   
1  [

In [18]:
df.head()

Unnamed: 0,review,sentiment,cleaned_sentences,dominant_topic,aspects
0,One of the other reviewers has mentioned that ...,1,[one reviewers mentioned watching 1 oz episode...,"[1, 1, 3, 3, 3, 3, 3, 1, 1, 2]","[plot, cinematography, soundtrack, soundtrack,..."
1,A wonderful little production. <br /><br />The...,1,"[wonderful little production, br br filming te...","[0, 0, 1, 0, 0, 0, 0]","[acting, acting, acting, acting, acting, actin..."
2,I thought this was a wonderful way to spend ti...,1,[thought wonderful way spend time hot summer w...,"[1, 0, 1, 0]","[plot, plot, direction, acting]"
3,Basically there's a family where a little boy ...,0,[basically family little boy jake thinks zombi...,"[3, 1, 3, 0, 1, 0]","[soundtrack, plot, soundtrack, acting, plot, c..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,[petter mattei love time money visually stunni...,"[1, 2, 3, 1, 1, 4, 1, 1, 4]","[cinematography, cinematography, soundtrack, d..."


In [19]:
print(df['review'][5])
print(df['aspects'][5])

Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring. It just never gets old, despite my having seen it some 15 or more times in the last 25 years. Paul Lukas' performance brings tears to my eyes, and Bette Davis, in one of her very few truly sympathetic roles, is a delight. The kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. And the mother's slow awakening to what's happening in the world and under her own roof is believable and startling. If I had a dozen thumbs, they'd all be "up" for this movie.
['plot', 'plot', 'acting', 'plot', 'cinematography', 'cinematography']


## Sentiment Analysis using BERT

In [20]:
df = df.sample(1000) # taking a sample for testing

In [22]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd

# Step 1: Load pre-trained BERT model and tokenizer
# Using a model fine-tuned for sentiment analysis (e.g., from Hugging Face)
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"  # 5-class sentiment model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda") #if torch.cuda.is_available() else "cpu"
model.to(device)

# Step 2: Define sentiment prediction function
def get_sentiment(sentence):
    if not sentence:  # Handle empty sentences
        return "neutral"  # Default for empty
    # Tokenize input
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move to GPU if available
    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    sentiment_id = torch.argmax(logits, dim=1).item()
    # Map to sentiment labels (adjust based on model output)
    sentiment_labels = ["negative", "neutral", "positive"]  # Simplified to 3 classes
    return sentiment_labels[sentiment_id % 3]  # Adjust if model has more classes

# Step 3: Apply sentiment analysis to each sentence in each review
df['sentiment'] = df['cleaned_sentences'].apply(lambda x: [get_sentiment(sent) for sent in x])

# Check a sample
print("\nSample reviews with aspects and sentiments:")
print(df[['cleaned_sentences', 'aspects', 'sentiment']].head())

AssertionError: Torch not compiled with CUDA enabled

In [None]:
df.head()

Unnamed: 0,review,sentiment,cleaned_sentences,dominant_topic,aspects
38392,There isn't much that comes close to the perfe...,"[negative, negative, neutral, negative, negati...",[much comes close storytelling suspenseful lev...,"[0, 1, 1, 1, 0, 1, 2, 4, 1, 1, 0, 0, 1, 1, 1]","[plot, plot, plot, plot, acting, plot, cinemat..."
6908,"I must admit, this is one of my favorite horro...","[neutral, negative, neutral, negative, negativ...","[must admit one favorite horror films time, un...","[1, 3, 0, 0, 1, 4, 1, 0, 4, 1, 3, 3, 1, 0, 0, ...","[plot, soundtrack, soundtrack, soundtrack, plo..."
186,I haven't read the Anne Rice novel that this m...,"[negative, neutral, neutral, neutral, positive...",[read anne rice novel movie based knows maybe ...,"[1, 1, 1, 4, 0, 3, 1, 0, 1, 1, 0, 1, 4, 3, 1, ...","[plot, plot, plot, plot, acting, soundtrack, p..."
49380,This film is one that played very well back in...,"[positive, positive, neutral, negative, neutra...",[film one played well back 1932 probably would...,"[1, 1, 3, 0, 0, 1, 3, 4, 3, 1, 1, 1, 1, 3, 3, ...","[direction, plot, soundtrack, plot, acting, pl..."
29669,A great combination: - Chabat's humor - Uderzo...,[neutral],[great combination chabat humor world characte...,[1],[acting]


## Combine Results

In [None]:
import pandas as pd

# Assuming df is your DataFrame with 1,000 rows, including 'cleaned_sentences', 'aspects', and 'sentiment'

# Step 1: Combine aspects and sentiments into pairs
def combine_aspects_sentiments(row):
    return list(zip(row['aspects'], row['sentiment']))

df['aspect_sentiment'] = df.apply(combine_aspects_sentiments, axis=1)

# Step 2: (Optional) Summarize per review
def summarize_review(aspect_sentiment_pairs):
    summary = {}
    for aspect, sentiment in aspect_sentiment_pairs:
        if aspect not in summary:
            summary[aspect] = {"positive": 0, "neutral": 0, "negative": 0}
        summary[aspect][sentiment] += 1
    return summary

df['summary'] = df['aspect_sentiment'].apply(summarize_review)

# Check a sample
print("\nSample reviews with aspect-sentiment pairs and summaries:")
for idx, row in df[['cleaned_sentences', 'aspect_sentiment', 'summary']].head().iterrows():
    print(f"\nReview {idx}:")
    print(f"Sentences: {row['cleaned_sentences']}")
    print(f"Aspect-Sentiment Pairs: {row['aspect_sentiment']}")
    print(f"Summary: {row['summary']}")


Sample reviews with aspect-sentiment pairs and summaries:

Review 38392:
Sentences: ['much comes close storytelling suspenseful levels goldeneye', 'came greatest game even today stays br br admit game get boring months playing playing two years later thrust back greatest almost playing first time br br 20 levels probably james bond game date', 'probably unforgettable one tank level likely explosive video game sequence time', 'shooting well usage q gadgets james bond fans always dying br br frankly james bond fan look aspects true james bond experience showing ps2 games', 'game great action usable gadgets somewhat expecting little even back also disliked game q moneypenny anyone mi6', 'watching movies bond interacts characters least times throughout movie nowhere seen game', 'vocal dialogue would made game lively rather text dialogue wound using', 'technology', 'use br br probably annoying feature game ways follows story movie precisely ways incoherent', 'example two many levels protec

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim import corpora
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Assuming these are already loaded from your previous steps:
# - lda_model, dictionary (from Step 3)
# - tokenizer, model (from Step 5)
# - aspect_keywords, topic_to_aspect (from Step 4)

stop_words = set(stopwords.words('english'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to process a custom review
def analyze_custom_review(review):
    # Step 2: Preprocess
    sentences = sent_tokenize(review.lower())
    cleaned_sentences = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [word for word in words if word.isalnum() and word not in stop_words]
        if words:
            cleaned_sentences.append(" ".join(words))
    
    # Step 3: Assign topics
    def get_dominant_topic(sentence):
        if not sentence:
            return -1
        bow = dictionary.doc2bow(sentence.split())
        topics = lda_model[bow]
        return max(topics, key=lambda x: x[1])[0] if topics else -1
    
    dominant_topics = [get_dominant_topic(sent) for sent in cleaned_sentences]
    
    # Step 4: Refine aspects
    def refine_aspect(sentence, topic_id):
        sentence_lower = sentence.lower()
        for aspect, keywords in aspect_keywords.items():
            if any(kw in sentence_lower for kw in keywords):
                return aspect
        return topic_to_aspect.get(topic_id, "unknown")
    
    aspects = [refine_aspect(sent, topic) for sent, topic in zip(cleaned_sentences, dominant_topics)]
    
    # Step 5: Predict sentiment
    def get_sentiment(sentence):
        if not sentence:
            return "neutral"
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        sentiment_id = torch.argmax(logits, dim=1).item()
        return ["negative", "neutral", "positive"][sentiment_id % 3]
    
    sentiments = [get_sentiment(sent) for sent in cleaned_sentences]
    
    # Step 6: Combine results
    aspect_sentiment_pairs = list(zip(aspects, sentiments))
    
    return {
        "cleaned_sentences": cleaned_sentences,
        "aspect_sentiment_pairs": aspect_sentiment_pairs
    }

# Test with custom reviews
custom_reviews = [
    "The acting was phenomenal. But the plot was a bit slow and predictable.",
    "Terrible soundtrack. amazing cinematography, and the direction was top-notch!",
    "The movie and the acting was not that bad"
]

print("\nTesting Custom Reviews:")
for i, review in enumerate(custom_reviews, 1):
    result = analyze_custom_review(review)
    print(f"\nCustom Review {i}:")
    print(f"Input: {review}")
    print(f"Cleaned Sentences: {result['cleaned_sentences']}")
    print(f"Aspect-Sentiment Pairs: {result['aspect_sentiment_pairs']}")


Testing Custom Reviews:

Custom Review 1:
Input: The acting was phenomenal. But the plot was a bit slow and predictable.
Cleaned Sentences: ['acting phenomenal', 'plot bit slow predictable']
Aspect-Sentiment Pairs: [('acting', 'neutral'), ('plot', 'neutral')]

Custom Review 2:
Input: Terrible soundtrack. amazing cinematography, and the direction was top-notch!
Cleaned Sentences: ['terrible soundtrack', 'amazing cinematography direction']
Aspect-Sentiment Pairs: [('soundtrack', 'negative'), ('cinematography', 'neutral')]

Custom Review 3:
Input: The movie and the acting was not that bad
Cleaned Sentences: ['movie acting bad']
Aspect-Sentiment Pairs: [('acting', 'negative')]
