In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Locate the directory that holds this notebook/script.
# `__file__` only exists when the code runs as a plain Python file; Jupyter
# kernels do not define it, so fall back to the current working directory
# (normally the notebook's own folder when launched from Jupyter).
try:
    _notebook_dir = Path(__file__).resolve().parent
except NameError:
    _notebook_dir = Path.cwd()

# The project root is one level above the notebook directory.
project_root = _notebook_dir.parent
preprocessed_data_path = project_root / 'data' / 'preprocessed' / 'preprocessed_movie_reviews.csv'


In [None]:
# Cell 2: Load the preprocessed data
# Read the preprocessed reviews into `df`. A missing file is reported to the
# reader and leaves `df` as None so that later cells can skip their work.
try:
    df = pd.read_csv(preprocessed_data_path)
except FileNotFoundError:
    df = None
    print(f"Error: The file '{preprocessed_data_path}' was not found.")
    print("Please make sure you have run the preprocessing script and the file exists.")
else:
    # Only reached when the read succeeded: confirm and preview the first rows.
    print("Data loaded successfully!")
    print(df.head())

In [None]:
# Cell 3: Initial Data Inspection
# Skipped entirely when the load step left `df` as None.
if df is not None:
    # Structural summary of the frame, as reported by DataFrame.info().
    print("\n--- Data Information ---")
    df.info()

    # Number of rows per sentiment label.
    print("\n--- Sentiment Label Distribution ---")
    label_counts = df['sentiment_label'].value_counts()
    print(label_counts)

    # Bar chart of the label counts, drawn on an explicit Axes.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x='sentiment_label', ax=ax)
    ax.set_title('Distribution of Sentiment Labels')
    plt.show()

In [None]:
# Cell 4: Analyze text length
# Adds a `text_length` column (whitespace-separated token count per review)
# to `df`, then summarises and plots it per sentiment label.
if df is not None:
    # pandas reads empty CSV fields as NaN. Replace them with '' before the
    # string cast; otherwise astype(str) produces the literal "nan" and an
    # empty review would be counted as one word instead of zero.
    review_text = df['preprocessed_text'].fillna('').astype(str)
    df['text_length'] = review_text.apply(lambda text: len(text.split()))

    print("\n--- Text Length Statistics ---")
    print(df.groupby('sentiment_label')['text_length'].describe())

    # Visualize text length distribution, one overlaid histogram per label.
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='text_length', hue='sentiment_label', kde=True, bins=50)
    plt.title('Distribution of Text Length by Sentiment')
    plt.show()

In [None]:
# Cell 5: Word Cloud Generation
def _reviews_as_text(frame, label):
    """Join all preprocessed reviews with the given sentiment label.

    Missing reviews (NaN) are dropped so the literal token "nan" does not
    end up in the word cloud. The result is stripped, so a group that holds
    only empty reviews yields '' and is skipped by the caller's truthiness
    check.
    """
    reviews = frame.loc[frame['sentiment_label'] == label, 'preprocessed_text']
    return " ".join(reviews.dropna().astype(str)).strip()


def _show_word_cloud(text, title):
    """Build a word cloud from `text`, display it under `title`, and return it."""
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
    return cloud


if df is not None:
    # Separate text for positive and negative reviews
    positive_text = _reviews_as_text(df, 'positive')
    negative_text = _reviews_as_text(df, 'negative')

    print("\n--- Generating Word Clouds... ---")

    # Each cloud is only generated when its group has some text to draw;
    # WordCloud.generate raises ValueError when given no words.
    if positive_text:
        wordcloud_positive = _show_word_cloud(
            positive_text, 'Most Common Words in Positive Reviews'
        )

    if negative_text:
        wordcloud_negative = _show_word_cloud(
            negative_text, 'Most Common Words in Negative Reviews'
        )