In [None]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk import ngrams
from collections import Counter

In [None]:
DATA_PATH = 'data/IMDBSentimentData.csv'

# Load Data
# header=0 consumes the file's first row as the header, and `names` replaces
# those labels with our own column names.
df = pd.read_csv(DATA_PATH, sep=',', header=0, names=['Review', 'Sentiment'])
display(df.head())

In [None]:
# DataFrame Info: prints each column's dtype and non-null count, plus memory usage
df.info()

## Observations:
- Total 50K reviews.
- No missing values.
- Both the review and sentiment columns are strings (`object` dtype).

In [None]:
# DataFrame Description: summary statistics restricted to the object-dtype (string) columns
df.describe(include='object')

## Observations
- Most reviews are unique. 
- Sentiment has two classes, 'positive' and 'negative' -> binary classification problem.

In [None]:
# Sentiment Distribution: share of each class, expressed as a percentage of all reviews
df['Sentiment'].value_counts(normalize=True).mul(100)

## Observations
- Both sentiment classes have the same share of reviews, so the dataset is balanced.

In [None]:
# Missing values: number of null entries in each column
df.isna().sum()

## Observations
- No missing values.

In [None]:
# Review length, measured in whitespace-separated words.
# The vectorised .str accessor is used so that a missing review yields NaN
# rather than raising inside a per-row Python callback.
df['Review_Length'] = df['Review'].str.split().str.len()
df['Review_Length'].describe()

In [None]:
# Histogram (with KDE overlay) of the per-review word counts
plt.figure(figsize=(8, 4))
sns.histplot(df['Review_Length'], bins=50, kde=True)
plt.title('Review Length Distribution')
plt.xlabel('Review length (words)')
plt.ylabel('Number of reviews')
# Render explicitly so the cell does not echo the repr of the last call.
plt.show()

## Observations
- Most reviews are short to medium in length (around 173–280 words), but a few very long reviews (up to 2,470 words) pull the mean above the median.

- This indicates the data is right-skewed (a long tail of very lengthy reviews).

In [None]:
# Box plot of review length for each sentiment class.
# Start a fresh figure so the plot never lands on axes left over from a
# previous cell when run outside the inline backend.
plt.figure()
sns.boxplot(x='Sentiment', y='Review_Length', data=df)
plt.title('Length by Sentiment')
plt.ylabel('Review length (words)')
# Render explicitly so the cell does not echo the repr of the last call.
plt.show()

## Observations:
- Positive reviews contain more of the outliers than negative reviews. It could be that people who liked the movie wrote about it at greater length.

- Taking all observations together, a maximum length of 128 tokens could be a reasonable sweet spot (note that reviews longer than this will be truncated).

In [None]:
# Word Clouds (inform: common words suggest BoW/TF-IDF for ML)
# Concatenate every review of each class into one long string.
# `positive` and `negative` are reused by the n-gram cells further down.
positive = ' '.join(df.loc[df['Sentiment'] == 'positive', 'Review'])
negative = ' '.join(df.loc[df['Sentiment'] == 'negative', 'Review'])

# WordCloud places words randomly; a fixed seed keeps the layout identical
# across Restart & Run All.
WORDCLOUD_SEED = 42

# Generate word clouds
wc_pos = WordCloud(
    width=800, height=400, background_color='white',
    random_state=WORDCLOUD_SEED,
).generate(positive)
wc_neg = WordCloud(
    width=800, height=400, background_color='black', colormap='Reds',
    random_state=WORDCLOUD_SEED,
).generate(negative)

# Plot side by side: one figure, one panel per sentiment class
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
panels = (
    (wc_pos, "Positive Reviews"),
    (wc_neg, "Negative Reviews"),
)
for ax, (cloud, title) in zip(axes, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(title)

plt.show()

In [None]:
# N-grams (inform: sequences suggest RNN/LSTM for context)
def get_ngrams(text, n=2, top_k=10):
    """Return the `top_k` most frequent word n-grams found in `text`.

    The text is tokenised on whitespace and every run of `n` consecutive
    tokens is counted. The result is a list of `(ngram_tuple, count)` pairs
    ordered from most to least frequent.
    """
    tokens = text.split()
    frequencies = Counter(ngrams(tokens, n))
    return frequencies.most_common(top_k)

# Most frequent bigrams across all positive reviews
get_ngrams(positive)

In [None]:
# Most frequent bigrams across all negative reviews (get_ngrams defaults: n=2, top_k=10)
get_ngrams(negative)

## Observations:
- Common stop words dominate, so TF-IDF should work well for ML baselines.
- N-grams capture context, which suggests an LSTM could work better than simple ML models.