# Data Exploration

In [None]:
# Imports

import textwrap

import pandas as pd
import matplotlib.pyplot as plt

from transformers import BertTokenizer

In [None]:
# Read the CSV into a DataFrame; the path is relative to the notebook's
# working directory.
csv_path = '../data/imdb_train.csv'
data = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

In [None]:
# Per-column summary: names, dtypes, non-null counts and memory usage
data.info()

In [None]:
# Label distribution: how many rows carry each distinct `sentiment` value

label_counts = data['sentiment'].value_counts()
label_counts

In [None]:
# Missing values: number of NaN/None entries in each column

missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Identifying duplicate reviews
#
# `duplicated` yields a boolean mask over the rows; with pandas' default
# settings the first occurrence of a review text is not flagged, only its
# later repeats are. Summing the mask counts the flagged rows.

is_repeat = data.duplicated(subset='review')
duplicates = int(is_repeat.sum())

print('Duplicates found:', duplicates)

In [None]:
# Sample review
#
# Draw one random review and wrap it at 100 characters so it is readable in
# the notebook output. The column is selected by name (as every other cell
# does) rather than by position, so this keeps working if the CSV's column
# order changes or an index column is prepended.

random_review = data['review'].sample().iloc[0]
sample_review = textwrap.fill(random_review, width=100)

print(sample_review)

In [None]:
# Review word count + average review length
#
# A "word" here is any whitespace-delimited chunk of the raw review text.

word_count_range = [len(review.split()) for review in data['review']]

total_words = sum(word_count_range)
review_total = len(word_count_range)
average_word_count = total_words / review_total

print(f"Review length (words): {min(word_count_range)} - {max(word_count_range)} words")
print(f"Average review length (words): {average_word_count}")

In [None]:
# Plotting review word counts
#
# Histogram of the per-review word counts in `word_count_range`, drawn on an
# explicit Axes object.

fig, ax = plt.subplots(figsize=(10, 3))
ax.hist(word_count_range, bins=150, edgecolor='black', alpha=0.7)
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Review Word Counts')
ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.show()

In [None]:
# Exploring BERT token length distribution BEFORE data cleaning

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Token count per review. The +2 is presumably for the [CLS] and [SEP]
# special tokens that are added around a sequence when it is encoded for
# BERT; `tokenize` alone does not include them.
token_count_list = [
    len(tokenizer.tokenize(review)) + 2
    for review in data['review']
]

# Split the reviews at the 512-token mark: strictly more than 512 on one
# side, everything else (512 or fewer) on the other.
over_512 = sum(count > 512 for count in token_count_list)
within_512 = len(token_count_list) - over_512

print('Number of reviews with more than 512 tokens:', over_512)
print('Number of reviews with 512 tokens or fewer:', within_512)

In [None]:
# Plotting BERT token length distribution BEFORE data cleaning
#
# Histogram of the per-review token counts in `token_count_list`; the dashed
# red vertical line marks the 512-token threshold used in the counts above.

fig, ax = plt.subplots(figsize=(10, 3))
ax.hist(token_count_list, bins=150, edgecolor='black', color='orange', alpha=0.7)
ax.axvline(x=512, color='red', linestyle='--', linewidth=1.5)
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Token Lengths BEFORE data cleaning')
ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.show()