# Data Exploration

In [None]:
# Imports

import textwrap

import pandas as pd
import matplotlib.pyplot as plt

from transformers import BertTokenizer, DistilBertTokenizer, RobertaTokenizer

In [None]:
# Read the training CSV into a DataFrame (the relative path is resolved against the
# kernel's working directory; the file name suggests the IMDB training split)
data = pd.read_csv('../data/imdb_train.csv')

# Show the first rows as a quick look at the columns and their values
data.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage
data.info()

In [None]:
# Label distribution: number of rows per distinct sentiment value

data['sentiment'].value_counts()

In [None]:
# Missing values: count of NaN / None entries in each column

missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Identifying duplicate reviews

# duplicated() flags each row that repeats an earlier row across all columns;
# summing the boolean flags gives the number of such repeats
is_repeat = data.duplicated()
duplicates = int(is_repeat.sum())

print('Duplicates found:', duplicates)

In [None]:
# Sample review: draw one random row and print its first column wrapped at 100 characters

random_row = data.sample(n = 1)
raw_text = random_row.iloc[0, 0]  # first column of the single sampled row
sample_review = textwrap.fill(raw_text, width = 100)

print(sample_review)

In [None]:
# Review word count + average review length
# (a "word" here is any whitespace-delimited chunk of the review text)

word_count_range = list(map(lambda text: len(text.split()), data['review']))
total_words = sum(word_count_range)
average_word_count = total_words / len(word_count_range)

print(f"Review length (words): {min(word_count_range)} - {max(word_count_range)} words")
print(f"Average review length (words): {average_word_count}")

In [None]:
# Plotting review word counts as a 150-bin histogram

fig, ax = plt.subplots(figsize = (10, 3))
ax.hist(word_count_range, bins = 150, edgecolor = 'black', alpha = 0.7)
ax.set(
    xlabel = 'Word Count',
    ylabel = 'Frequency',
    title = 'Distribution of Review Word Counts',
)
ax.grid(axis = 'y', linestyle = '--', alpha = 0.6)  # horizontal guide lines only

plt.show()

In [None]:
# Exploring DistilBERT, BERT and RoBERTa token length distribution BEFORE data cleaning

MAX_TOKENS = 512     # cut-off the per-review token counts are checked against
SPECIAL_TOKENS = 2   # tokenize() leaves out the two special tokens ([CLS]/[SEP], <s>/</s>) added at encoding time

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def count_tokens(tokenizer, texts):
    """Return one token count per text, special tokens included.

    Args:
        tokenizer: object exposing ``tokenize(str)`` that returns a list of tokens.
        texts: iterable of strings to tokenize.

    Returns:
        List of ints, in the same order as ``texts``.
    """
    return [len(tokenizer.tokenize(text)) + SPECIAL_TOKENS for text in texts]


def count_over_limit(token_counts, limit = MAX_TOKENS):
    """Return how many entries of ``token_counts`` are strictly greater than ``limit``."""
    return sum(count > limit for count in token_counts)


# One pass over the reviews per tokenizer
bert_token_count_list = count_tokens(bert_tokenizer, data['review'])
distilbert_token_count_list = count_tokens(distilbert_tokenizer, data['review'])
roberta_token_count_list = count_tokens(roberta_tokenizer, data['review'])

# Counting number of reviews with more than 512 tokens for each tokenizer
bert_over_512 = count_over_limit(bert_token_count_list)
distilbert_over_512 = count_over_limit(distilbert_token_count_list)
roberta_over_512 = count_over_limit(roberta_token_count_list)

# Surface the counts so the cell actually reports what it computed
for model_name, over_limit in (
    ('BERT', bert_over_512),
    ('DistilBERT', distilbert_over_512),
    ('RoBERTa', roberta_over_512),
):
    print(f'{model_name}: {over_limit} of {len(data)} reviews exceed {MAX_TOKENS} tokens')

In [None]:
# Plotting BERT token length distribution BEFORE data cleaning

fig, ax = plt.subplots(figsize = (10, 3))
ax.hist(bert_token_count_list, bins = 150, edgecolor = 'black', color = 'orange', alpha = 0.7)
ax.axvline(x = 512, color = 'red', linestyle = '--', linewidth = 1.5)  # vertical marker at the 512-token cut-off
ax.set(
    xlabel = 'Number of Tokens',
    ylabel = 'Frequency',
    title = 'Token Length Cut-off for BERT before Data Cleaning',
)
ax.grid(axis = 'y', linestyle = '--', alpha = 0.6)

plt.show()

In [None]:
# Plotting token length distribution for DistilBERT, BERT, and RoBERTa for comparison
#
# One panel per tokenizer on a shared, real token-count axis. Shifting the BERT and
# RoBERTa counts by +500 / +1000 to pull the histograms apart made the x-axis
# meaningless (hence the hidden ticks) and still left the long tails overlapping.

tokenizer_series = [
    ('DistilBERT', distilbert_token_count_list, 'blue'),
    ('BERT', bert_token_count_list, 'orange'),
    ('RoBERTa', roberta_token_count_list, 'green'),
]

# Identical bin edges in every panel, so bars line up and heights are directly comparable
all_token_counts = [count for _, counts, _ in tokenizer_series for count in counts]
shared_range = (min(all_token_counts), max(all_token_counts)) if all_token_counts else None

fig, axes = plt.subplots(len(tokenizer_series), 1, figsize = (10, 6), sharex = True, sharey = True)

for ax, (model_name, counts, colour) in zip(axes, tokenizer_series):
    ax.hist(counts, bins = 150, range = shared_range, alpha = 0.5,
            edgecolor = 'black', color = colour, label = model_name, linewidth = 0.5)
    ax.axvline(x = 512, color = 'red', linestyle = '--', linewidth = 1.5)  # same 512-token marker as the BERT plot
    ax.set_ylabel('Frequency')
    ax.grid(axis = 'y', linestyle = '--', alpha = 0.6)
    ax.legend()

# With sharex only the bottom panel shows tick labels, so label the axis there
axes[-1].set_xlabel('Number of Tokens')
fig.suptitle('Tokenizer Comparison for DistilBERT, BERT, and RoBERTa')
fig.tight_layout()

plt.show()