In [1]:
import pandas as pd
import os
import glob
import re
from transformers import AlbertTokenizer
import numpy as np

# --- NOTE: This first part is the same as your previous notebook ---
# --- Make sure the base_path is correct! ---
base_path = 'D:/Fake_Review_Detector/op_spam_v1.4/op_spam_v1.4' # Adjust if needed

# Collect every review found under
#   <base_path>/<polarity>/<label_type>/fold*/*.txt
# together with its label: 1 for the deceptive directory, 0 for the truthful one.
reviews = []
labels = []

for label_type in ['deceptive_from_MTurk', 'truthful_from_TripAdvisor']:
    # The label depends only on the directory name, so compute it once per type.
    label = 1 if 'deceptive' in label_type else 0
    for polarity in ['positive_polarity', 'negative_polarity']:
        path = os.path.join(base_path, polarity, label_type)
        # glob returns matches in arbitrary (filesystem-dependent) order.
        # Sorting fixes the row order so the seeded shuffle below yields the
        # same DataFrame on every machine.
        files = sorted(glob.glob(os.path.join(path, 'fold*', '*.txt')))
        for file_path in files:
            with open(file_path, 'r', encoding='utf-8') as f:
                reviews.append(f.read())
            labels.append(label)

# A wrong base_path makes every glob come back empty; fail loudly here instead
# of reporting "Successfully loaded 0 reviews." and breaking later on.
if not reviews:
    raise FileNotFoundError(
        f"No review files found under {base_path!r}. "
        "Check that base_path points at the op_spam_v1.4 dataset directory."
    )

df = pd.DataFrame({'review': reviews, 'label': labels})
# Shuffle the rows with a fixed seed and renumber the index 0..n-1.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("--- Phase 1 Recap ---")
print(f"Successfully loaded {len(df)} reviews.\n")


# --- PHASE 2: DATA PREPROCESSING AND TOKENIZATION ---

print("--- Starting Phase 2: Preprocessing & Tokenization ---\n")

# 1. Text Cleaning
def clean_text(text):
    """
    Normalize a raw review string.

    The text is lowercased, then every character that is neither an ASCII
    letter nor whitespace is deleted. Nothing is inserted in place of a
    removed character, so "don't" becomes "dont" and "a 10 day" becomes
    "a  day" (the surrounding whitespace is left untouched).
    """
    lowered = text.lower()
    # Strip digits, punctuation and any other non-letter, non-whitespace symbol.
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Run clean_text over every review and store the result in a new column,
# leaving the raw 'review' column intact for side-by-side comparison.
print("Cleaning review text...")
df['cleaned_review'] = df['review'].map(clean_text)
print("First 5 cleaned reviews:")
preview = df[['review', 'cleaned_review']].head()
print(preview)
divider = "=" * 50
print("\n" + divider + "\n")


# 2. Tokenization with ALBERT
# The pre-trained 'albert-base-v2' tokenizer is loaded from Hugging Face.
print("Loading ALBERT tokenizer...")
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
print("Tokenizer loaded successfully.\n")

# Tokenize the cleaned reviews in a single call.
# - padding=True: pads with padding tokens so all sequences share one length.
# - truncation=True / max_length=512: cuts reviews longer than 512 tokens
#   (ALBERT's maximum sequence length).
# - return_tensors='np': the result is returned as NumPy arrays.
print("Tokenizing all cleaned reviews (this may take a minute)...")
cleaned_reviews = df['cleaned_review'].tolist()
tokenized_data = tokenizer(
    cleaned_reviews,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='np',
)
print("Tokenization complete!\n")


# The tokenizer output is dictionary-like; pull out the two arrays we use.
input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']

print(f"Shape of input_ids: {input_ids.shape}")
print(f"Shape of attention_mask: {attention_mask.shape}\n")

# Show the first review next to the start of its token-id sequence.
first_cleaned = df['cleaned_review'][0]
first_token_ids = input_ids[0][:20]
print("Example of a tokenized review:")
print(f"Original Text: '{first_cleaned[:100]}...'")
print(f"Token IDs: {first_token_ids}...")
print("\nPhase 2 Complete! We are now ready to build and train our model.")


  from .autonotebook import tqdm as notebook_tqdm


--- Phase 1 Recap ---
Successfully loaded 1200 reviews.

--- Starting Phase 2: Preprocessing & Tokenization ---

Cleaning review text...
First 5 cleaned reviews:
                                              review  \
0  We stayed at the Intercontinental for three ni...   
1  Just got back from a 10 day visit to Chicago. ...   
2  My stay at the Ambassador East Hotel was a ple...   
3  The James Chicago is a stuffy, uninviting hote...   
4  My stay at the Hotel Monaco recently was fanta...   

                                      cleaned_review  
0  we stayed at the intercontinental for three ni...  
1  just got back from a  day visit to chicago for...  
2  my stay at the ambassador east hotel was a ple...  
3  the james chicago is a stuffy uninviting hotel...  
4  my stay at the hotel monaco recently was fanta...  


Loading ALBERT tokenizer...
Tokenizer loaded successfully.

Tokenizing all cleaned reviews (this may take a minute)...
Tokenization complete!

Shape of input_ids: (1200,