In [1]:
import pandas as pd
import os
import glob
import re
import numpy as np
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel
from sklearn.model_selection import train_test_split
import tf_keras

# --- Phase 1 & 2 Recap: Load and Preprocess Data ---
# This ensures this notebook can run independently.
print("--- Loading and Reprocessing Data ---\n")
# --- IMPORTANT: Update this path if needed! ---
# Root of the extracted corpus. The loader below expects the layout
#   <base_path>/<polarity>/<label>_from_<source>/fold*/*.txt
base_path = 'D:/Fake_Review_Detector/op_spam_v1.4/op_spam_v1.4'

# Load data: one review per .txt file; label 1 = deceptive, 0 = truthful.
reviews = []
labels = []
for label_prefix, label in [('deceptive', 1), ('truthful', 0)]:
    for polarity in ['positive_polarity', 'negative_polarity']:
        # Match on "<label>_from_*" instead of one fixed directory name: the
        # source suffix is not the same under every polarity.
        # NOTE(review): in the published op_spam_v1.4 layout the negative
        # truthful reviews appear to live in "truthful_from_Web", which a
        # hard-coded "truthful_from_TripAdvisor" silently misses -- confirm
        # against the local copy of the corpus.
        pattern = os.path.join(
            base_path, polarity, f'{label_prefix}_from_*', 'fold*', '*.txt'
        )
        # glob returns matches in arbitrary, filesystem-dependent order; sort
        # them so the seeded shuffle below yields the same rows everywhere.
        files = sorted(glob.glob(pattern))
        if not files:
            # An empty group would otherwise go unnoticed and skew the class
            # balance of everything trained on this DataFrame.
            raise FileNotFoundError(
                f"No review files matched {pattern!r}; "
                "check base_path and the corpus directory layout."
            )
        for file_path in files:
            with open(file_path, 'r', encoding='utf-8') as f:
                reviews.append(f.read())
            labels.append(label)

df = pd.DataFrame({'review': reviews, 'label': labels})
# Shuffle the rows with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Clean and Tokenize the data...
def clean_text(text):
    """Lower-case ``text`` and strip everything except ASCII letters and whitespace.

    Digits, punctuation and non-ASCII characters are deleted outright (they
    are not replaced by a space); whitespace characters are kept as they are.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)
# Normalise every raw review, then encode the whole corpus in a single
# tokenizer call.
df['cleaned_review'] = df['review'].apply(clean_text)
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tokenized_data = tokenizer(
    df['cleaned_review'].tolist(),
    max_length=256,        # every review is truncated / padded to 256 ids
    truncation=True,
    padding='max_length',
    return_tensors='np',
)
input_ids, attention_mask = (
    tokenized_data['input_ids'],
    tokenized_data['attention_mask'],
)
labels = np.array(df['label'].values)

# Create train and test splits: 80 % / 20 %, seeded; the three arrays are
# split together so ids, masks and labels stay row-aligned.
(
    X_train_ids, X_test_ids,
    X_train_mask, X_test_mask,
    y_train, y_test,
) = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
)
print("--- Data Ready ---\n")


# --- PHASE 3 (REVISED): Re-build and Re-train the Model ---
print("--- Starting Improvement 1: Re-training with 2 Epochs ---")

def create_model(max_length=256, albert_checkpoint='albert-base-v2', lstm_units=64):
    """Build the (uncompiled) ALBERT + BiLSTM binary classifier.

    Args:
        max_length: Number of token ids per example. Must equal the length
            the tokenizer pads/truncates to. Defaults to 256.
        albert_checkpoint: Hugging Face checkpoint name or path passed to
            ``TFAlbertModel.from_pretrained``. Defaults to 'albert-base-v2'.
        lstm_units: Units per direction of the bidirectional LSTM.
            Defaults to 64.

    Returns:
        A ``tf.keras.Model`` taking ``[input_ids, attention_mask]`` (both
        int32, shape ``(batch, max_length)``) and producing one sigmoid
        output per example.
    """
    # Define the two input layers for our tokenized data. The layer names
    # match the keys of the tokenizer output.
    input_ids_layer = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='input_ids'
    )
    attention_mask_layer = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='attention_mask'
    )

    # ALBERT layer from Hugging Face; from_pt=True loads the PyTorch
    # checkpoint weights into the TF model.
    albert_model = TFAlbertModel.from_pretrained(albert_checkpoint, from_pt=True)
    albert_outputs = albert_model(input_ids_layer, attention_mask=attention_mask_layer)

    # Per-token hidden states (not the pooled output) feed the LSTM.
    sequence_output = albert_outputs.last_hidden_state

    # A Bi-directional LSTM layer to understand the sequence of words.
    # NOTE(review): no mask is passed here, so padded positions are processed
    # like real tokens; consider forwarding the attention mask -- confirm
    # the effect on accuracy before changing.
    lstm_layer = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units)
    )(sequence_output)

    # A final dense layer for classification: a single sigmoid unit.
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(lstm_layer)

    model = tf.keras.Model(
        inputs=[input_ids_layer, attention_mask_layer], outputs=output_layer
    )
    return model

# Build the network, then attach optimiser, loss and metric.
model = create_model()
adam = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

print("Model created and compiled successfully.\n")
print("Starting training for 2 epochs...")

# Train the model; 10 % of the training arrays is held out for validation.
train_inputs = [X_train_ids, X_train_mask]
history = model.fit(
    train_inputs,
    y_train,
    batch_size=16,
    epochs=2,  # *** THE ONLY CHANGE IS HERE: 2 epochs instead of 3 ***
    validation_split=0.1,
)

# Save under a new file name so the earlier model file is not overwritten.
model_save_path = 'fake_review_model_v2.keras'
model.save(model_save_path)

print(f"\n--- Improvement Step 1 Complete! ---")
print(f"New model saved as '{model_save_path}'")


  from .autonotebook import tqdm as notebook_tqdm



--- Loading and Reprocessing Data ---

--- Data Ready ---

--- Starting Improvement 1: Re-training with 2 Epochs ---



TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.bias']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
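If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.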

Model created and compiled successfully.

Starting training for 2 epochs...
Epoch 1/2


Epoch 2/2





--- Improvement Step 1 Complete! ---
New model saved as 'fake_review_model_v2.keras'
