In [1]:
import pandas as pd
import os
import glob
import re
from transformers import AlbertTokenizer, TFAlbertModel
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
import tf_keras # Ensure tf_keras is imported

print("--- Script Started ---")

# --- Phase 1 & 2: Load and Preprocess Data ---
# Reads every review file under base_path, labels it (1 = deceptive,
# 0 = truthful), strips non-letter characters, and tokenizes the result for
# ALBERT. On success this section leaves `df`, `tokenizer`, `input_ids`,
# `attention_mask` and `labels` defined for the rest of the script.
print("--- Loading and Preprocessing Data... ---")
try:
    # Make sure this path is correct for your D: drive
    base_path = 'D:/Fake_Review_Detector/op_spam_v1.4/op_spam_v1.4'

    reviews = []
    labels = []
    # Walk <base_path>/<polarity>/<label>_from_<source>/fold*/*.txt.
    # The "<source>" suffix is matched with a wildcard instead of being
    # hard-coded: a single fixed folder name (e.g. 'truthful_from_TripAdvisor')
    # does not necessarily exist under both polarities, and a missing folder
    # silently drops a whole class/polarity slice from the training data.
    # NOTE(review): the published op_spam_v1.4 layout appears to name the
    # negative-polarity truthful folder 'truthful_from_Web' - confirm locally.
    for polarity in ['positive_polarity', 'negative_polarity']:
        for label_prefix, label_value in [('deceptive', 1), ('truthful', 0)]:
            pattern = os.path.join(base_path, polarity, f'{label_prefix}_from_*')
            source_dirs = sorted(d for d in glob.glob(pattern) if os.path.isdir(d))
            if not source_dirs:
                print(f"WARNING: Directory not found, skipping: {pattern}")
                continue
            for source_dir in source_dirs:
                # glob returns files in filesystem order; sorting makes the
                # seeded shuffle below reproducible across machines.
                files = sorted(glob.glob(os.path.join(source_dir, 'fold*', '*.txt')))
                for file_path in files:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        reviews.append(f.read())
                    labels.append(label_value)

    if not reviews:
        raise FileNotFoundError(f"No review files were found. Please check your base_path: '{base_path}'")

    df = pd.DataFrame({'review': reviews, 'label': labels})
    # Shuffle once with a fixed seed so that classes are interleaved.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Successfully loaded {len(df)} reviews.")

    # Clean text
    def clean_text(text):
        """Lower-case `text` and drop every character that is not an ASCII letter or whitespace."""
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text
    df['cleaned_review'] = df['review'].apply(clean_text)
    print("Text cleaning complete.")

    # Tokenize text
    print("Loading tokenizer...")
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    print("Tokenizing text (this may take a moment)...")
    # Every review is padded/truncated to exactly 256 tokens and returned as
    # NumPy arrays, which is the fixed input length the model below expects.
    tokenized_data = tokenizer(
        df['cleaned_review'].tolist(),
        padding='max_length',
        truncation=True,
        return_tensors='np',
        max_length=256
    )
    input_ids = tokenized_data['input_ids']
    attention_mask = tokenized_data['attention_mask']
    labels = np.array(df['label'].values)
    print("--- Data Loading and Preprocessing Complete ---\n")

except Exception as e:
    print("\n--- AN ERROR OCCURRED DURING DATA PREPARATION ---")
    print(e)
    # Stop the script if data loading fails. SystemExit is raised explicitly
    # (instead of calling the interactive `exit()` helper injected by `site`)
    # so the stop is immediate, reports a non-zero status, and keeps the
    # original exception attached as the cause.
    raise SystemExit(1) from e


# --- PHASE 3: MODEL BUILDING AND TRAINING ---
print("--- Starting Phase 3: Model Building & Training ---\n")

# 1. Split the data
# 80/20 train/test split with a fixed seed. The three arrays are split
# together so each row's ids, mask and label stay aligned. `stratify=labels`
# keeps the deceptive/truthful ratio the same in both partitions, which
# matters on a dataset this small where a plain random split can skew it.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, labels,
    test_size=0.2, random_state=42, stratify=labels
)

# 2. Build the Model
def create_model(max_length=256, lstm_units=64):
    """Build the (uncompiled) ALBERT + BiLSTM binary classifier.

    The model takes two int32 inputs named 'input_ids' and 'attention_mask',
    runs them through the pretrained 'albert-base-v2' encoder, feeds the
    per-token hidden states to a bidirectional LSTM, and ends in a single
    sigmoid unit.

    Args:
        max_length: Number of tokens per example. Must equal the `max_length`
            used when tokenizing the data (256 by default).
        lstm_units: Hidden size of each LSTM direction (64 by default).

    Returns:
        A Keras functional model with inputs [input_ids, attention_mask] and
        one sigmoid output per example.
    """
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    # We load the PyTorch weights and convert them, which requires torch to be installed
    albert_model = TFAlbertModel.from_pretrained('albert-base-v2', from_pt=True)
    albert_outputs = albert_model(input_ids_layer, attention_mask=attention_mask_layer)
    # Per-token hidden states (not the pooled output) feed the recurrent layer.
    sequence_output = albert_outputs.last_hidden_state

    # NOTE(review): the attention mask is not forwarded to the LSTM, so padded
    # positions are processed like real tokens - confirm this is intended.
    bilstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units))(sequence_output)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(bilstm_layer)

    model = tf.keras.Model(inputs=[input_ids_layer, attention_mask_layer], outputs=output_layer)
    return model

# Instantiate the network defined by create_model().
print("Creating the ALBERT-BiLSTM model...")
model = create_model()
print("Model created successfully.\n")

# 3. Compile the Model
# Binary cross-entropy matches the single sigmoid output; accuracy is the
# only metric tracked during training.
print("Compiling the model...")
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
)
print("Model compiled successfully.\n")

# 4. Train the Model
# Three epochs in batches of 16; a tenth of the training rows is held back
# by Keras for per-epoch validation.
print("Starting model training (this will take several minutes)...")
history = model.fit(
    x=[X_train_ids, X_train_mask],
    y=y_train,
    batch_size=16,
    epochs=3,
    validation_split=0.1,
)
print("\nModel training complete!\n")

# 5. Save the Trained Model
# Written to the current working directory.
print("Saving the trained model to 'fake_review_model.keras'...")
model.save('fake_review_model.keras')
print("Model saved successfully.")
print("\n--- Phase 3 Complete! You can now run the web application. ---")


  from .autonotebook import tqdm as notebook_tqdm



--- Script Started ---
--- Loading and Preprocessing Data... ---
Successfully loaded 1200 reviews.
Text cleaning complete.
Loading tokenizer...
Tokenizing text (this may take a moment)...
--- Data Loading and Preprocessing Complete ---

--- Starting Phase 3: Model Building & Training ---

Creating the ALBERT-BiLSTM model...



TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.dense.bias', 'predictions.decoder.bias', 'predictions.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
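If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.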

Model created successfully.

Compiling the model...
Model compiled successfully.

Starting model training (this will take several minutes)...
Epoch 1/3


Epoch 2/3
Epoch 3/3

Model training complete!

Saving the trained model to 'fake_review_model.keras'...




Model saved successfully.

--- Phase 3 Complete! You can now run the web application. ---
