In [None]:
import pandas as pd
import os
import glob
import re
import numpy as np
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel
from sklearn.model_selection import train_test_split
import tf_keras

# --- Phase 1: Load Original Data ---
# Reads every review of the op_spam corpus into `reviews` / `labels`
# (1 = deceptive, 0 = truthful) and wraps them in the DataFrame `df`.
print("--- Loading Original Data... ---")
base_path = 'D:/Fake_Review_Detector/op_spam_v1.4/op_spam_v1.4'

reviews = []
labels = []
# The source suffix of the class directory differs per polarity: in
# op_spam_v1.4 the positive truthful reviews are in
# 'truthful_from_TripAdvisor' but the negative ones are in
# 'truthful_from_Web'. Matching '<class>_from_*' picks up both; hard-coding a
# single directory name silently dropped all negative truthful reviews.
for class_prefix, class_label in (('deceptive', 1), ('truthful', 0)):
    for polarity in ['positive_polarity', 'negative_polarity']:
        pattern = os.path.join(
            base_path, polarity, f'{class_prefix}_from_*', 'fold*', '*.txt'
        )
        # glob order is filesystem dependent; sort so that the seeded shuffle
        # and split further down are reproducible across machines.
        files = sorted(glob.glob(pattern))
        if not files:
            # Surface a partially missing corpus instead of training on a
            # skewed dataset without any hint.
            print(f"WARNING: no '{class_prefix}' reviews found for '{polarity}' "
                  f"(pattern: '{pattern}')")
        for file_path in files:
            with open(file_path, 'r', encoding='utf-8') as f:
                reviews.append(f.read())
            labels.append(class_label)

if not reviews:
    raise FileNotFoundError(
        "CRITICAL ERROR: No original review files were found.\n"
        f"Please double-check your 'base_path'. It is currently set to: '{base_path}'"
    )

df = pd.DataFrame({'review': reviews, 'label': labels})

# --- DATA AUGMENTATION STEP ---
# Appends a handful of hand-written reviews, all labelled 0, to `df`
# and then shuffles the combined frame with a fixed seed.
print("--- Augmenting Data with New Genuine Reviews ---")
new_genuine_reviews = [
    "The pool area was fantastic and the kids loved it. The room was a bit smaller than we expected from the photos, but it was very clean and the beds were comfortable. A solid choice for a family trip.",
    "I really enjoyed my stay. The check-in process was smooth and the staff were incredibly professional. My only issue was that the Wi-Fi in my room was very slow and unreliable.",
    "The restaurant downstairs was terrible - overpriced and slow service. However, I have to say the room itself was quiet and comfortable, which is the most important thing. I'd probably stay here again but eat somewhere else.",
    "This is a standard business hotel. It does everything you need it to do efficiently. The location is good for the convention center, the rooms are functional, and the gym is adequate. Nothing special, but very reliable.",
    "We had a few issues during our stay. The key card for our room stopped working twice, and we had to go down to the lobby to get it fixed. The shower also had very low water pressure. It wasn't a terrible stay, but it could have been better.",
]

# One row per extra review; every row carries label 0.
new_data = pd.DataFrame({'review': new_genuine_reviews})
new_data['label'] = 0

df = pd.concat([df, new_data], ignore_index=True)
# --- End of Augmentation ---

# Shuffle the combined dataset (seeded) and renumber the rows from 0.
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)
print(f"Total reviews after augmentation: {len(df)}\n")


# --- Phase 2: Preprocessing ---
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def clean_text(text):
    """Return *text* lower-cased with all non-letter, non-whitespace characters removed.

    Digits and punctuation are deleted outright (not replaced by a space), so
    "check-in" becomes "checkin". Whitespace is left untouched.
    """
    return _NON_LETTER_PATTERN.sub('', text.lower())
# Normalise every review, then encode the whole corpus in a single tokenizer
# call. Each sequence is padded / truncated to exactly 256 tokens and the
# result is requested as NumPy arrays.
df['cleaned_review'] = df['review'].map(clean_text)

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
cleaned_texts = df['cleaned_review'].tolist()
tokenized_data = tokenizer(
    cleaned_texts,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='np',
)

# Pull out the two model inputs and the target vector.
input_ids, attention_mask = tokenized_data['input_ids'], tokenized_data['attention_mask']
labels = df['label'].to_numpy(copy=True)
print("--- Data Preprocessing Complete ---\n")


# --- PHASE 3: Re-training on Augmented Data ---
print("--- Starting Improvement 3: Re-training on Augmented Data ---")
# Hold out 20% of the samples with a fixed seed. `stratify=labels` keeps the
# deceptive/truthful ratio identical in the train and test partitions, so the
# hold-out is representative even when the classes are not perfectly balanced.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

def create_model(max_length=256, lstm_units=64, pretrained_name='albert-base-v2'):
    """Build the (uncompiled) ALBERT + BiLSTM binary classifier.

    The network takes two int32 inputs named 'input_ids' and 'attention_mask',
    runs them through a pretrained ALBERT encoder, feeds the encoder's
    ``last_hidden_state`` into a bidirectional LSTM and ends in a single
    sigmoid unit.

    Args:
        max_length: Length of each input sequence; must equal the
            ``max_length`` the tokenizer padded / truncated to.
        lstm_units: Units per direction of the bidirectional LSTM.
        pretrained_name: Checkpoint identifier handed to
            ``TFAlbertModel.from_pretrained``.

    Returns:
        A ``tf.keras.Model`` with inputs ``[input_ids, attention_mask]`` and
        one sigmoid output per sample.
    """
    # NOTE(review): this file also imports `tf_keras`; the Hugging Face TF
    # classes may be built on that package rather than on `tf.keras`. Confirm
    # both resolve to the same Keras implementation in the target environment.
    input_ids_layer = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='input_ids'
    )
    attention_mask_layer = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='attention_mask'
    )

    # from_pt=True: initialise the TF model from the PyTorch checkpoint.
    albert_model = TFAlbertModel.from_pretrained(pretrained_name, from_pt=True)
    albert_outputs = albert_model(input_ids_layer, attention_mask=attention_mask_layer)
    sequence_output = albert_outputs.last_hidden_state

    # The BiLSTM collapses the per-token sequence into one vector per sample.
    bilstm_layer = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units)
    )(sequence_output)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(bilstm_layer)

    return tf.keras.Model(
        inputs=[input_ids_layer, attention_mask_layer], outputs=output_layer
    )

# Build, compile, train, save and finally score the classifier.
model = create_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])

print("Starting training...")
history = model.fit(
    [X_train_ids, X_train_mask],
    y_train,
    epochs=2,
    batch_size=8, # Using a smaller batch size to prevent memory crashes
    validation_split=0.1
)

# Persist the model before evaluating so a failure during evaluation cannot
# cost the trained weights.
model_save_path = 'fake_review_model_v4.keras'
model.save(model_save_path)

# The 20% hold-out produced by train_test_split was previously never used;
# score the model on it so its quality on unseen reviews is reported.
# With metrics=['accuracy'], evaluate() returns [loss, accuracy].
test_loss, test_accuracy = model.evaluate(
    [X_test_ids, X_test_mask],
    y_test,
    batch_size=8,
)
print(f"Held-out test loss: {test_loss:.4f} | test accuracy: {test_accuracy:.4f}")

print(f"\n--- Improvement Step 3 Complete! New model saved as '{model_save_path}' ---")



  from .autonotebook import tqdm as notebook_tqdm



--- Loading Original Data... ---
--- Augmenting Data with New Genuine Reviews ---
Total reviews after augmentation: 1205

--- Data Preprocessing Complete ---

--- Starting Improvement 3: Re-training on Augmented Data ---



TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is si

Starting training...
Epoch 1/2


