In [4]:
import pandas as pd
import os
import glob
import re
import numpy as np
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel
from sklearn.model_selection import train_test_split
import tf_keras

# --- Phase 1: Load Original Data ---
print("--- Loading Original Data... ---")
# Root of the extracted corpus. It must be the directory that directly contains
# the 'positive_polarity' and 'negative_polarity' folders (the archive appears to
# unpack into a nested folder of the same name, hence the repeated path segment).
base_path = 'D:/Fake_Review_Detector/op_spam_v1.4/op_spam_v1.4'

reviews = []
labels = []
for label_type in ['deceptive_from_MTurk', 'truthful_from_TripAdvisor']:
    # 1 = deceptive, 0 = truthful; the label is constant for a whole source folder.
    label = 1 if 'deceptive' in label_type else 0
    for polarity in ['positive_polarity', 'negative_polarity']:
        path = os.path.join(base_path, polarity, label_type)
        # glob yields matches in arbitrary, filesystem-dependent order. Sorting
        # makes the row order deterministic, so the seeded shuffle and the seeded
        # train/test split later in this file are reproducible across machines.
        files = sorted(glob.glob(os.path.join(path, 'fold*', '*.txt')))
        for file_path in files:
            with open(file_path, 'r', encoding='utf-8') as f:
                reviews.append(f.read())
            labels.append(label)

# --- Robustness Check ---
# Fail fast with an actionable message instead of training on an empty frame.
if not reviews:
    raise FileNotFoundError(
        f"CRITICAL ERROR: No original review files were found.\n"
        f"Please double-check your 'base_path'. It is currently set to: '{base_path}'\n"
        f"The correct path MUST point to the folder that contains the 'positive_polarity' and 'negative_polarity' folders."
    )
# --- End of Check ---

# One row per review file: raw text plus its 0/1 label.
df = pd.DataFrame({'review': reviews, 'label': labels})

# --- DATA AUGMENTATION STEP ---
# Hand-written hotel reviews used as additional training examples. The list is
# grouped by tone and length (see the inline headings); the strings are data,
# so they are kept verbatim.
print("--- Augmenting Data with a Large, Diverse Set of Genuine Reviews ---")
new_genuine_reviews = [
    # Positive & Detailed
    "The room was spacious and the view of the city was breathtaking. The staff went above and beyond to make our anniversary special. Absolutely worth the price.",
    "From the moment we checked in, the service was impeccable. The concierge gave us fantastic dinner recommendations. The bed was one of the most comfortable I've ever slept in.",
    # Balanced
    "The pool area was fantastic and the kids loved it. The room was a bit smaller than we expected, but it was very clean. A solid choice for a a family trip.",
    "Location is unbeatable, right in the heart of everything. The downside is that it can be a bit noisy at night. The room itself was modern and well-maintained.",
    # Negative & Detailed (The kind our model gets wrong)
    "A complete disaster. Our flight was delayed and we arrived late, but the front desk had given our room away. We had to wait an hour for them to find us another, smaller room. No apology was offered.",
    "Do not stay here. The pictures online are completely misleading. The carpet was stained, there was mold in the shower, and the whole place smelled like smoke. We checked out after one night.",
    # Short & Simple
    "Great place, very clean.",
    "It was okay. Nothing special.",
    "Wouldn't recommend.",
    "Perfect for a quick business trip.",
    # More emotional but genuine reviews
    "I was so excited for this stay and it was a huge letdown. The service was so slow it felt like they forgot we were there. Ruined our weekend.",
    "This was the best hotel experience I have ever had! I felt like royalty. The spa was heavenly and the food was divine. I cannot wait to come back!",
    "Just a warning to others: the hidden 'resort fee' is a scam. It added an extra $50 per night to our bill for amenities we never even used. Very deceptive.",
    "The staff here are the kindest people you will ever meet. My husband fell ill and they were so helpful and compassionate. They truly cared. I am so grateful.",
    # Neutral/Descriptive
    "The hotel is located about a 15-minute walk from the main train station. The room had a desk, a small fridge, and a safe. The bathroom was functional.",
    "Check-in is at 3 PM. They have an airport shuttle that runs every hour. The on-site restaurant serves breakfast and dinner.",
    "This is a standard business hotel. It does everything you need it to. The location is good for the convention center, the rooms are functional, and the gym is adequate.",
    "We had a few issues. The key card stopped working twice. The shower had low water pressure. It wasn't a terrible stay, but it could have been better.",
    "The view from our balcony was incredible. We could see the entire coastline. The room itself was a bit dated, but that view made up for everything.",
    "Overall a positive experience. The hotel is pet-friendly which was a huge plus for us. They even provided a water bowl for our dog."
]
# Put the extra reviews in a frame with the same two columns as `df`; every one
# of them is labelled 0 (genuine).
genuine_labels = [0 for _ in new_genuine_reviews]
new_data = pd.DataFrame({'review': new_genuine_reviews, 'label': genuine_labels})
df = pd.concat((df, new_data), ignore_index=True)
# --- End of Augmentation ---

# Shuffle every row with a fixed seed, then renumber the index from zero.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)
print(f"Total reviews after large augmentation: {len(df)}\n")


# --- Phase 2: Preprocessing ---
def clean_text(text):
    """Return *text* lower-cased with every non-letter, non-whitespace character removed.

    Only ASCII letters and whitespace survive; digits and punctuation are
    deleted outright (not replaced by a space), so surrounding whitespace is
    left exactly as it was.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)
# Normalise every review, then encode the whole corpus in one tokenizer call.
df['cleaned_review'] = df['review'].map(clean_text)
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
cleaned_texts = df['cleaned_review'].tolist()
# Every sequence is padded or truncated to exactly 256 tokens and returned as
# NumPy arrays.
tokenized_data = tokenizer(
    cleaned_texts,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='np',
)
input_ids, attention_mask = tokenized_data['input_ids'], tokenized_data['attention_mask']
# From here on `labels` is a NumPy array (a copy of the label column), replacing
# the Python list built while loading the files.
labels = df['label'].to_numpy(copy=True)
print("--- Data Preprocessing Complete ---\n")


# --- PHASE 3: Re-training on the BEST Dataset ---
print("--- Starting Final Improvement: Re-training on Augmented Data ---")
# Hold out 20% of the rows. Token ids, attention masks and labels are split in
# one call so the three arrays stay row-aligned; the seed fixes the partition.
split_arrays = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
)
(X_train_ids, X_test_ids,
 X_train_mask, X_test_mask,
 y_train, y_test) = split_arrays

def create_model(max_length=256, lstm_units=64):
    """Build the ALBERT + BiLSTM binary classifier (uncompiled).

    The model takes two int32 inputs named 'input_ids' and 'attention_mask',
    runs them through the pretrained 'albert-base-v2' encoder, summarises the
    per-token hidden states with a bidirectional LSTM and ends in a single
    sigmoid unit.

    Args:
        max_length: Token length of both inputs. Must equal the `max_length`
            the tokenizer pads/truncates to. Defaults to 256.
        lstm_units: Units per LSTM direction. Defaults to 64.

    Returns:
        A `tf.keras.Model` mapping `[input_ids, attention_mask]` to one
        sigmoid output per example.
    """
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
    albert_model = TFAlbertModel.from_pretrained('albert-base-v2', from_pt=True)
    albert_outputs = albert_model(input_ids_layer, attention_mask=attention_mask_layer)
    sequence_output = albert_outputs.last_hidden_state
    # Inputs are padded to `max_length`, so without a mask the forward LSTM's
    # final state would be computed after a long run of padding positions.
    # Passing the attention mask makes the recurrent layer skip those steps.
    padding_mask = tf.cast(attention_mask_layer, tf.bool)
    bilstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units))(
        sequence_output, mask=padding_mask
    )
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(bilstm_layer)
    model = tf.keras.Model(inputs=[input_ids_layer, attention_mask_layer], outputs=output_layer)
    return model

# Build and compile the classifier: binary cross-entropy on the sigmoid output,
# Adam with a small learning rate for fine-tuning.
model = create_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])

print("Starting training on the best dataset...")
# validation_split carves the validation set out of the training arrays only;
# the test arrays are not seen during fitting.
history = model.fit(
    [X_train_ids, X_train_mask],
    y_train,
    epochs=2,
    batch_size=8, # Using a small batch size for stability
    validation_split=0.1
)

# Save before evaluating so a failure during evaluation cannot lose the
# trained weights.
model_save_path = 'fake_review_model_final.keras'
model.save(model_save_path)
print(f"\n--- Final Improvement Complete! New model saved as '{model_save_path}' ---")

# Score the model on the held-out test split, which is otherwise never used, so
# the run reports a number that was not involved in fitting or validation.
test_metrics = model.evaluate(
    [X_test_ids, X_test_mask],
    y_test,
    batch_size=8,
    return_dict=True,
)
print("Held-out test metrics: " + ", ".join(
    f"{metric_name}={metric_value:.4f}" for metric_name, metric_value in test_metrics.items()
))



--- Loading Original Data... ---
--- Augmenting Data with a Large, Diverse Set of Genuine Reviews ---
Total reviews after large augmentation: 1220

--- Data Preprocessing Complete ---

--- Starting Final Improvement: Re-training on Augmented Data ---



TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.decoder.weight', 'predictions.dense.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
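If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.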

Starting training on the best dataset...
Epoch 1/2


Epoch 2/2





--- Final Improvement Complete! New model saved as 'fake_review_model_final.keras' ---
