### Clean and normalize data, train/test split, tokenize review text

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Step 1: Load and clean data ---
df = pd.read_csv('fake_reviews_dataset.csv')  # Replace with actual file name/path

# Keep only rows whose review text is present and not blank.
# The filtered frame is copied explicitly so that the column assignments below
# write to an independent DataFrame rather than to a slice of the raw frame
# (this is what avoids pandas' SettingWithCopyWarning).
df = df.dropna(subset=['text_'])
df = df[df['text_'].str.strip().astype(bool)].copy()

# --- Step 2: Encode label (0 = fake, 1 = real) ---
label_map = {'CG': 0, 'OR': 1}

# Series.map() turns every value that is not a key of the mapping into NaN.
# NaN targets would break the stratified split and the loss, so any
# unexpected or missing label is reported here instead of passing through.
unmapped = df.loc[~df['label'].isin(list(label_map)), 'label']
if not unmapped.empty:
    raise ValueError(
        f"Unexpected label values (expected one of {sorted(label_map)}): "
        f"{unmapped.unique().tolist()}"
    )
df['label'] = df['label'].map(label_map)

# --- Step 3: Normalize rating by the maximum score ---
# NOTE(review): dividing by 5.0 assumes a 5-point rating scale; if the minimum
# rating is 1 the result spans [0.2, 1], not [0, 1] — confirm against the data.
df['rating'] = df['rating'].astype(float)
df['rating_norm'] = df['rating'] / 5.0

# --- Step 4: Encode category as integers ---
# The encoder is fitted on the full (pre-split) frame, so every category that
# appears in either split receives an integer id.
category_encoder = LabelEncoder()
df['category_encoded'] = category_encoder.fit_transform(df['category'])

# --- Step 5: Train-test split ---
# Pull the three feature columns and the target out of the cleaned frame.
X_text, X_rating, X_category, y = (
    df[column] for column in ('text_', 'rating_norm', 'category_encoded', 'label')
)

# A single call splits all four series with the same row assignment:
# 20% of rows go to the test side, stratified on the label, with a fixed seed.
split_parts = train_test_split(
    X_text,
    X_rating,
    X_category,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# train_test_split returns (train, test) pairs in the order the inputs were given.
(
    X_text_train, X_text_test,
    X_rating_train, X_rating_test,
    X_cat_train, X_cat_test,
    y_train, y_test,
) = split_parts

# --- Step 6: Tokenize and pad text ---
vocab_size = 10000
max_len = 100

# The vocabulary is fitted on the training reviews only; the tokenizer is
# configured with '<OOV>' as its out-of-vocabulary token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_text_train)

# Convert each review into a list of integer token ids (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_text_train, X_text_test)
)

# Both splits share the same padding policy: pad and truncate at the end
# so that every sequence has exactly max_len entries.
pad_options = dict(maxlen=max_len, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# --- Step 7: Final numpy arrays for training ---
# Each (train, test) pair is converted to a plain ndarray in one statement.
# y_train / y_test are rebound in place, replacing the pandas Series.
X_train_rating, X_test_rating = (np.array(part) for part in (X_rating_train, X_rating_test))
X_train_cat, X_test_cat = (np.array(part) for part in (X_cat_train, X_cat_test))
y_train, y_test = (np.array(part) for part in (y_train, y_test))


### TensorFlow model

In [29]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense, Concatenate, Dropout

# --- Reproducibility ---
# Seeds the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling repeat across runs.
tf.keras.utils.set_random_seed(42)

# --- Hyperparameters ---
# vocab_size and max_len are read back from the preprocessing objects instead
# of being re-typed as literals: the text embedding must accept every id the
# tokenizer can emit and the text input must match the padded width, so the
# two cells can no longer drift apart.
vocab_size = tokenizer.num_words
max_len = X_train_pad.shape[1]
embedding_dim = 32
# One embedding row per class known to the fitted encoder (ids 0 .. n_classes-1).
category_count = len(category_encoder.classes_)

# --- Text Input ---
# Token ids -> per-token embeddings -> mean over the sequence axis.
text_input = Input(shape=(max_len,), name='text_input')
text_embedding = Embedding(vocab_size, embedding_dim)(text_input)
text_pooled = GlobalAveragePooling1D()(text_embedding)

# --- Rating Input ---
# Single scalar feature, fed to the dense stack unchanged.
rating_input = Input(shape=(1,), name='rating_input')

# --- Category Input ---
# Single integer id -> 10-dimensional embedding, flattened to a vector.
cat_input = Input(shape=(1,), name='category_input')
cat_embedding = Embedding(input_dim=category_count, output_dim=10)(cat_input)
cat_flat = tf.keras.layers.Flatten()(cat_embedding)

# --- Concatenate All Inputs ---
merged = Concatenate()([text_pooled, rating_input, cat_flat])

# --- Dense Layers ---
x = Dense(64, activation='relu')(merged)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.2)(x)
# Sigmoid output: a single probability for the class encoded as 1.
output = Dense(1, activation='sigmoid')(x)

# --- Compile Model ---
# The Input layer names above are the dictionary keys used when calling
# fit / evaluate / predict.
model = tf.keras.Model(inputs=[text_input, rating_input, cat_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


### Train

In [30]:
# Training features keyed by the names of the model's Input layers.
train_inputs = {
    'text_input': X_train_pad,
    'rating_input': X_train_rating,
    'category_input': X_train_cat,
}

# Train for 10 epochs in batches of 128, with validation_split=0.1 reserving
# a tenth of the training arrays for the per-epoch validation metrics.
history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.1,
)


Epoch 1/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6525 - loss: 0.6002 - val_accuracy: 0.8804 - val_loss: 0.2780
Epoch 2/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8827 - loss: 0.2718 - val_accuracy: 0.8952 - val_loss: 0.2400
Epoch 3/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9056 - loss: 0.2239 - val_accuracy: 0.9070 - val_loss: 0.2209
Epoch 4/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9195 - loss: 0.1970 - val_accuracy: 0.9066 - val_loss: 0.2208
Epoch 5/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9254 - loss: 0.1835 - val_accuracy: 0.9165 - val_loss: 0.2122
Epoch 6/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9256 - loss: 0.1810 - val_accuracy: 0.9104 - val_loss: 0.2234
Epoch 7/10
[1m228/228[0m 

### Test

In [31]:
# Test-set features keyed by Input-layer name; shared by evaluate() and predict().
test_inputs = {
    'text_input': X_test_pad,
    'rating_input': X_test_rating,
    'category_input': X_test_cat,
}

# --- Evaluate model (shows default progress bar) ---
# Returns the loss followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(x=test_inputs, y=y_test)

print(f"\nTest Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# --- Predict confidence scores (sigmoid outputs) ---
y_pred_probs = model.predict(test_inputs)

# --- Calculate average confidence ---
# A prediction's confidence is its distance from the 0.5 decision boundary,
# doubled so the value lies in [0, 1].
margin_from_half = np.abs(y_pred_probs - 0.5)
confidences = 2 * margin_from_half
avg_confidence = confidences.mean()

print(f"Average Model Confidence: {avg_confidence:.4f}")


[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9129 - loss: 0.2504

Test Accuracy: 0.9191
Test Loss: 0.2275
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step   
Average Model Confidence: 0.8839


### Save

In [34]:
import json

# Persist the trained network (same path and HDF5 format as before).
model.save('my_model.h5')

# The network alone cannot score new reviews: inference also needs the fitted
# tokenizer (word -> id vocabulary) and the category encoder's class order
# (category -> id). Both are written next to the model so the preprocessing
# can be rebuilt without retraining.
with open('tokenizer.json', 'w', encoding='utf-8') as fh:
    fh.write(tokenizer.to_json())

with open('category_classes.json', 'w', encoding='utf-8') as fh:
    # The position in this list is the integer id produced by category_encoder.
    json.dump(category_encoder.classes_.tolist(), fh)

