<a href="https://colab.research.google.com/github/Shrutika-16/Banking-Magement-System/blob/main/bert_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Dropout, Layer, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertTokenizer, TFBertModel
import pandas as pd
import time

# ✅ Check if GPU is available
device = "GPU" if tf.config.list_physical_devices('GPU') else "CPU"
print(f"Training on: {device}")

# ✅ Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive')

# ✅ Load Dataset
data_path = "/content/drive/My Drive/fake_review_data/"
train_data = pd.read_csv(data_path + "train_preprocessed.csv")
test_data = pd.read_csv(data_path + "test_preprocessed.csv")

# ✅ Clean NaN
train_data['REVIEW_TEXT'] = train_data['REVIEW_TEXT'].astype(str).fillna("")
test_data['REVIEW_TEXT'] = test_data['REVIEW_TEXT'].astype(str).fillna("")

# ✅ Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 256  # Increased length

def encode_texts(texts, max_len=MAX_LEN):
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    encoding = tokenizer(texts, max_length=max_len, padding='max_length', truncation=True, return_tensors="np")
    return encoding["input_ids"], encoding["attention_mask"]

X_train_ids, X_train_mask = encode_texts(train_data['REVIEW_TEXT'])
X_test_ids, X_test_mask = encode_texts(test_data['REVIEW_TEXT'])

# ✅ Numeric features
numeric_columns = [
    'AVERAGE_RATING', 'RATING_DEVIATION', 'TOTAL_PRODUCT_REVIEWS',
    'REVIEW_LENGTH', 'RATING_CATEGORY', 'SINGLE_RATING_CATEGORY',
    'REVIEW_COUNT_DATE', 'SAME_DATE_MULTIPLE_REVIEWS', 'MAX_USER_REVIEWS_DAY',
    'TIMESTAMP_DIFFERENCE', 'AVERAGE_USER_REVIEW_LENGTH', 'TOTAL_USER_REVIEWS',
    'PERCENTAGE_POSITIVE_REVIEWS', 'RATIO_POSITIVE_NEGATIVE'
]

X_train_numeric = train_data[numeric_columns].astype(float).values
X_test_numeric = test_data[numeric_columns].astype(float).values
y_train = train_data['LABEL'].values
y_test = test_data['LABEL'].values

# ✅ Load & fine-tune BERT
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
bert_model.trainable = True  # Now fine-tuning BERT

# ✅ BERT Layer
class BertLayer(Layer):
    def __init__(self, bert_model, **kwargs):
        super(BertLayer, self).__init__(**kwargs)
        self.bert_model = bert_model

    def call(self, inputs):
        input_ids, attention_mask = inputs
        output = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        return output

    def get_config(self):
        config = super().get_config()
        config.update({
            'bert_model': self.bert_model.name  # or use a placeholder name
        })
        return config

    @classmethod
    def from_config(cls, config):
        from transformers import TFBertModel
        model = TFBertModel.from_pretrained('bert-base-uncased')
        return cls(model)


# ✅ Model Architecture
def create_bert_lstm_model(bert_model, max_len=MAX_LEN, num_numeric_features=14):
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
    numeric_input = Input(shape=(num_numeric_features,), name='numeric_input')

    bert_output = BertLayer(bert_model)([input_ids, attention_mask])
    lstm_output = Bidirectional(LSTM(64, return_sequences=False))(bert_output)

    numeric_dense = Dense(64, activation='relu')(numeric_input)

    merged = Concatenate()([lstm_output, numeric_dense])
    x = Dropout(0.4)(merged)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_ids, attention_mask, numeric_input], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# ✅ Build model
model = create_bert_lstm_model(bert_model)

# ✅ Callbacks
checkpoint_path = "/content/drive/My Drive/best_fake_review_model.keras"
callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True),
    ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True)
]

# ✅ Train
start_time = time.time()
history = model.fit(
    [X_train_ids, X_train_mask, X_train_numeric], y_train,
    validation_data=([X_test_ids, X_test_mask, X_test_numeric], y_test),
    epochs=3,
    batch_size=32,
    callbacks=callbacks
)
end_time = time.time()
print(f"Training Time: {(end_time - start_time)/60:.2f} minutes")

# ✅ Final Accuracy
print(f"Training Accuracy: {history.history['accuracy'][-1]*100:.2f}%")
print(f"Validation Accuracy: {history.history['val_accuracy'][-1]*100:.2f}%")

# ✅ Save Model
model.save("/content/drive/My Drive/fake_reviews_model.keras")
print("Model saved successfully!")

Training on: GPU
Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 702ms/step - accuracy: 0.8570 - loss: 0.5506 - val_accuracy: 0.8970 - val_loss: 0.4832
Epoch 2/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 734ms/step - accuracy: 0.8978 - loss: 0.4938 - val_accuracy: 0.8970 - val_loss: 0.4691
Epoch 3/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m367s[0m 735ms/step - accuracy: 0.8961 - loss: 0.4773 - val_accuracy: 0.8970 - val_loss: 0.4584
Training Time: 18.75 minutes
Training Accuracy: 89.37%
Validation Accuracy: 89.70%
Model saved successfully!


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer


In [13]:
from transformers import TFBertModel
import tensorflow as tf
from tensorflow.keras.layers import Layer

@tf.keras.utils.register_keras_serializable()
class BertLayer(Layer):
    def __init__(self, model_name="bert-base-uncased", **kwargs):
        self.model_name = model_name
        self.bert = TFBertModel.from_pretrained(model_name)
        super(BertLayer, self).__init__(**kwargs)

    def call(self, inputs, training=False):
        input_ids, attention_mask = inputs
        output = self.bert(input_ids, attention_mask=attention_mask)
        return output.last_hidden_state

    def get_config(self):
        config = super().get_config()
        config.update({"model_name": self.model_name})
        return config

    @classmethod
    def from_config(cls, config):
        # Handle old config saved as {"bert_model": "tf_bert_model"}
        if "bert_model" in config:
            config["model_name"] = "bert-base-uncased"  # or use logic to detect proper model
            del config["bert_model"]
        return cls(**config)


In [14]:
from tensorflow.keras.models import load_model

model = load_model(
    "/content/drive/My Drive/fake_reviews_model.keras",
    custom_objects={'BertLayer': BertLayer}
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [31]:
def predict_fake_review(review_text, model, tokenizer, max_len=256):
    # Tokenize the input review with correct MAX_LEN
    encodings = tokenizer(
        review_text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="tf"
    )

    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    # Dummy numeric features since only text is provided
    numeric_features = np.zeros((1, 14))  # Match number of numeric features

    # Predict
    prediction = model.predict([input_ids, attention_mask, numeric_features])

    # Convert output probability to label
    label = "Fake Review" if prediction[0][0] > 0.7 else "Genuine Review"
    confidence = prediction[0][0] * 100 if prediction[0][0] > 0.5 else (1 - prediction[0][0]) * 100

    return label, confidence

In [32]:
test_review = "This food is amazing! I absolutely love it and would buy it again."
label, confidence = predict_fake_review(test_review, model, tokenizer)
print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Prediction: Genuine Review (Confidence: 62.01%)


In [33]:
test_review = "really feeling place actually came happy hour serve really amazing cider wine beer ended staying dinner food looked amazing im healthy eater place right alley excellent meat vegetable organic b preservative really hard find wish restaurant following trend"
label, confidence = predict_fake_review(test_review, model, tokenizer)
print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Prediction: Fake Review (Confidence: 74.79%)


In [34]:
test_review = "so i realized very quickly this place is not ideal for pescatarians but since my significant other is an avid carnivore i made it work can i just say how nice it is to finally find a place in astoria with decent southerninspired food when i saw fried green tomatoes listed on their menu i almost died of shock i never expected to see those slices of heaven north of va i of course had to get them along with their disco tots tots with melted cheddar cheese and gravy both were incredible my so significant other being a true country boy ordered the chicken and waffles and i ordered the salmon benedict my entree was okay i honestly wouldnt order it again but my sos waffle was to die for and the syrup had mixed with his chickens tabasco glaze so it was sweet with a kick i was actually really glad to see they mix sweet and spicy because we mix hot sauce with our syrup all the time i just showed my southern tendencies didnt i if youre on a diet i wouldnt suggest coming here if you want good food and a great atmosphere go"
label, confidence = predict_fake_review(test_review, model, tokenizer)
print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Prediction: Fake Review (Confidence: 80.59%)


In [35]:
test_review = "wow! nice food i loved the taste"
label, confidence = predict_fake_review(test_review, model, tokenizer)
print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Prediction: Genuine Review (Confidence: 65.38%)


In [36]:
test_review = "i went to mamouns last night it was great as it always is the food was perfect and the atmosphere was also the same always packed i had a chicken kebab this time and it was good too but the best is the falafel sandwich"
label, confidence = predict_fake_review(test_review, model, tokenizer)
print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Prediction: Fake Review (Confidence: 78.61%)


In [37]:
test_review = "for a dominican place it s okay sophies is the same price but you get wayyy more food there and much much better the sweet plantain at margons suck its undercooked and really dark indicating using old oil so they dont cook it as long so it wont turn black their porkchops are soooo tough so stringy their beans are okay not much flavor their fried chicken is the only good thing"
label, confidence = predict_fake_review(test_review, model, tokenizer)
print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Prediction: Fake Review (Confidence: 78.15%)


In [42]:
test_review = "what a amazing food, must try pizza. i love this"
label, confidence = predict_fake_review(test_review, model, tokenizer)
print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Prediction: Genuine Review (Confidence: 68.85%)
