<a href="https://colab.research.google.com/github/Umeshmamidala/chat-bot/blob/main/Fake_Review_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random

# Seed sentences for the "genuine" class. Rows built from this pool are
# labelled 0 when the synthetic dataset is assembled further down.
genuine_reviews = [
    "This product is amazing and works perfectly",
    "Very good quality and worth the price",
    "I am satisfied with the performance",
    "Delivery was fast and packaging was neat",
    "Excellent product highly recommended",
    "Good value for money",
    "The item matches the description",
    "Works as expected no issues so far",
    "Nice design and easy to use",
    "Customer service was helpful",
    "Build quality is solid",
    "Five stars for performance",
    "Very comfortable and durable",
    "Battery life is impressive",
    "Product arrived on time",
    "Good experience overall",
    "Quality exceeded my expectations",
    "Happy with this purchase",
    "Easy installation and setup",
    "Worth buying again"
]

# Seed sentences for the "fake" class (promotional / spam-style text). Rows
# built from this pool are labelled 1 when the synthetic dataset is assembled.
spam_reviews = [
    "Buy now limited offer hurry up",
    "Best product ever buy buy buy",
    "Click this link for huge discount",
    "Free gift card claim now",
    "Limited stock act fast",
    "100 percent guaranteed best deal",
    "Visit our website for more offers",
    "Cheap price best price buy now",
    "Earn money fast click here",
    "Special promotion today only",
    "Don't miss this opportunity",
    "Lowest price guaranteed",
    "Exclusive deal buy immediately",
    "Best in the world no doubt",
    "Order now before it ends",
    "Hot sale huge discount",
    "Unbelievable offer grab now",
    "Top rated product buy today",
    "Best deal ever limited time",
    "Hurry buy before sold out"
]

# Fixed seed shared by every random step in this cell. The original drew the
# rows with the unseeded global `random` module, so "reviews.csv" changed on
# every Restart & Run All even though the pandas steps were seeded.
SEED = 42
rng = random.Random(SEED)  # private generator: leaves the global `random` state alone

# Draw 350 genuine rows (label 0) and 150 fake rows (label 1), with
# replacement, from the two seed-sentence pools, then shuffle the row order.
reviews = [(rng.choice(genuine_reviews), 0) for _ in range(350)]
reviews += [(rng.choice(spam_reviews), 1) for _ in range(150)]
rng.shuffle(reviews)

df = pd.DataFrame(reviews, columns=["review", "label"])

# Balance dataset: upsample the minority (fake) class with replacement until
# it has as many rows as the genuine class.
genuine = df[df["label"] == 0]
fake = df[df["label"] == 1]

fake_upsampled = fake.sample(len(genuine), replace=True, random_state=SEED)

# Concatenate both classes and shuffle so the labels are interleaved.
balanced_df = pd.concat([genuine, fake_upsampled])
balanced_df = balanced_df.sample(frac=1, random_state=SEED)

# Sanity check: both classes should report the same count.
print(balanced_df['label'].value_counts())

# Persist without the (shuffled) index so that later cells can reload it.
balanced_df.to_csv("reviews.csv", index=False)


label
0    350
1    350
Name: count, dtype: int64


In [None]:
!pip install transformers datasets --quiet

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np


In [None]:
# Load the balanced dataset written by the data-generation cell.
df = pd.read_csv("reviews.csv")

# The rows are drawn with replacement from a small pool of sentences and then
# upsampled, so the file is dominated by exact duplicates. A row-level split
# would place the same sentence in both train and validation, and the
# validation score would only measure memorisation. Split the *unique* review
# texts instead (stratified by label), then send every duplicate row to the
# side its text was assigned to.
unique_reviews = df.drop_duplicates(subset="review")
train_pool, val_pool = train_test_split(
    unique_reviews["review"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=unique_reviews["label"],
)

train_df = df[df["review"].isin(train_pool)]
val_df = df[df["review"].isin(val_pool)]

# Guard against leakage: no review text may appear on both sides.
assert set(train_df["review"]).isdisjoint(val_df["review"]), "train/val text overlap"

# Downstream cells expect plain Python lists.
train_texts, train_labels = train_df["review"].tolist(), train_df["label"].tolist()
val_texts, val_labels = val_df["review"].tolist(), val_df["label"].tolist()


In [None]:
# Load the tokenizer published with the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode the training and validation texts with identical settings
# (truncation and padding both enabled); each split is encoded in its own call.
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping of field name to a per-example indexable
            collection (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (taken from the label collection)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every field found in the encodings is converted to a tensor, and the
        example's label is added under the ``"labels"`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so they can be indexed example by example.
train_dataset = ReviewDataset(encodings=train_encodings, labels=train_labels)
val_dataset = ReviewDataset(encodings=val_encodings, labels=val_labels)


In [None]:
# Pretrained BERT encoder with a two-way classification head. The head
# (`classifier.weight` / `classifier.bias`) is not part of the checkpoint and
# is newly initialised, so the model must be fine-tuned before use.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [None]:
import os

# `TrainingArguments(logging_dir=...)` is deprecated and scheduled for removal
# in transformers v5.2. The library's deprecation message names the
# TENSORBOARD_LOGGING_DIR environment variable as the replacement, so set it
# before the arguments object is built.
os.environ["TENSORBOARD_LOGGING_DIR"] = "./logs"

# Fine-tuning hyper-parameters: 4 epochs, batch size 16 for both training and
# evaluation, checkpoints written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)


`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [None]:
# Bundle the model, the hyper-parameters and both dataset splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune. Kept as the cell's last expression so the returned training
# summary is displayed as the cell output.
trainer.train()




Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=140, training_loss=0.04198756899152483, metrics={'train_runtime': 384.8696, 'train_samples_per_second': 5.82, 'train_steps_per_second': 0.364, 'total_flos': 10359997804800.0, 'train_loss': 0.04198756899152483, 'epoch': 4.0})

In [None]:
# Score the validation split and take the highest-scoring class per example.
preds = trainer.predict(val_dataset)
y_pred = np.argmax(a=preds.predictions, axis=1)

# Per-class precision / recall / F1 against the true validation labels.
print(classification_report(y_true=val_labels, y_pred=y_pred))




              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1       1.00      1.00      1.00        70

    accuracy                           1.00       140
   macro avg       1.00      1.00      1.00       140
weighted avg       1.00      1.00      1.00       140



In [None]:
def predict_review(text):
    """Classify a single review with the fine-tuned model.

    Args:
        text: The review text to classify.

    Returns:
        "Genuine" when the model predicts class 0, otherwise "Fake".
    """
    # Inference mode: disables dropout so repeated calls give the same answer.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # The tokenizer returns CPU tensors; after training on a GPU the model
    # lives on that GPU, so move the inputs to wherever the model is.
    device = model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # No gradients are needed for prediction; skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Genuine" if prediction == 0 else "Fake"


In [None]:
# Ask for a review on stdin, then print the model's verdict for it.
user_review = input("enter your review to check whether it is genuine or fake:")
print(predict_review(user_review))



enter your review to check whether it is genuine or fake:The item matches the description
Genuine
