<a href="https://colab.research.google.com/github/Sumit-rautela/Fake-Review-Detector/blob/main/Copy_of_fake_review_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets scikit-learn torch
!pip install lime


In [None]:
import os
# Set the WANDB_DISABLED flag before any training code runs — presumably so the
# Trainer used later does not try to log to Weights & Biases (confirm against
# the installed transformers version).
os.environ["WANDB_DISABLED"] = "true"


In [None]:
import pandas as pd

# Load only the label/text columns, normalise the text column name, encode the
# labels and drop unusable rows.
# Label encoding used everywhere downstream: OR (original) -> 0, CG (computer-generated) -> 1.
df_reduced = (
    pd.read_csv("/content/fake reviews dataset.csv")[['label', 'text_']]
    .rename(columns={'text_': 'text'})
    .assign(label=lambda frame: frame['label'].map({'OR': 0, 'CG': 1}))
    # Rows whose label is neither OR nor CG become NaN in the mapping and are dropped here.
    .dropna(subset=['label', 'text'])
    # `map` yields floats as soon as a single NaN appears; the classifier needs integer class ids.
    .astype({'label': int})
)
print(f"Shape of df_reduced after dropping NaNs: {df_reduced.shape}")

# Fail early with a clear message instead of an opaque sampling/splitting error further down.
if df_reduced.empty:
    raise ValueError(
        "No usable rows left after label mapping and dropna; "
        "check the CSV contents and the OR/CG label values."
    )

# Work on a reproducible subsample of at most 6000 rows (the whole frame if it is smaller).
df = df_reduced.sample(n=min(6000, len(df_reduced)), random_state=42)
print(f"Shape of df after sampling: {df.shape}")


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset



# Hold out 20% of the sampled reviews for evaluation. The split is stratified on
# the label so train and test keep the same original/computer-generated ratio;
# the fixed random_state keeps it reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Tokenizer matching the checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts):
    """Encode `texts` with the module-level `tokenizer`.

    Every sequence is truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(texts, **encoding_options)

def _build_dataset(encodings, labels):
    """Pack tokenizer output and class ids into a `datasets.Dataset`."""
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels,  # key name expected by the Trainer
    })


# Encode both splits
train_encodings = tokenize(train_texts)
test_encodings = tokenize(test_texts)

# Wrap the encodings as datasets
train_dataset = _build_dataset(train_encodings, train_labels)
test_dataset = _build_dataset(test_encodings, test_labels)

# Pretrained BERT with a two-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall";
        precision/recall/F1 use ``average='binary'``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit in each row.
    predicted = torch.tensor(logits).argmax(dim=1)
    scores = precision_recall_fscore_support(labels, predicted, average='binary')
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }

# Training configuration. Only arguments accepted by older transformers
# releases are passed; save/logging/evaluation strategies are intentionally
# not set here and therefore stay at the library defaults.
training_args = TrainingArguments(
    output_dir="./bert_results",
    logging_dir="./logs",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Wire model, data and metrics together
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune; kept as the last expression so the cell displays the training summary
trainer.train()


In [None]:
!pip install huggingface_hub


In [None]:
# Authenticate with the Hugging Face Hub; the push_to_hub calls further down
# need a write token. notebook_login() asks for the token inside the notebook
# itself, whereas a `!`-launched CLI prompt is not reliably interactive in
# every notebook frontend.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from huggingface_hub import HfApi, HfFolder, Repository
from transformers import BertTokenizer, BertForSequenceClassification

# Target repository on the Hub, in the form "your-username/your-model-name"
model_repo = "lol3445/bert-email-cap-classifier"

# Keep a local copy of both artifacts in the same directory first
local_dir = "saved_bert_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(local_dir)

# Upload the model, then the tokenizer, to the Hub repository
model.push_to_hub(model_repo)
tokenizer.push_to_hub(model_repo)


In [None]:
# Run the Trainer's evaluation pass and show the resulting metrics.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


In [None]:
!pip install streamlit pyngrok


In [None]:
%%writefile app.py

In [None]:
import streamlit as st
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from lime.lime_text import LimeTextExplainer
import pandas as pd
import numpy as np

# Model/tokenizer loading, wrapped in st.cache_resource
@st.cache_resource
def load_model():
    """Load the tokenizer and the sequence-classification model.

    Both come from the same Hub repository.

    Returns:
        tuple ``(tokenizer, model)``.
    """
    checkpoint = "lol3445/bert-email-cap-classifier"
    return (
        BertTokenizer.from_pretrained(checkpoint),
        BertForSequenceClassification.from_pretrained(checkpoint),
    )

tokenizer, model = load_model()

def classify_email(text, tokenizer, model):
    """Classify a single review text.

    Args:
        text: the review to classify.
        tokenizer: callable that encodes `text` into a dict of tensors.
        model: classifier whose output exposes `.logits`, one row per input.

    Returns:
        tuple ``(label, confidence)``: `label` is "Original" or
        "Computer-Generated", `confidence` is the softmax probability of the
        predicted class as a Python float.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
        confidence = probs[0][prediction].item()
    # The training data encodes OR (original) as class 0 and CG
    # (computer-generated) as class 1, so index 1 is the generated class.
    label = "Computer-Generated" if prediction == 1 else "Original"
    return label, confidence

def predict_proba(texts, batch_size=32):
    """Return class probabilities for one or more texts.

    Uses the module-level `tokenizer` and `model`. Inputs are processed in
    chunks so that a large CSV upload or LIME's perturbed samples do not go
    through the model as one oversized batch.

    Args:
        texts: a single string or a sequence of strings.
        batch_size: maximum number of texts per forward pass.

    Returns:
        numpy array with one row of softmax probabilities per input text, in
        input order. An empty input yields an array with zero rows.
    """
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to score; keep the (n_texts, n_classes) layout for callers.
        return np.empty((0, model.config.num_labels))

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
        chunks.append(probs.cpu().numpy())
    return np.concatenate(chunks, axis=0)

# class_names[i] is the display name of class id i. The training data encodes
# OR (original) as 0 and CG (computer-generated) as 1, so "Original" comes first.
explainer = LimeTextExplainer(class_names=["Original", "Computer-Generated"])

# --- Page header --------------------------------------------------------------
st.set_page_config(page_title="Fake Review Detector", layout="centered")
st.title("🕵️‍♂️ Fake Review Detector")
st.write("Determine if a review is **Original** or **Computer-Generated** using BERT + LIME.")

# Word-count threshold below which the LIME checkbox further down stays disabled.
min_words_for_lime = 3
user_input = st.text_area("✍️ Enter your Review Text:")

# Placeholders so both names exist even before "Classify" has been pressed.
label = confidence = None

# Streamlit reruns the whole script on every widget interaction, and a button
# only returns True on the run triggered by its own click. With the feedback
# buttons nested under `if st.button("Classify")`, clicking one of them reran
# the script with "Classify" False, so the feedback was never recorded.
# The latest result is therefore kept in session state and rendered from there.
if st.button("Classify"):
    if user_input.strip() == "":
        st.warning("Please enter some text.")
        st.session_state.pop("last_result", None)
    else:
        label, confidence = classify_email(user_input, tokenizer, model)
        st.session_state["last_result"] = {
            "text": user_input,
            "label": label,
            "confidence": confidence,
        }

last_result = st.session_state.get("last_result")
# Only show a stored result while it still belongs to the text in the box.
if last_result is not None and last_result["text"] == user_input:
    label = last_result["label"]
    confidence = last_result["confidence"]
    st.success(f"**Prediction: {label}**")
    st.info(f"Confidence Score: {confidence:.2%}")
    st.progress(confidence)

    # Feedback section
    st.markdown("### Was this prediction correct?")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("👍 Yes, correct"):
            st.success("✅ Thanks for your feedback!")
            # Explicit UTF-8: the log line contains emoji, which a platform
            # default encoding may not be able to represent.
            with open("feedback_log.txt", "a", encoding="utf-8") as f:
                f.write(f"👍 | Text: {user_input} | Predicted: {label} | Confidence: {confidence:.2%}\n")
    with col2:
        if st.button("👎 No, incorrect"):
            st.warning("⚠️ We'll use this to improve the model.")
            with open("feedback_log.txt", "a", encoding="utf-8") as f:
                f.write(f"👎 | Text: {user_input} | Predicted: {label} | Confidence: {confidence:.2%}\n")

# LIME explanation: the checkbox is disabled while the input has fewer than
# `min_words_for_lime` whitespace-separated words.
word_count = len(user_input.strip().split())
show_explanation = st.checkbox(
    "🔍 Show Explanation (LIME)",
    disabled=word_count < min_words_for_lime,
)

if show_explanation and user_input.strip():
    with st.spinner("Generating explanation..."):
        try:
            # Explain with at most 10 features, never more than the input has words.
            lime_result = explainer.explain_instance(
                user_input,
                predict_proba,
                num_features=min(10, word_count),
                num_samples=100,
            )
            lime_html = lime_result.as_html()
            # Wrap LIME's HTML in a container that forces dark-on-white colours.
            wrapped_html = f"""
            <div style="background-color: white; color: black; padding: 20px; border-radius: 10px;">
                <style>
                    .lime-explanation {{
                        background-color: white !important;
                        color: black !important;
                    }}
                    .lime-explanation table {{
                        background-color: white !important;
                        color: black !important;
                    }}
                    .lime-explanation td, .lime-explanation th {{
                        background-color: white !important;
                        color: black !important;
                        border: 1px solid #ccc !important;
                    }}
                    .lime-explanation .positive {{
                        background-color: #90EE90 !important;
                        color: black !important;
                    }}
                    .lime-explanation .negative {{
                        background-color: #FFB6C1 !important;
                        color: black !important;
                    }}
                </style>
                {lime_html}
            </div>
            """
            st.components.v1.html(wrapped_html, height=600, scrolling=True)
        except Exception as err:
            # Report the failure in the UI instead of crashing the page.
            st.error(f"❌ Could not generate explanation: {str(err)}")
            st.info("💡 Try entering a longer, more descriptive review.")

# CSV Upload for Bulk Detection
st.markdown("---")
st.subheader("📁 Upload a CSV file for Bulk Detection")

uploaded_file = st.file_uploader("Upload a CSV file with a column named `text`", type=["csv"])

if uploaded_file:
    try:
        df_upload = pd.read_csv(uploaded_file)

        if "text" not in df_upload.columns:
            st.error("❌ The CSV must contain a column named `text`.")
        else:
            texts = df_upload["text"].astype(str).tolist()
            probs = predict_proba(texts)

            # Predicted class per row and the probability assigned to it
            preds = np.argmax(probs, axis=1)
            confidences = probs[np.arange(len(preds)), preds]

            results_df = pd.DataFrame({
                "text": texts,
                # The training data encodes OR (original) as class 0 and CG
                # (computer-generated) as class 1, so index 1 is the generated class.
                "prediction": ["Computer-Generated" if p == 1 else "Original" for p in preds],
                "confidence": [f"{c:.2%}" for c in confidences]
            })

            st.success(f"✅ Processed {len(results_df)} reviews.")
            st.dataframe(results_df)

            csv_download = results_df.to_csv(index=False).encode("utf-8")
            st.download_button("⬇️ Download Results as CSV", csv_download, "results.csv", "text/csv")
    except Exception as e:
        # Any failure while reading or scoring the upload is shown in the UI.
        st.error(f"⚠️ Error reading file: {e}")

# Explain why the LIME checkbox is disabled when the input is too short.
typed_words = user_input.strip().split()
if len(typed_words) < min_words_for_lime:
    st.info(f"💡 Enter at least {min_words_for_lime} words to enable LIME explanation.")


In [None]:
# Fetch ipv4.icanhazip.com quietly and print the response to the cell output.
# NOTE(review): presumably this is the machine's public IPv4 address, needed
# when opening the localtunnel URL below — confirm.
!wget -q -O - ipv4.icanhazip.com

In [None]:
# Start the Streamlit app from app.py in the background, then open a
# localtunnel on port 8501 (assumed to be the port Streamlit serves on — confirm).
!streamlit run app.py & npx localtunnel --port 8501