<a href="https://colab.research.google.com/github/Renick2006/My-Projects/blob/main/Hybrid_Fake_Review_Detector(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [212]:
!pip install nltk scikit-learn pandas numpy matplotlib seaborn




In [213]:
import pandas as pd
import numpy as np
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [214]:
import pandas as pd

df = pd.read_csv(
    "fake_reviews.csv",
    engine="python",
    on_bad_lines="skip",
    encoding="utf-8"
)

print(df.shape)


(2128, 4)


In [215]:
print(df.columns)


Index(['review_headline', 'review_body', 'fake_review', 'cleaned_review_body'], dtype='object')


In [216]:
df['text'] = df['review_headline'].fillna('') + " " + df['review_body'].fillna('')


In [217]:
df['clean_text'] = df['text'].astype(str)


In [218]:
X = df['clean_text']
y = df['fake_review']


In [219]:
print("X size:", len(X))
print("y size:", len(y))


X size: 2128
y size: 2128


In [220]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1,2),
    min_df=2,
    stop_words='english'
)


X_vec = tfidf.fit_transform(X)

print("New TF-IDF shape:", X_vec.shape)


New TF-IDF shape: (2128, 3000)


In [221]:
from sklearn.model_selection import train_test_split

# Split raw text
X_text_train, X_text_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.2,
    random_state=42
)

# Vectorize after split
X_train_vec = tfidf.transform(X_text_train)
X_test_vec  = tfidf.transform(X_text_test)


In [222]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC(
    C=2.0,
    class_weight='balanced',
    max_iter=5000
)

svm_model.fit(X_train_vec, y_train)


In [223]:
y_pred_svm = svm_model.predict(X_test_vec)

print("Improved SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


Improved SVM Accuracy: 0.7629107981220657
              precision    recall  f1-score   support

           0       0.72      0.74      0.73       184
           1       0.80      0.78      0.79       242

    accuracy                           0.76       426
   macro avg       0.76      0.76      0.76       426
weighted avg       0.76      0.76      0.76       426



In [224]:
def predict_review_svm(review):
    review = review.lower()
    vec = tfidf.transform([review])
    pred = svm_model.predict(vec)[0]
    return "Fake Review" if pred == 1 else "Genuine Review"

print(predict_review_svm("This product is amazing and works perfectly"))
print(predict_review_svm("Worst product ever, total waste of money"))


Fake Review
Genuine Review


In [225]:
from sklearn.model_selection import GridSearchCV

params = {
    'C': [0.5, 1, 2, 3]
}

grid = GridSearchCV(
    LinearSVC(class_weight='balanced'),
    params,
    cv=5,
    scoring='f1'
)

grid.fit(X_train_vec, y_train)

print("Best C:", grid.best_params_)
svm_model = grid.best_estimator_


Best C: {'C': 1}


In [226]:
import numpy as np

def extract_behavior_features(text):
    text = str(text)
    words = text.split()

    length = len(words)
    exclamations = text.count("!")
    capital_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
    unique_ratio = len(set(words)) / max(len(words), 1)

    return [length, exclamations, capital_ratio, unique_ratio]


In [227]:
X_behavior = np.array(
    df['text'].apply(extract_behavior_features).tolist()
)


In [228]:
from sklearn.ensemble import RandomForestClassifier

Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_behavior, y, test_size=0.2, random_state=42
)

behavior_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

behavior_model.fit(Xb_train, yb_train)


In [229]:
def hybrid_predict(review):
    # TEXT MODEL
    vec = tfidf.transform([review.lower()])
    text_pred = svm_model.predict(vec)[0]

    # BEHAVIOR MODEL
    behavior_feat = np.array(
        extract_behavior_features(review)
    ).reshape(1, -1)

    behavior_prob = behavior_model.predict_proba(behavior_feat)[0]

    # confidence of fake class
    fake_confidence = behavior_prob[1]

    # HYBRID DECISION LOGIC
    if fake_confidence > 0.75:
        return "Fake Review"
    else:
        return "Fake Review" if text_pred == 1 else "Genuine Review"


In [230]:
print(hybrid_predict("This product is amazing!!! Best purchase ever!!!"))
print(hybrid_predict("Worst product, totally useless, waste of money"))


Fake Review
Genuine Review


In [231]:
hybrid_preds = []

for review in X_text_test:
    pred = hybrid_predict(review)
    hybrid_preds.append(1 if pred == "Fake Review" else 0)

print("Hybrid Accuracy:", accuracy_score(y_test, hybrid_preds))
print(classification_report(y_test, hybrid_preds))


Hybrid Accuracy: 0.784037558685446
              precision    recall  f1-score   support

           0       0.78      0.70      0.74       184
           1       0.79      0.85      0.82       242

    accuracy                           0.78       426
   macro avg       0.78      0.77      0.78       426
weighted avg       0.78      0.78      0.78       426

