imports

In [18]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from scipy.sparse import hstack


Loading Dataset

In [19]:
reviews = pd.read_json(
    "C:\\Users\\Global\\OneDrive\\Desktop\\MAM_J\\Online_Review_Manipulation_Detection\\data\\yelp_academic_dataset_review.json",
    lines=True
)


In [20]:
reviews.head()


Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw


In [21]:
reviews.columns


Index(['votes', 'user_id', 'review_id', 'stars', 'date', 'text', 'type',
       'business_id'],
      dtype='object')

Behavioural Features

In [22]:
# Review length feature
reviews['review_length'] = reviews['text'].apply(len)


In [23]:
# Extreme rating feature (1 for 1-star or 5-star reviews, 0 otherwise)
reviews['extreme_rating'] = reviews['stars'].apply(
    lambda x: 1 if x in [1, 5] else 0
)


In [24]:
# Useful votes feature
reviews['useful_votes'] = reviews['votes'].apply(
    lambda x: x['useful']
)


Manipulation Label

In [25]:
def is_manipulated(row):
    if (
        row['extreme_rating'] == 1 and
        row['useful_votes'] == 0 and
        row['review_length'] < 200
    ):
        return 1   # Manipulated
    else:
        return 0   # Genuine

reviews['manipulated'] = reviews.apply(is_manipulated, axis=1)


In [26]:
reviews['manipulated'].value_counts()


manipulated
0    219430
1     10477
Name: count, dtype: int64

Text_Cleaning (NLP)

In [27]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

reviews['clean_text'] = reviews['text'].apply(clean_text)


TEXT FEATURES (TF-IDF)

In [28]:
tfidf = TfidfVectorizer(
    max_features=3000,
    stop_words='english'
)

text_features = tfidf.fit_transform(reviews['clean_text'])


TEXT + BEHAVIOUR COMBINE

In [29]:
behavioural_features = reviews[
    ['review_length', 'extreme_rating', 'useful_votes']
]

X = hstack([text_features, behavioural_features.values])
y = reviews['manipulated']


TRAIN / TEST SPLIT

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

MACHINE LEARNING MODELS

In [31]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)
print("Logistic Regression Results")
print(classification_report(y_test, pred_lr))


Logistic Regression Results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     43874
           1       1.00      0.99      1.00      2108

    accuracy                           1.00     45982
   macro avg       1.00      1.00      1.00     45982
weighted avg       1.00      1.00      1.00     45982



In [32]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
print("Random Forest Results")
print(classification_report(y_test, pred_rf))


KeyboardInterrupt: 