In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Read the reviews CSV into a DataFrame

file_path = "fake reviews dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first rows of the raw data

print("Original dataset: ", df.head(), sep="\n")

Original dataset: 
             category  rating label  \
0  Home_and_Kitchen_5       5    CG   
1  Home_and_Kitchen_5       5    CG   
2  Home_and_Kitchen_5       5    CG   
3  Home_and_Kitchen_5       1    CG   
4  Home_and_Kitchen_5       5    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [4]:
# Data Preprocessing

def preprocess_text(text):
    """Normalise a raw review into lowercase, letters-only, single-spaced text.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell, or None) is treated as an
        empty review.

    Returns
    -------
    str
        The lowercased text with every character other than ASCII letters and
        whitespace removed, runs of whitespace collapsed to a single space,
        and no leading or trailing whitespace. Empty string for non-string
        input.
    """
    # Missing values are not strings and have no .lower(); map them to an
    # empty review instead of raising AttributeError in the middle of .apply().
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespaces, including any left at either end of the text
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every review in the 'text_' column, overwriting the raw text
df['text_'] = df['text_'].map(preprocess_text)

print("\nPreprocessed Dataset:", df.head(), sep="\n")


Preprocessed Dataset:
             category  rating label  \
0  Home_and_Kitchen_5       5    CG   
1  Home_and_Kitchen_5       5    CG   
2  Home_and_Kitchen_5       5    CG   
3  Home_and_Kitchen_5       1    CG   
4  Home_and_Kitchen_5       5    CG   

                                               text_  
0  love this well made sturdy and very comfortabl...  
1  love it a great upgrade from the original ive ...  
2  this pillow saved my back i love the look and ...  
3  missing information on how to use it but it is...  
4  very nice set good quality we have had the set...  


In [5]:
# Map labels to numerical values
# (presumably 'OR' = original review, 'CG' = computer-generated -- confirm against the
# dataset documentation)
label_mapping = {'OR': 0, 'CG': 1}

# Series.map() turns every value that is not a key of the mapping into NaN, so running
# this cell a second time (labels already 0/1) would wipe the whole column. Only map
# while the column still holds the raw codes, and fail loudly on anything unexpected
# instead of letting NaN labels flow into training.
if not df['label'].isin(list(label_mapping.values())).all():
    unexpected_labels = set(df['label'].unique()) - set(label_mapping)
    if unexpected_labels:
        raise ValueError(
            "Unexpected values in 'label' column: "
            + ", ".join(sorted(map(str, unexpected_labels)))
        )
    df['label'] = df['label'].map(label_mapping)

In [6]:
# Preview the data after label encoding
print("\nLabeled Dataset:", df.head(), sep="\n")


Labeled Dataset:
             category  rating  label  \
0  Home_and_Kitchen_5       5      1   
1  Home_and_Kitchen_5       5      1   
2  Home_and_Kitchen_5       5      1   
3  Home_and_Kitchen_5       1      1   
4  Home_and_Kitchen_5       5      1   

                                               text_  
0  love this well made sturdy and very comfortabl...  
1  love it a great upgrade from the original ive ...  
2  this pillow saved my back i love the look and ...  
3  missing information on how to use it but it is...  
4  very nice set good quality we have had the set...  


In [7]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible
train_data, test_data, train_labels, test_labels = train_test_split(
    df['text_'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# TF-IDF Vectorization
# The vocabulary (capped at 5000 terms) and IDF weights are learned from the training
# reviews only; the test reviews are encoded with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_features = tfidf_vectorizer.fit_transform(raw_documents=train_data)
test_features = tfidf_vectorizer.transform(raw_documents=test_data)

# Report the (n_documents, n_terms) shape of each matrix
print("\nTF-IDF Matrix Shape - Training set:", train_features.shape)
print("TF-IDF Matrix Shape - Testing set:", test_features.shape)


TF-IDF Matrix Shape - Training set: (32345, 5000)
TF-IDF Matrix Shape - Testing set: (8087, 5000)


In [9]:
# Model selection and Training

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Build a linear-kernel SVM and train it on the TF-IDF training matrix
# (fit() returns the estimator itself, so construction and training are chained)
svm_model = SVC(kernel='linear', random_state=42).fit(train_features, train_labels)

# Score the held-out reviews
predictions = svm_model.predict(test_features)
report = classification_report(test_labels, predictions)
accuracy = accuracy_score(test_labels, predictions)

# Display evaluation metrics
print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("\nClassification Report:", report, sep="\n")


Model Evaluation:
Accuracy: 0.9028069741560529

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      4071
           1       0.90      0.90      0.90      4016

    accuracy                           0.90      8087
   macro avg       0.90      0.90      0.90      8087
weighted avg       0.90      0.90      0.90      8087



In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train an XGBoost classifier on the same TF-IDF features used for the SVM
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(train_features, train_labels)

# Evaluate on the held-out reviews
predictions = xgb_model.predict(test_features)

accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

print("accuracy of XGB: ", accuracy)
# The report is a multi-line table; printing it after a label on the same line pushes
# its header row out of alignment with the rows below, so it gets a line of its own.
print("classification_report:")
print(report)

  if is_sparse(data):


accuracy of XGB:  0.8832694447879312
classification_report:                precision    recall  f1-score   support

           0       0.88      0.89      0.88      4071
           1       0.89      0.87      0.88      4016

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087



In [12]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Stacked ensemble: a linear SVM and an XGBoost classifier as base learners, with a
# second XGBoost classifier combining their outputs.
#
# The base learners get their own names instead of reusing `svm_model` / `xgb_model`.
# StackingClassifier fits clones of the estimators it is given, so rebinding those names
# here would replace the models trained earlier with never-fitted objects, and any later
# use of `svm_model` (e.g. saving it with joblib) would pick up the unfitted one.
base_svm = SVC(kernel='linear', random_state=42)
base_xgb = XGBClassifier(random_state=42)

estimators = [('svm', base_svm), ('xgb', base_xgb)]
stacking_model = StackingClassifier(
    estimators=estimators,
    # Seeded like every other model in this notebook
    final_estimator=XGBClassifier(random_state=42),
)

stacking_model.fit(train_features, train_labels)

# Evaluate on the held-out reviews
predictions = stacking_model.predict(test_features)

accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

print("accuracy of ensemble: ", accuracy)
# The report is a multi-line table; printing it after a label on the same line pushes
# its header row out of alignment with the rows below, so it gets a line of its own.
print("classification_report:")
print(report)

accuracy of ensemble:  0.9103499443551378
classification_report:                precision    recall  f1-score   support

           0       0.91      0.91      0.91      4071
           1       0.91      0.91      0.91      4016

    accuracy                           0.91      8087
   macro avg       0.91      0.91      0.91      8087
weighted avg       0.91      0.91      0.91      8087



In [10]:
# Persist the artefacts needed to classify new reviews

import joblib

# Serialise the classifier held in `svm_model`. It has to be the *fitted* estimator when
# this cell runs, so make sure no intervening cell has rebound the name.
joblib.dump(svm_model, filename='model.pkl')

# Serialise the fitted TF-IDF vectorizer so new text is encoded with the same vocabulary
joblib.dump(tfidf_vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']