# Fake Product Review Detection using NLP techniques

Download Amazon Fake vs. Real Reviews Dataset from kaggle

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mario78/amazon-product-reviews")

!mv {path} /content/data/

Downloading from https://www.kaggle.com/api/v1/datasets/download/mario78/amazon-product-reviews?dataset_version_number=1...


100%|██████████| 2.82M/2.82M [00:00<00:00, 131MB/s]

Extracting files...





import necessary libraries

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [82]:
# Load dataset
df = pd.read_csv('/content/data/Amazon_Product.csv')
df.head()

Unnamed: 0,User_ID,Product_ID,Review_Text,Rating,Is_Fake,Timestamp,User_Location,Product_Category,User_Reputation,Review_Length,Review_Verified,Helpful_Votes,Sentiment_Score,Device_Used,User_Activity_Level,Language
0,5724,451,Terrible product. It broke after one use.,4,False,2023-12-19,"Sydney, China",Electronics,310,7,False,83,Neutral,Mobile,High,en
1,9974,122,Front position walk less. Tell administration ...,3,True,2023-04-02,"Berlin, Japan",Home,45,18,False,90,Positive,Tablet,High,en
2,2775,163,Kind Fenster darin was. Allein Wasser Vogel gi...,2,True,2024-03-03,"Tokyo, Australia",Sports,335,24,False,5,Negative,Mobile,High,de
3,6712,201,Explain evidence figure effort. Radio explain ...,3,True,2020-10-17,"New York, Canada",Toys,296,27,False,95,Neutral,Mobile,Medium,en
4,3320,154,Producto terrible. Se rompió después de un uso.,4,False,2022-12-15,"Mumbai, India",Food,398,8,True,61,Negative,Tablet,Low,es


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   User_ID              50000 non-null  int64 
 1   Product_ID           50000 non-null  int64 
 2   Review_Text          50000 non-null  object
 3   Rating               50000 non-null  int64 
 4   Is_Fake              50000 non-null  bool  
 5   Timestamp            50000 non-null  object
 6   User_Location        50000 non-null  object
 7   Product_Category     50000 non-null  object
 8   User_Reputation      50000 non-null  int64 
 9   Review_Length        50000 non-null  int64 
 10  Review_Verified      50000 non-null  bool  
 11  Helpful_Votes        50000 non-null  int64 
 12  Sentiment_Score      50000 non-null  object
 13  Device_Used          50000 non-null  object
 14  User_Activity_Level  50000 non-null  object
 15  Language             50000 non-null  object
dtypes: b

In [84]:
df.isnull().sum()

Unnamed: 0,0
User_ID,0
Product_ID,0
Review_Text,0
Rating,0
Is_Fake,0
Timestamp,0
User_Location,0
Product_Category,0
User_Reputation,0
Review_Length,0


In [85]:
corr_matrix = df.select_dtypes(include=['int64']).corr()
print(corr_matrix)

                  User_ID  Product_ID    Rating  User_Reputation  Review_Length  Helpful_Votes
User_ID          1.000000   -0.008128  0.000352        -0.004240       0.002657      -0.009369
Product_ID      -0.008128    1.000000 -0.000564        -0.000073       0.009937      -0.003902
Rating           0.000352   -0.000564  1.000000        -0.007306      -0.481471      -0.002463
User_Reputation -0.004240   -0.000073 -0.007306         1.000000       0.001964       0.006726
Review_Length    0.002657    0.009937 -0.481471         0.001964       1.000000       0.003685
Helpful_Votes   -0.009369   -0.003902 -0.002463         0.006726       0.003685       1.000000


In [86]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

float_columns = df.select_dtypes(include=['int64'])
vif_data = pd.DataFrame()
vif_data["Feature"] = float_columns.columns
vif_data["VIF"] = [variance_inflation_factor(float_columns.values, i) for i in range(len(float_columns.columns))]
print(vif_data)

           Feature       VIF
0          User_ID  3.669099
1       Product_ID  3.684047
2           Rating  5.720593
3  User_Reputation  3.870478
4    Review_Length  3.889737
5    Helpful_Votes  3.623689


In [87]:
df = df.drop(['User_ID', 'Product_ID', 'Timestamp', 'User_Location', 'Product_Category', 'Review_Length', 'Device_Used', 'Language'], axis=1)
df.head()

Unnamed: 0,Review_Text,Rating,Is_Fake,User_Reputation,Review_Verified,Helpful_Votes,Sentiment_Score,User_Activity_Level
0,Terrible product. It broke after one use.,4,False,310,False,83,Neutral,High
1,Front position walk less. Tell administration ...,3,True,45,False,90,Positive,High
2,Kind Fenster darin was. Allein Wasser Vogel gi...,2,True,335,False,5,Negative,High
3,Explain evidence figure effort. Radio explain ...,3,True,296,False,95,Neutral,Medium
4,Producto terrible. Se rompió después de un uso.,4,False,398,True,61,Negative,Low


In [88]:
df.Is_Fake.value_counts()

Unnamed: 0_level_0,count
Is_Fake,Unnamed: 1_level_1
False,25036
True,24964


In [42]:
df.Review_Verified.value_counts()

Unnamed: 0_level_0,count
Review_Verified,Unnamed: 1_level_1
True,25144
False,24856


In [89]:
df.Sentiment_Score.value_counts()

Unnamed: 0_level_0,count
Sentiment_Score,Unnamed: 1_level_1
Negative,16833
Neutral,16605
Positive,16562


In [90]:
df.User_Activity_Level.value_counts()

Unnamed: 0_level_0,count
User_Activity_Level,Unnamed: 1_level_1
Medium,16719
High,16646
Low,16635


In [91]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Is_Fake'] = label_encoder.fit_transform(df['Is_Fake']) # True: 1 False: 0
df['Review_Verified'] = label_encoder.fit_transform(df['Review_Verified'])  # True: 1 False: 0
df['Sentiment_Score'] = label_encoder.fit_transform(df['Sentiment_Score']) # Neutral: 1 Positive: 2 Negative: 1
df['User_Activity_Level'] = label_encoder.fit_transform(df['User_Activity_Level']) # High: 0 Medium: 2 Low: 1

In [92]:
df.head()

Unnamed: 0,Review_Text,Rating,Is_Fake,User_Reputation,Review_Verified,Helpful_Votes,Sentiment_Score,User_Activity_Level
0,Terrible product. It broke after one use.,4,0,310,0,83,1,0
1,Front position walk less. Tell administration ...,3,1,45,0,90,2,0
2,Kind Fenster darin was. Allein Wasser Vogel gi...,2,1,335,0,5,0,0
3,Explain evidence figure effort. Radio explain ...,3,1,296,0,95,1,2
4,Producto terrible. Se rompió después de un uso.,4,0,398,1,61,0,1


In [93]:
X = df.drop('Is_Fake', axis=1)
y = df['Is_Fake']

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
X_train.head()

Unnamed: 0,Review_Text,Rating,User_Reputation,Review_Verified,Helpful_Votes,Sentiment_Score,User_Activity_Level
39087,"C'est correct, ce n'est pas ce que j'attendais.",4,350,0,25,0,2
30893,यह उत्पाद अद्भुत है! पूरी तरह से सिफारिश करता ...,5,385,0,29,1,1
45278,Schlechtes Produkt. Es ist nach der ersten Ben...,4,283,0,48,2,1
16398,Vitae hic occaecati nostrum corporis corrupti ...,1,163,1,0,1,2
13653,"C'est correct, ce n'est pas ce que j'attendais.",5,80,0,11,1,2


In [96]:
# Create the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_text = vectorizer.fit_transform(X_train['Review_Text'])

# Transform the test data
X_test_text = vectorizer.transform(X_test['Review_Text'])

# Convert sparse matrices to dense arrays
X_train_text_dense = X_train_text.toarray()
X_test_text_dense = X_test_text.toarray()

#  Create new DataFrames for the vectorized text data
X_train_text_df = pd.DataFrame(X_train_text_dense, columns=vectorizer.get_feature_names_out(), index=X_train.index)
X_test_text_df = pd.DataFrame(X_test_text_dense, columns=vectorizer.get_feature_names_out(), index=X_test.index)

# Drop the original 'Review_Text' column and concatenate the new features
X_train = X_train.drop('Review_Text', axis=1)
X_train = pd.concat([X_train, X_train_text_df], axis=1)

X_test = X_test.drop('Review_Text', axis=1)
X_test = pd.concat([X_test, X_test_text_df], axis=1)

In [98]:
X_train.head()

Unnamed: 0,Rating,User_Reputation,Review_Verified,Helpful_Votes,Sentiment_Score,User_Activity_Level,ab,abandonner,abattre,abend,aber,ability,able,abord,about,above,abri,absence,absolu,absolument,accent,accept,accepter,accompagner,accomplir,accord,accorder,according,account,accrocher,accusamus,accusantium,accuser,achat,acheter,achever,acht,across,act,acte,action,activity,actually,ad,add,address,adipisci,admettre,administration,admit,...,émotion,énergie,énorme,épais,épaule,époque,éprouver,établir,étage,étaler,état,éteindre,étendre,étendue,éternel,étoile,étonner,étouffer,étrange,étranger,étroit,étude,étudier,été,éviter,événement,être,île,öffnen,über,überall,अच,अद,अप,अपन,उत,उपय,एक,करत,करन,खर,गय,णवत,तरह,नद,नह,बह,यह,वर,वह
39087,4,350,0,25,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30893,5,385,0,29,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.459283,0.0,0.0,0.395268,0.0,0.0,0.459283,0.0,0.0,0.0,0.0,0.459283,0.0,0.0,0.0,0.459283,0.0,0.0
45278,4,283,0,48,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16398,1,163,1,0,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13653,5,80,0,11,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [99]:
# evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9992
Confusion Matrix:
 [[5079    0]
 [   8 4913]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5079
           1       1.00      1.00      1.00      4921

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

