In [None]:
# Importing all necessary Modules, Libraries and Packages
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import spacy
import string

In [None]:
# Load the reviews CSV into a DataFrame and cast the review text column to
# str in the same expression, so every value in "text_" is a Python string.
df = pd.read_csv('/content/fake reviews dataset.csv').astype({"text_": str})

In [None]:
# Inspect the column names of the loaded frame.
df.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

In [None]:
# Display the loaded frame (pandas truncates the middle rows in the output).
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


In [None]:
# Performing Text Preprocessing

# Removing html tags
def remove_html(text):
    """Return the text content of ``text`` after parsing it with BeautifulSoup's lxml parser."""
    soup = BeautifulSoup(text, "lxml")
    return soup.get_text()

# Replace every review with its markup-free text.
df["text_"] = df["text_"].apply(remove_html)

  return BeautifulSoup(text, "lxml").text


In [None]:
# Removing any URLs
def remove_urls(text):
    """Return ``text`` with every ``http://``/``https://`` or ``www.`` token deleted.

    A match runs up to the next whitespace character; the whitespace around
    the removed token is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Run the URL stripper over every review, overwriting the column in place.
df["text_"] = df["text_"].apply(remove_urls)

In [None]:
# Lower-case every review so later token comparisons are case-insensitive,
# then display the frame to confirm the change.
df["text_"] = df["text_"].str.lower()
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"love this! well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. i..."
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back. i love the look and...
3,Home_and_Kitchen_5,1.0,CG,"missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,very nice set. good quality. we have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,i had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,i wasn't sure exactly what it would be. it is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"you can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,i liked nothing about this dress. the only rea...


In [None]:
# Strip every character that is neither a word character (letters, digits,
# underscore) nor whitespace. Apostrophes are removed as well, so contractions
# collapse into a single token (e.g. "i've" becomes "ive").
non_word_chars = re.compile(r"[^\w\s]")
df["text_"] = df["text_"].apply(lambda text: non_word_chars.sub("", text))
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...


In [None]:
# Importing and downloading some extra tools for preprocessing text data
import nltk
# English stopword lists, read later through stopwords.words("english").
nltk.download('stopwords')
# Tokenizer models; presumably what word_tokenize relies on — TODO confirm
# against the installed NLTK version.
nltk.download('punkt')
# NOTE(review): no cell visible in this notebook uses the two resources
# below; confirm whether they are still needed before removing them.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Removing Stopwords( eg: is, it, the, and, etc. )
# Build the English stopword set once. Creating it inside the function would
# reload the stopword list for every single review.
ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is split with ``word_tokenize``; tokens whose lower-cased form
    appears in NLTK's English stopword list are discarded and the remaining
    tokens are joined with single spaces.

    Args:
        text: The review text to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept_words = [
        word for word in word_tokenize(text)
        if word.lower() not in ENGLISH_STOPWORDS
    ]
    return " ".join(kept_words)

df["text_"] = df["text_"].apply(remove_stopwords)
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,CG,love great upgrade original ive mine couple years
2,Home_and_Kitchen_5,5.0,CG,pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,nice set good quality set two months
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,read reviews saying bra ran small ordered two ...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,wasnt sure exactly would little large small si...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,wear hood wear hood wear jacket without hood 3...
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,liked nothing dress reason gave 4 stars ordere...


In [None]:
# Stemming all words (converting all words into a smaller base like form)
# One shared stemmer instance: constructing a PorterStemmer per review is
# loop-invariant work.
PORTER_STEMMER = PorterStemmer()


def stem_text(text):
    """Return ``text`` with every token replaced by its Porter stem.

    The text is split with ``word_tokenize`` and the stemmed tokens are joined
    back together with single spaces.

    Args:
        text: The review text to stem.

    Returns:
        The stemmed text as a single space-separated string.
    """
    return " ".join(PORTER_STEMMER.stem(word) for word in word_tokenize(text))

# Keep the unstemmed text in "text_" and store the stems in a new column.
df["text_stemmed"] = df["text_"].apply(stem_text)
df

Unnamed: 0,category,rating,label,text_,text_stemmed
0,Home_and_Kitchen_5,5.0,CG,love well made sturdy comfortable love itvery ...,love well made sturdi comfort love itveri pretti
1,Home_and_Kitchen_5,5.0,CG,love great upgrade original ive mine couple years,love great upgrad origin ive mine coupl year
2,Home_and_Kitchen_5,5.0,CG,pillow saved back love look feel pillow,pillow save back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,missing information use great product price,miss inform use great product price
4,Home_and_Kitchen_5,5.0,CG,nice set good quality set two months,nice set good qualiti set two month
...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,read reviews saying bra ran small ordered two ...,read review say bra ran small order two band c...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,wasnt sure exactly would little large small si...,wasnt sure exactli would littl larg small size...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,wear hood wear hood wear jacket without hood 3...,wear hood wear hood wear jacket without hood 3...
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,liked nothing dress reason gave 4 stars ordere...,like noth dress reason gave 4 star order size ...


In [None]:
# Model input: the stemmed review text.
X = df['text_stemmed']
def label(text):
    """Encode a review label as an integer class.

    Args:
        text: Raw label value, either "CG" or "OR" (presumably
            computer-generated vs. original review — TODO confirm against the
            dataset description).

    Returns:
        0 for "CG", 1 for "OR".

    Raises:
        ValueError: If ``text`` is any other value. Previously such values
            silently mapped to None, which turned into NaN in the target.
    """
    if text == "CG":
        return 0
    if text == "OR":
        return 1
    raise ValueError(f"Unexpected review label: {text!r}")
# Target vector: each row's label passed through label().
Y = df['label'].apply(label)
# Show the features (printed) followed by the targets (cell output).
print(X)
Y

0         love well made sturdi comfort love itveri pretti
1             love great upgrad origin ive mine coupl year
2                   pillow save back love look feel pillow
3                      miss inform use great product price
4                      nice set good qualiti set two month
                               ...                        
40427    read review say bra ran small order two band c...
40428    wasnt sure exactli would littl larg small size...
40429    wear hood wear hood wear jacket without hood 3...
40430    like noth dress reason gave 4 star order size ...
40431    work wed industri work long day feet outsid he...
Name: text_stemmed, Length: 40432, dtype: object


0        0
1        0
2        0
3        0
4        0
        ..
40427    1
40428    0
40429    1
40430    0
40431    1
Name: label, Length: 40432, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
# Stage 1: token counts, fitted on the training reviews only.
vectorizer = CountVectorizer()
X_cv_train = vectorizer.fit_transform(X_train)

# Stage 2: TF-IDF weighting of those counts, again fitted on the training
# split only.
tfidf_transformer = TfidfTransformer()
X_tf_train = tfidf_transformer.fit_transform(X_cv_train)

print(X_tf_train)

  (0, 31957)	0.26233018884961334
  (0, 31539)	0.49909261045684516
  (0, 20672)	0.3771391023535766
  (0, 19574)	0.34995601708399365
  (0, 17366)	0.3285824713488025
  (0, 17156)	0.27934651354996504
  (0, 10748)	0.48102537009801954
  (1, 31957)	0.05383431733693515
  (1, 31947)	0.08885470077679844
  (1, 31867)	0.05681699455513609
  (1, 29756)	0.07056333998682619
  (1, 28679)	0.07056333998682619
  (1, 27385)	0.18128977483274253
  (1, 27319)	0.07311825284954804
  (1, 25762)	0.09070535336085223
  (1, 25417)	0.22781017072424417
  (1, 24328)	0.10707936411285397
  (1, 24036)	0.07597781962092617
  (1, 23535)	0.12205522282571474
  (1, 23496)	0.08390757986760962
  (1, 23436)	0.1855959343885482
  (1, 23372)	0.46800772541237773
  (1, 23106)	0.10317317183422847
  (1, 21499)	0.08657508109622562
  (1, 21034)	0.1205885380026169
  :	:
  (32342, 16070)	0.3260040620529231
  (32342, 14415)	0.18558595070887288
  (32342, 12103)	0.22802622856660487
  (32342, 11843)	0.22802622856660487
  (32342, 8427)	0.27727207

In [None]:
import pickle
# Persist both fitted text transformers, one pickle file per object.
for artifact_path, artifact in (
    ('CVectorizer.pkl', vectorizer),
    ('TfidfTransformer.pkl', tfidf_transformer),
):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
# Transform (do not re-fit) the test reviews with the already-fitted
# vectorizer and TF-IDF transformer.
X_cv_test = vectorizer.transform(X_test)
X_tf_test = tfidf_transformer.transform(X_cv_test)

In [None]:
# Logistic regression baseline. With the default iteration budget the solver
# stops before converging on this data (scikit-learn reports that the
# iteration limit was reached), so a larger max_iter is given.
lrm = LogisticRegression(max_iter=1000)
lrm.fit(X_tf_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
Y_pred_lr = lrm.predict(X_tf_test)

# Score the logistic-regression predictions: accuracy, F1, precision, recall.
acc_lr, f1_lr, prec_lr, rec_lr = (
    metric(Y_test, Y_pred_lr)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# Multinomial naive Bayes with default settings, fitted on the TF-IDF
# training matrix.
mul = MultinomialNB()
mul.fit(X_tf_train, Y_train)

In [None]:
Y_pred_ml = mul.predict(X_tf_test)

# Score the naive-Bayes predictions: accuracy, F1, precision, recall.
acc_ml, f1_ml, prec_ml, rec_ml = (
    metric(Y_test, Y_pred_ml)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# k-nearest-neighbours classifier with default settings, fitted on the
# TF-IDF training matrix.
knc = KNeighborsClassifier()
knc.fit(X_tf_train, Y_train)

In [None]:
# Evaluate the k-nearest-neighbours model. The predictions must come from
# `knc`; using `dtc` here raises NameError on a fresh kernel (it is defined
# in a later cell) and otherwise reports decision-tree scores as KNN scores.
Y_pred_kn = knc.predict(X_tf_test)

acc_kn = accuracy_score(Y_test, Y_pred_kn)
f1_kn = f1_score(Y_test, Y_pred_kn)
prec_kn = precision_score(Y_test, Y_pred_kn)
rec_kn = recall_score(Y_test, Y_pred_kn)

In [None]:
# Decision tree. random_state is fixed (same seed as the train/test split)
# so the fitted tree and its metrics are repeatable across runs.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_tf_train, Y_train)

In [None]:
Y_pred_dt = dtc.predict(X_tf_test)

# Score the decision-tree predictions: accuracy, F1, precision, recall.
acc_dt, f1_dt, prec_dt, rec_dt = (
    metric(Y_test, Y_pred_dt)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# Linear SVM; this is the model that is pickled at the end of the notebook.
# random_state is fixed (same seed as the train/test split) so that any
# randomised data shuffling inside the solver is repeatable across runs.
svc = LinearSVC(random_state=42)
svc.fit(X_tf_train, Y_train)

In [None]:
Y_pred_sv = svc.predict(X_tf_test)

# Score the linear-SVM predictions: accuracy, F1, precision, recall.
acc_sv, f1_sv, prec_sv, rec_sv = (
    metric(Y_test, Y_pred_sv)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# Random forest. random_state is fixed (same seed as the train/test split)
# so the fitted forest and its metrics are repeatable across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_tf_train, Y_train)

In [None]:
Y_pred_rf = rfc.predict(X_tf_test)

# Score the random-forest predictions: accuracy, F1, precision, recall.
acc_rf, f1_rf, prec_rf, rec_rf = (
    metric(Y_test, Y_pred_rf)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# Side-by-side comparison table, one row per model.
# The random-forest, decision-tree and k-neighbours rows are left commented
# out, so those three models do not appear in the table.
metric_columns = ['Model', 'Accuracy', 'F1','Precision', 'Recall']
model_rows = [
    ['Logistic Regression', acc_lr, f1_lr, prec_lr, rec_lr],
    ['MultinomialNB', acc_ml, f1_ml, prec_ml, rec_ml],
    #['RandomForestClassifier', acc_rf, f1_rf, prec_rf, rec_rf],
    ['SVC', acc_sv, f1_sv, prec_sv, rec_sv],
    #['DecisionTreeClassifier', acc_dt, f1_dt, prec_dt, rec_dt],
    #['KNeighborsClassifier', acc_kn, f1_kn, prec_kn, rec_kn]
]
results = pd.DataFrame(model_rows, columns=metric_columns)
results

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.863114,0.864055,0.850992,0.877526
1,MultinomialNB,0.854087,0.847427,0.879732,0.817411
2,SVC,0.874985,0.875046,0.880597,0.869565


In [None]:
import pickle

# Save the trained model and the two fitted text transformers it depends on.
# The fitted `tfidf_transformer` instance is dumped, not the
# `TfidfTransformer` class: pickling the class stores only a reference to it
# and none of the learned IDF weights.
# NOTE(review): pickle files can execute code on load; only load these
# artefacts from a trusted location.
with open('FakeReviewDetectionModel.pkl', 'wb') as file:
    pickle.dump(svc, file)
with open('CVectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)
with open('TfidfTransformer.pkl', 'wb') as file:
    pickle.dump(tfidf_transformer, file)