STEP 1:

DATA CLEANING

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('fakeReviewData.csv')  # Adjust for file type, e.g., .csv or .xlsx

In [5]:
print(df.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [6]:
#here we can see what the data looks like, the shape, the number of null values, column names

print(df.shape)
print(df.columns)
print(df.isnull().sum())
print(df.head())

(40432, 4)
Index(['category', 'rating', 'label', 'text_'], dtype='object')
category    0
rating      0
label       0
text_       0
dtype: int64
             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [7]:
print(df['category'].unique())
print(df['label'].unique())
print(df['rating'].unique())

['Home_and_Kitchen_5' 'Sports_and_Outdoors_5' 'Electronics_5'
 'Movies_and_TV_5' 'Tools_and_Home_Improvement_5' 'Pet_Supplies_5'
 'Kindle_Store_5' 'Books_5' 'Toys_and_Games_5'
 'Clothing_Shoes_and_Jewelry_5']
['CG' 'OR']
[5. 1. 3. 2. 4.]


In [8]:
#in this dataset, we have every datapoint as a review at an online marketplace.
#here, we can afford to drop duplicates since a duplicate review doesn't hold value for us.

df = df.drop_duplicates()


STEP 2:

TEXT NORMALIZATION

In [9]:
#here we'll work on the review text itself.
#we want to perform the following:
#1. remove numbers, punctuations, special charactesr
#2. convert all upper case characters to lower case


import re

def clean_text(value):
    value = value.lower()
    value = re.sub(r'[^a-z\s]', '', value)
    return value

df['text_'] = df['text_'].apply(clean_text)

In [10]:
print(df["text_"].head(15))

0     love this  well made sturdy and very comfortab...
1     love it a great upgrade from the original  ive...
2     this pillow saved my back i love the look and ...
3     missing information on how to use it but it is...
4     very nice set good quality we have had the set...
5           i wanted different flavors but they are not
6     they are the perfect touch for me and the only...
7     these done fit well and look great  i love the...
8     great big numbers  easy to read the only thing...
9     my son loves this comforter and it is very wel...
10    as advertised th one ive had the only problem ...
11    very handy for one of my kids and the tools ar...
12    did someone say oriental for   it is a great p...
13    these are so flimsy they are not the quality y...
14    makes may tea with out stirring the only probl...
Name: text_, dtype: object


STEP 3:

TOKENIZATION


In [12]:
import spacy
from tqdm import tqdm
tqdm.pandas()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Tokenize with progress bar
df['tokens'] = df['text_'].progress_apply(lambda x: [token.text for token in nlp(x)])


100%|██████████| 40420/40420 [13:08<00:00, 51.28it/s]


In [13]:
print(df.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  
0  [love, this,  , well, made, sturdy, and, very,...  
1  [love, it, a, great, upgrade, from, the, origi...  
2  [this, pillow, saved, my, back, i, love, the, ...  
3  [missing, information, on, how, to, use, it, b...  
4  [very, nice, set, good, quality, we, have, had...  


the above code snippet took 15 mins to run. i'll now be downloading this dataset to prevent having to run it every time.

In [14]:
# Export to a CSV file
df.to_csv('fakeReviewDataTokenized.csv', index=False)  # Set index=False to avoid saving the index column


In [15]:
df = pd.read_csv("fakeReviewDataTokenized.csv")

STEP 4:

STOPWORD REMOVAL


In [16]:
import ast

# Convert stringified lists to actual lists
df['tokens'] = df['tokens'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Verify the output
print(df['tokens'].iloc[0])  # Check the first row's tokens
print(type(df['tokens'].iloc[0]))  # Confirm type is a list


['love', 'this', ' ', 'well', 'made', 'sturdy', 'and', 'very', 'comfortable', ' ', 'i', 'love', 'itvery', 'pretty']
<class 'list'>


In [19]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Remove stopwords
df['tokens_cleaned'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word.lower() not in stop_words])

# Verify cleaned tokens
print(df['tokens_cleaned'].iloc[0])  # Check the first row's cleaned tokens
print(type(df['tokens_cleaned'].iloc[0]))  # Confirm type is a list


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['love', ' ', 'well', 'made', 'sturdy', 'comfortable', ' ', 'love', 'itvery', 'pretty']
<class 'list'>


In [20]:
print(df.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  \
0  [love, this,  , well, made, sturdy, and, very,...   
1  [love, it, a, great, upgrade, from, the, origi...   
2  [this, pillow, saved, my, back, i, love, the, ...   
3  [missing, information, on, how, to, use, it, b...   
4  [very, nice, set, good, quality, we, have, had...   

                                      tokens_cleaned  
0  [love,  , well, made, sturdy, com

In [21]:
# Filter out spaces and empty strings
df['tokens_cleaned'] = df['tokens_cleaned'].apply(lambda tokens: [word for word in tokens if word.strip()])
print(df[['tokens', 'tokens_cleaned']].head())


                                              tokens  \
0  [love, this,  , well, made, sturdy, and, very,...   
1  [love, it, a, great, upgrade, from, the, origi...   
2  [this, pillow, saved, my, back, i, love, the, ...   
3  [missing, information, on, how, to, use, it, b...   
4  [very, nice, set, good, quality, we, have, had...   

                                      tokens_cleaned  
0  [love, well, made, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, mine, couple,...  
2    [pillow, saved, back, love, look, feel, pillow]  
3  [missing, information, use, great, product, pr...  
4       [nice, set, good, quality, set, two, months]  


STEP 5.1:

STEMMING


In [22]:
from nltk.stem import PorterStemmer

# Initialize the stemmer
stemmer = PorterStemmer()

# Apply stemming to the cleaned tokens
df['tokens_stemmed'] = df['tokens_cleaned'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])

# Verify the results
print(df[['tokens_cleaned', 'tokens_stemmed']].head())



                                      tokens_cleaned  \
0  [love, well, made, sturdy, comfortable, love, ...   
1  [love, great, upgrade, original, mine, couple,...   
2    [pillow, saved, back, love, look, feel, pillow]   
3  [missing, information, use, great, product, pr...   
4       [nice, set, good, quality, set, two, months]   

                                      tokens_stemmed  
0  [love, well, made, sturdi, comfort, love, itve...  
1   [love, great, upgrad, origin, mine, coupl, year]  
2     [pillow, save, back, love, look, feel, pillow]  
3         [miss, inform, use, great, product, price]  
4        [nice, set, good, qualiti, set, two, month]  


STEP 5.2:

LEMMATIZATION

In [23]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Apply lemmatization
df['tokens_lemmatized'] = df['tokens_cleaned'].apply(lambda tokens: [token.lemma_ for token in nlp(" ".join(tokens))])

# Verify results
print(df[['tokens_stemmed', 'tokens_lemmatized']].head())


                                      tokens_stemmed  \
0  [love, well, made, sturdi, comfort, love, itve...   
1   [love, great, upgrad, origin, mine, coupl, year]   
2     [pillow, save, back, love, look, feel, pillow]   
3         [miss, inform, use, great, product, price]   
4        [nice, set, good, qualiti, set, two, month]   

                                   tokens_lemmatized  
0  [love, well, make, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, mine, couple,...  
2     [pillow, save, back, love, look, feel, pillow]  
3    [miss, information, use, great, product, price]  
4        [nice, set, good, quality, set, two, month]  


In [24]:
print(df[['tokens_stemmed', 'tokens_lemmatized']].head())


                                      tokens_stemmed  \
0  [love, well, made, sturdi, comfort, love, itve...   
1   [love, great, upgrad, origin, mine, coupl, year]   
2     [pillow, save, back, love, look, feel, pillow]   
3         [miss, inform, use, great, product, price]   
4        [nice, set, good, qualiti, set, two, month]   

                                   tokens_lemmatized  
0  [love, well, make, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, mine, couple,...  
2     [pillow, save, back, love, look, feel, pillow]  
3    [miss, information, use, great, product, price]  
4        [nice, set, good, quality, set, two, month]  


In [25]:
df.to_csv('fakeReviewDataLemmatized.csv', index=False)  # Set index=False to avoid saving the index column


In [26]:
import pandas as pd
df = pd.read_csv("fakeReviewDataLemmatized.csv")

6:

VECTORIZATION

In [28]:
import ast

# Convert stringified lists to actual lists
df['tokens'] = df['tokens'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Verify the output
print(df['tokens'].iloc[0])  # Check the first row's tokens
print(type(df['tokens'].iloc[0]))  # Confirm type is a list


['love', 'this', ' ', 'well', 'made', 'sturdy', 'and', 'very', 'comfortable', ' ', 'i', 'love', 'itvery', 'pretty']
<class 'list'>


In [29]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokens
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Calculate sentence embeddings (average of word vectors)
df['sentence_embedding'] = df['tokens'].apply(
    lambda tokens: sum(word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv) / len(tokens)
)

# Verify the embeddings
print(df['sentence_embedding'].head())


0    [-1.2240342, -0.72104365, 0.41656366, -0.40086...
1    [-1.0465335, 0.033930317, 0.33292958, 0.469406...
2    [-1.4938077, 0.27488175, 0.8935994, -0.6465594...
3    [-0.9308116, -0.029670544, 0.5803284, -0.62010...
4    [-0.61544967, 0.12781528, 0.5377493, 0.0713582...
Name: sentence_embedding, dtype: object


In [30]:
# Save the Word2Vec model
word2vec_model.save("word2vec_model.model")
