In [3]:
import pandas as pd
import re

# Step 1: Load the file
file = 'Reviews.csv'
df = pd.read_csv(file)

In [4]:
# Step 2: Convert the text to lower case
df["lowercased"] = df["Text"].astype(str).str.lower()

In [5]:
# Step 3: Remove slangs from the text
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing",
    "w": "win",
    "im": "i'm"
}

def replace_slang(text):
    escaped_slang_words = []
    for word in slang_dict.keys():
        escaped_word = re.escape(word)
        escaped_slang_words.append(escaped_word)
    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'
    
    def replace_match(match):
        slang_word = match.group(0)
        return slang_dict[slang_word.lower()]
    replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)
    return replaced_text

df['slangs_replaced'] = df['lowercased'].apply(replace_slang)

In [6]:
# Step 4: Replace contrations in the text with full clause
contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

escaped_contractions = []

for contraction in contractions_dict.keys():
    escaped_contraction = re.escape(contraction)
    escaped_contractions.append(escaped_contraction)

joined_contractions = "|".join(escaped_contractions)
contractions_pattern = r'\b(' + joined_contractions + r')\b'
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

def replaced_contractions(text):
    def replace_match(match):
        matched_word = match.group(0)  # Extract matched contraction
        lower_matched_word = matched_word.lower()  # Convert to lowercase
        expanded_form = contractions_dict[lower_matched_word]  # Get full form from dictionary
        return expanded_form  # Return the expanded form

    expanded_text = compiled_pattern.sub(replace_match, text)

    return expanded_text

df['contractions_replaced'] = df['slangs_replaced'].apply(replaced_contractions)

In [7]:
# Step 5: Remove punctuation from text
import string
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)

# Step 6: Remove numbers form the text
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df['numbers_removed'] = df['punctuations_removed'].apply(remove_numbers)

In [8]:
# Step 7: Perform autocorrect on the text
from autocorrect import Speller

spell = Speller(lang='en')
'''
def correct_spelling(text):
    return spell(text)
'''
df['spelling_corrected'] = df['numbers_removed'].apply(correct_spelling)


NameError: name 'correct_spelling' is not defined

In [14]:
# Step 8: Remove emojis from the text
import emoji
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

df['emoji_removed'] = df['numbers_removed'].apply(remove_emojis)

# Step 9: Remove stopwords from the text
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = []
    for word in words:
        lower_word = word.lower()
        if lower_word not in stop_words:
            filtered_words.append(word)
    return " ".join(filtered_words)

df["stopwords_removed"] = df["emoji_removed"].apply(remove_stopwords)

# Step 10: Stem all the text
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text):
    if not isinstance(text, str):
        return ""
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

df["stemmed_words"] = df["stopwords_removed"].apply(stem_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Step 11: Lemmatize the text
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    if not isinstance(text, str):
        return ""

    words = word_tokenize(text)
    pos_tags = pos_tag(words)

    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return " ".join(lemmatized_words)

df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
# Step 12: Tokenize all the text
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenize_text(text):
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

df["tokenized"] = df["lemmatized"].apply(tokenize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,lowercased,slangs_replaced,contractions_replaced,punctuations_removed,numbers_removed,emoji_removed,stopwords_removed,stemmed_words,lemmatized,tokenized
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,buy several vitality can dog food product find...,"[buy, several, vitality, can, dog, food, produ..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled jumbo salted peanutsth...,product arriv label jumbo salt peanutsth peanu...,product arrive labeled jumbo salt peanutsthe p...,"[product, arrive, labeled, jumbo, salt, peanut..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,confection around centuries light pillowy citr...,confect around centuri light pillowi citru gel...,confection around century light pillowy citrus...,"[confection, around, century, light, pillowy, ..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,look secret ingredi robitussin believ found go...,look secret ingredient robitussin believe find...,"[look, secret, ingredient, robitussin, believe..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy at a great price. there was a wid...,great taffy at a great price. there was a wid...,great taffy at a great price. there was a wid...,great taffy at a great price there was a wide...,great taffy at a great price there was a wide...,great taffy at a great price there was a wide...,great taffy great price wide assortment yummy ...,great taffi great price wide assort yummi taff...,great taffy great price wide assortment yummy ...,"[great, taffy, great, price, wide, assortment,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...,great for sesame chicken..this is a good if no...,great for sesame chicken..this is a good if no...,great for sesame chicken..this is a good if no...,great for sesame chickenthis is a good if not ...,great for sesame chickenthis is a good if not ...,great for sesame chickenthis is a good if not ...,great sesame chickenthis good better resturant...,great sesam chickenthi good better restur eate...,great sesame chickenthis good good resturants ...,"[great, sesame, chickenthis, good, good, restu..."
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...,i'm disappointed with the flavor. the chocolat...,i'm disappointed with the flavor. the chocolat...,i am disappointed with the flavor. the chocola...,i am disappointed with the flavor the chocolat...,i am disappointed with the flavor the chocolat...,i am disappointed with the flavor the chocolat...,disappointed flavor chocolate notes especially...,disappoint flavor chocol note especi weak milk...,disappointed flavor chocolate note especially ...,"[disappointed, flavor, chocolate, note, especi..."
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o...","these stars are small, so you can give 10-15 o...","these stars are small, so you can give 10-15 o...","these stars are small, so you can give 10-15 o...",these stars are small so you can give 1015 of ...,these stars are small so you can give of thos...,these stars are small so you can give of thos...,stars small give one training session tried tr...,star small give one train session tri train do...,star small give one training session try train...,"[star, small, give, one, training, session, tr..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...,these are the best treats for training and rew...,these are the best treats for training and rew...,these are the best treats for training and rew...,these are the best treats for training and rew...,these are the best treats for training and rew...,these are the best treats for training and rew...,best treats training rewarding dog good groomi...,best treat train reward dog good groom lower c...,best treat train reward dog good groom low cal...,"[best, treat, train, reward, dog, good, groom,..."


In [18]:
# Finally: Save the data to CSV file 
df[['tokenized', 'lemmatized']].to_csv("cleaned text data for Amazon food reviews.csv", index=False)
print("Successfully cleaned the text data and saved into a CSV file!!")

Successfully cleaned the text data and saved into a CSV file!!
