In [1]:
import pandas as pd
import contractions
import re

import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

from tqdm import tqdm

tqdm.pandas()


In [2]:
# Fetch the NLTK resources used further down; packages that are already
# installed are reported as up-to-date rather than downloaded again.
nltk.download("punkt")  # tokenizer models needed by word_tokenize
nltk.download("wordnet")  # lexical database behind WordNetLemmatizer
nltk.download("stopwords")  # word lists served by stopwords.words("english")
nltk.download("averaged_perceptron_tagger")  # tagger model needed by pos_tag


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\trfdeer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\trfdeer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trfdeer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\trfdeer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Load the merged review dataset.
df = pd.read_csv("../data/Merged/reviews_div.csv")

# read_csv represents empty cells as NaN, and astype(str) alone would turn
# those into the literal word "nan", which would later be cleaned and counted
# as a real token. Blank missing values first, then coerce whatever non-string
# values remain (e.g. purely numeric titles) to str.
df["review_text"] = df["review_text"].fillna("").astype(str)
df["review_title"] = df["review_title"].fillna("").astype(str)

df


Unnamed: 0,review_title,review_text,review_rating,review_type
0,Hey!,Whats there to say about a fantastic chocolate...,5,positive
1,Tasty & Nutritious in one easy to administer p...,My 15 month-old twin boys aren't big fans of s...,5,positive
2,Absolutely Tasty,"Okay, so these little cans aren't cheap, but w...",5,positive
3,A Good Daily Roast,This coffee has become one of my daily favorit...,5,positive
4,Newman's Own Turkey & Vegetable Catfood,I have four cats with differing tastebuds/like...,5,positive
...,...,...,...,...
29995,Has an artificial vanilla flavor,Sorry but I was looking for a nice madagascar ...,1,negative
29996,Still waiting.......,I ordered this item in AUGUST and i am a premi...,1,negative
29997,Expensive!,Do yourself a favor and go to the nearest supe...,1,negative
29998,Yucky Vegetable Smoothie Add-in,"Gerber changed the recipe for the worse, this ...",2,negative


In [4]:
# Build-once resources shared by every clean_text call.
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words("english"))

# Compiled patterns whose matches are deleted from the text
# (currently a single non-greedy "<...>" pattern, i.e. markup-style tags).
pats = [re.compile("<.*?>")]


# pos_tag returns Penn Treebank tags, whose adjective tags start with "J"
# (JJ, JJR, JJS), while WordNetLemmatizer expects "a" for adjectives. Map the
# lowercased first letter of the tag to the WordNet POS code; anything not
# listed falls back to the lemmatizer's default ("n").
_PENN_INITIAL_TO_WORDNET = {"j": "a", "n": "n", "v": "v"}

# Words kept even if the stopword list contains them.
_KEPT_STOPWORDS = {"go", "do", "no", "not"}


def clean_text(text: str, remove_stopwords_only: bool = False) -> str:
    """Normalise a review string in one of two modes.

    Default mode (``remove_stopwords_only=False``):
        expand contractions, lowercase, collapse whitespace runs to a single
        space, delete every match of the module-level ``pats`` patterns, then
        tokenize, POS-tag and lemmatize each token. Punctuation and stopwords
        are left in place.

    Stopword mode (``remove_stopwords_only=True``):
        delete all ``string.punctuation`` characters, tokenize, and drop every
        token found in the module-level ``stop`` set, except the words in
        ``_KEPT_STOPWORDS``. No lemmatization is performed in this mode.

    Args:
        text: The string to clean.
        remove_stopwords_only: Selects stopword mode instead of default mode.

    Returns:
        The processed tokens joined by single spaces.
    """
    if remove_stopwords_only:
        text = text.translate(str.maketrans("", "", punctuation))
        kept_words = [
            word
            for word in word_tokenize(text)
            if word.lower() not in stop or word.lower() in _KEPT_STOPWORDS
        ]
        # The filtered tokens must be joined back into the result; returning
        # the untouched `text` here would leave every stopword in place.
        return " ".join(kept_words)

    text = str(contractions.fix(text))
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    for pat in pats:
        text = pat.sub("", text)

    lemmas = [
        lemmatizer.lemmatize(
            token, _PENN_INITIAL_TO_WORDNET.get(tag[0].lower(), "n")
        )
        for token, tag in pos_tag(word_tokenize(text))
    ]
    return " ".join(lemmas)


In [5]:
# First pass: run clean_text in its default mode over both free-text columns,
# with a progress bar per column.
for source_column, cleaned_column in (
    ("review_text", "review_text_cleaned"),
    ("review_title", "review_title_cleaned"),
):
    df[cleaned_column] = df[source_column].progress_map(clean_text)


100%|██████████| 30000/30000 [01:44<00:00, 285.98it/s]
100%|██████████| 30000/30000 [00:13<00:00, 2184.88it/s]


In [6]:
# Checkpoint the frame after the first cleaning pass only, i.e. before the
# stopword/punctuation pass has been applied to the cleaned columns.
df.to_csv(path_or_buf="../data/Merged/reviews_cleaned_nostop.csv", index=False)


In [7]:
def get_punct_count(text):
    """Return how many characters of ``text`` are in ``string.punctuation``.

    The punctuation set is passed through ``re.escape`` before being placed in
    a character class. Interpolated raw, its ``\\]`` pair is read by the regex
    engine as an escaped ``]``, so backslashes are silently left out of the
    count, and the class only works at all because of where ``^`` and ``-``
    happen to sit in the string.

    Args:
        text: The string to inspect.

    Returns:
        The number of punctuation characters found (0 for an empty string).
    """
    return len(re.findall(f"[{re.escape(punctuation)}]", text))


# Count punctuation on the cleaned review text while it still contains
# punctuation (the stopword pass strips it from this column afterwards).
df["num_special_chars"] = df["review_text_cleaned"].map(get_punct_count)


In [8]:
# Second pass: re-process the already-cleaned columns in place with
# clean_text's remove_stopwords_only mode, text column first, then title.
for cleaned_column in ("review_text_cleaned", "review_title_cleaned"):
    df[cleaned_column] = df[cleaned_column].progress_map(
        lambda cleaned: clean_text(cleaned, remove_stopwords_only=True)
    )


100%|██████████| 30000/30000 [00:05<00:00, 5404.04it/s]
100%|██████████| 30000/30000 [00:01<00:00, 22364.95it/s]


In [9]:
# Length features: number of whitespace-separated tokens in each cleaned column.
for length_column, cleaned_column in (
    ("title_length", "review_title_cleaned"),
    ("text_length", "review_text_cleaned"),
):
    df[length_column] = df[cleaned_column].apply(
        lambda cleaned: len(cleaned.split())
    )


In [10]:
# Keep only the cleaned text, the derived features and the label, in this
# order, then write the final dataset without the index.
output_columns = [
    "review_title_cleaned",
    "review_text_cleaned",
    "title_length",
    "text_length",
    "num_special_chars",
    "review_type",
]
df = df[output_columns]
df.to_csv("../data/Merged/reviews_cleaned.csv", index=False)
