In [9]:
!pip install autocorrect emoji beautifulsoup4 pandas nltk

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[2K     [38;5;70m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m6.4 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: autocorrect
  Building wheel for autocorrect (pyproject.toml) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622414 sha256=45e30749344ba5bde1b7fddaff403ed767b0773086e67c704977314a06e9af72
  Stored in directory: /home/1913c543-331c-4721-95ae-6d87e099a045/.cache/pip/wheels/b6/28/c2/9ddf8f57f871b55b6fd0ab99c887531fb9a66e5ff236b82aee
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully i

In [10]:
# Import libraries
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download required NLTK data.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab') # Required for newer NLTK versions

# POS tagger: newer NLTK releases ship 'averaged_perceptron_tagger_eng', older
# ones only know 'averaged_perceptron_tagger'. nltk.download() reports a failed
# download by returning False instead of raising, so the fallback must be driven
# by the return value; the except clause only covers unexpected errors and is
# narrowed to Exception so Ctrl-C is not swallowed.
try:
    tagger_downloaded = nltk.download('averaged_perceptron_tagger_eng')
except Exception:
    tagger_downloaded = False
if not tagger_downloaded:
    nltk.download('averaged_perceptron_tagger')

# Initialize tools
# Module-level singletons shared by the helper functions below; the three
# objects are independent of each other.
lemmatizer = WordNetLemmatizer()                 # used by lemmatize_text
stop_words = set(stopwords.words('english'))     # set for O(1) membership tests
spell = Speller(lang='en')                       # used by correct_spelling

# Slang and Contractions Dictionaries
# Keys are lower-case; lookups elsewhere lower-case the token before indexing.
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "i do not know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
}

contractions_dict = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "i'm": "i am",
    "it's": "it is",
    "you're": "you are",
    "they're": "they are",
}

# Build contractions regex
# A single alternation of every (escaped) contraction key, wrapped in word
# boundaries and matched case-insensitively.
pattern = r'\b({})\b'.format("|".join(map(re.escape, contractions_dict)))
compiled_pattern = re.compile(pattern, re.IGNORECASE)

# --- Helper Functions ---

def remove_urls(text):
    """Return `text` with web addresses removed.

    A URL is either ``http://…`` / ``https://…`` or a token starting with
    ``www.``; in both cases everything up to the next whitespace is dropped.
    The scheme separator and the dot after ``www`` are required so that
    ordinary words which merely contain those letters (``httpd``, ``awwww``)
    are left intact. Matching is case-insensitive. Surrounding whitespace is
    not collapsed.
    """
    return re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

def remove_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return only its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_emojis(text):
    """Return `text` after `emoji.replace_emoji` has replaced every emoji with an empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def replace_slang(text):
    """Expand slang abbreviations found in `slang_dict`.

    The text is split on whitespace and re-joined with single spaces. Each
    token is looked up case-insensitively after its leading and trailing
    punctuation has been set aside, so "lol!" or "(tbh," are expanded too;
    the set-aside punctuation is re-attached around the expansion. Tokens
    that are not slang are returned unchanged.
    """
    def _expand(token):
        # Isolate the word from any punctuation glued to its edges.
        core = token.strip(string.punctuation)
        if not core:
            return token
        expansion = slang_dict.get(core.lower())
        if expansion is None:
            return token
        lead = len(token) - len(token.lstrip(string.punctuation))
        return token[:lead] + expansion + token[lead + len(core):]

    return " ".join(_expand(token) for token in text.split())

def replace_contractions(text):
    """Expand the contractions listed in `contractions_dict`.

    Typographic apostrophes (U+2019, as produced by phones and word
    processors) are first converted to the ASCII apostrophe used by the
    dictionary keys, so "don’t" is expanded just like "don't". Matching is
    done with `compiled_pattern`; the matched text is lower-cased before the
    dictionary lookup.
    """
    normalized = text.replace("\u2019", "'")
    return compiled_pattern.sub(lambda m: contractions_dict[m.group(0).lower()], normalized)

def remove_punctuation(text):
    """Drop every character of `string.punctuation` from `text`; nothing is inserted in its place."""
    kept_chars = [ch for ch in text if ch not in string.punctuation]
    return "".join(kept_chars)

def remove_numbers(text):
    """Delete every run of digits from `text`; all other characters are kept as they are."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def correct_spelling(text):
    """Pass `text` through the module-level `spell` object and return its result."""
    corrected = spell(text)
    return corrected

def remove_stopwords(text):
    """Return `text` without the tokens whose lower-cased form is in `stop_words`.

    Tokens are obtained by whitespace splitting and re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

def get_wordnet_pos(tag):
    """Translate a POS tag into a WordNet part-of-speech constant.

    The decision is made on the tag's prefix: J -> wordnet.ADJ,
    V -> wordnet.VERB, N -> wordnet.NOUN, R -> wordnet.ADV. Any other tag
    falls back to wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN

def lemmatize_text(text):
    """Lemmatize each token of `text` using its part-of-speech tag.

    Returns "" when `text` is not a string or contains only whitespace.
    Otherwise the text is tokenized with `word_tokenize`, tagged with
    `pos_tag`, and every token is lemmatized with the WordNet POS derived
    from its tag by `get_wordnet_pos`; lemmas are joined with single spaces.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return ""
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)

# --- Main Pipeline ---

def preprocess_text(text):
    """Run one review through the full cleaning pipeline and return the result.

    Missing values (as detected by `pd.isna`) yield "". Any other non-string
    value is converted with `str()` so that a numeric cell does not crash on
    `.lower()`.

    Steps, in order: lower-case, strip HTML, strip URLs, strip emojis, expand
    slang, expand contractions, strip punctuation, strip digits, spell-correct,
    drop stop words, lemmatize.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # HTML is stripped before URLs: removing URLs first would cut through
    # attributes such as href="http://..." and leave broken markup behind,
    # which can take the surrounding visible text with it.
    text = remove_html(text)
    text = remove_urls(text)
    text = remove_emojis(text)
    # Slang and contractions are expanded while punctuation is still present,
    # because the contraction keys contain apostrophes.
    text = replace_slang(text)
    text = replace_contractions(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = correct_spelling(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

# Load, Apply, and Save
# Only the read is guarded by the try: the "not found" message below is about
# the input file, so a FileNotFoundError raised later (while processing or
# saving) must surface as a normal traceback instead of being misreported.
try:
    df = pd.read_csv("UNITENReview.csv")
except FileNotFoundError:
    print("Error: 'UNITENReview.csv' not found. Please ensure the file is in the same folder.")
else:
    print("Processing reviews... this may take a moment due to spellcheck.")
    # Clean every review; the result is stored next to the original text.
    df["processed"] = df["Review"].apply(preprocess_text)

    print(df[["Review", "processed"]].head())
    df.to_csv("UNITEN_Processed.csv", index=False)
    print("File saved successfully!")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/1913c543-331c-4721-95ae-
[nltk_data]     6d87e099a045/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/1913c543-331c-4721-95ae-
[nltk_data]     6d87e099a045/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/1913c543-331c-4721-95ae-
[nltk_data]     6d87e099a045/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /home/1913c543-331c-4721-95ae-
[nltk_data]     6d87e099a045/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/1913c543-331c-4721-95ae-
[nltk_data]     6d87e099a045/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/1913c543-331c-4721-95ae-
[nltk_data]     6d87e099a045/nltk_data...
[nltk_data]   Unzipping taggers/averaged

Processing reviews... this may take a moment due to spellcheck.
                                                                                                                                                                                                                                                                                                                                                         Review  \
0                                                                                                                                                                                                                                                                                                          Im happy with uniten actually, even the people are W   
1                                                                                                                                                                                                                                 