In [79]:
!pip install pandas
import re
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from difflib import get_close_matches
import pandas as pd



In [80]:
packages = ['punkt_tab','wordnet','stopwords','omw-1.4','averaged_perceptron_tagger_eng']
for pkg in packages:
    try:
        nltk.data.find(pkg)
    except Exception:
        nltk.download(pkg)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [81]:
# Sample dataset
texts = [
    "Hi! Are you open today?   ",
    "Do you have chesecake?",
    "I don't want spicy food, do you have mild options?",
    "Can I reserve a table for 2 at 7pm?",
    "Is the trufle pasta available?",
    "   What's your addresses?   ",
    "I'm looking for pizzas and pastas.",
    "The desserts were amazing yesterday!",
    "The customer is running late.",
    "We are better than the other restaurant."
]

vocabulary = ["margherita", "pizza", "cheesecake", "truffle", "pasta", "reservation", "table", "open", "today", "address"]

In [82]:
!pip install contractions
import contractions



In [83]:
lemmatizer = WordNetLemmatizer()
sw = set(stopwords.words('english'))

In [84]:
!pip install python-Levenshtein
try:
    import Levenshtein
    has_lev = True
except Exception:
    has_lev = False
# print(has_lev)



In [85]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [88]:
# Text Preprocessing
cleaned_texts = []

for text in texts:
    # Step 1 - Remove contractions, strip and lowercase the text
    contracted_text = contractions.fix(text)
    stripped_txt = contracted_text.strip().lower()
    
    # Step 2 - Word tokenize (Add sentence tokenization in future)
    tokens = nltk.word_tokenize(stripped_txt)
    
    # Step 3 - Remove special characters other than word, spaces and hyphen
    temp_tokens = []
    for t in tokens:
        temp = re.sub(r"[^\w\s-]","",t)
        if temp.strip() != "":
            temp_tokens.append(temp)
    tokens = temp_tokens
    
    # Step 4 - Check if the text entered is correct using Levenshtein or difflib
    corrected = []
    for t in tokens:
        if t in vocabulary:
            corrected.append(t)
            continue
        
        matched = None
        if has_lev:
            best = None
            best_score = 0.0
            for v in vocabulary:
                score = Levenshtein.ratio(t, v)
                if score > best_score:
                    best_score = score
                    best = v
            if best_score >= 0.85:
                matched = best
                
        else:
            close = get_close_matches(t, vocabulary, n=1, cutoff=0.85)
            if close:
                matched = close[0]
                
        if matched:
            corrected.append(matched)
        else:
            corrected.append(t)

    tokens = corrected

    # Step 5 - Remove stopwords
    extra_stop = {"please", "thanks", "thank"}
    temp_tokens2 = []
    for t in tokens:
        if t not in sw and t not in extra_stop:
            temp_tokens2.append(t)
    tokens = temp_tokens2
    
    # Step 6 - POS-aware lemmatization
    lemm_tokens = []
    pos_tags = nltk.pos_tag(tokens)
    for word,tag in pos_tags:
        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word,wn_pos)
        lemm_tokens.append(lemma)
    tokens = lemm_tokens

    # Step 7 - Final cleanup and join
    clean_tokens = []
    for t in tokens:
        if t and len(t.strip()) > 0:
            clean_tokens.append(t)
    cleaned = " ".join(clean_tokens)
    cleaned_texts.append(cleaned)

In [89]:
df = pd.DataFrame({"raw": texts, "cleaned": cleaned_texts})
df

Unnamed: 0,raw,cleaned
0,Hi! Are you open today?,hi open today
1,Do you have chesecake?,cheesecake
2,"I don't want spicy food, do you have mild opti...",want spicy food mild option
3,Can I reserve a table for 2 at 7pm?,reserve table 2 7pm
4,Is the trufle pasta available?,truffle pasta available
5,What's your addresses?,address
6,I'm looking for pizzas and pastas.,look pizza pasta
7,The desserts were amazing yesterday!,dessert amaze yesterday
8,The customer is running late.,customer run late
9,We are better than the other restaurant.,well restaurant
