# Text Preprocessing

In [110]:
!pip install pandas
import re
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from difflib import get_close_matches
import pandas as pd



In [111]:
packages = ['punkt_tab','wordnet','stopwords','omw-1.4','averaged_perceptron_tagger_eng']
for pkg in packages:
    try:
        nltk.data.find(pkg)
    except Exception:
        nltk.download(pkg)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [112]:
# Sample input by user
texts = [
    "Do you have chesecake?"
]

vocabulary = ["margherita", "pizza", "cheesecake", "truffle", "pasta", "reservation", "table", "open", "today", "address"]

In [113]:
!pip install contractions
import contractions



In [114]:
lemmatizer = WordNetLemmatizer()
sw = set(stopwords.words('english'))

In [115]:
!pip install python-Levenshtein
try:
    import Levenshtein
    has_lev = True
except Exception:
    has_lev = False
# print(has_lev)



In [116]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [117]:
# Text Preprocessing
cleaned_texts = []

for text in texts:
    # Step 1 - Remove contractions, strip and lowercase the text
    contracted_text = contractions.fix(text)
    stripped_txt = contracted_text.strip().lower()
    
    # Step 2 - Word tokenize (Add sentence tokenization in future)
    tokens = nltk.word_tokenize(stripped_txt)
    
    # Step 3 - Remove special characters other than word, spaces and hyphen
    temp_tokens = []
    for t in tokens:
        temp = re.sub(r"[^\w\s-]","",t)
        if temp.strip() != "":
            temp_tokens.append(temp)
    tokens = temp_tokens
    
    # Step 4 - Check if the text entered is correct using Levenshtein or difflib
    corrected = []
    for t in tokens:
        if t in vocabulary:
            corrected.append(t)
            continue
        
        matched = None
        if has_lev:
            best = None
            best_score = 0.0
            for v in vocabulary:
                score = Levenshtein.ratio(t, v)
                if score > best_score:
                    best_score = score
                    best = v
            if best_score >= 0.85:
                matched = best
                
        else:
            close = get_close_matches(t, vocabulary, n=1, cutoff=0.85)
            if close:
                matched = close[0]
                
        if matched:
            corrected.append(matched)
        else:
            corrected.append(t)

    tokens = corrected

    # Step 5 - Remove stopwords
    extra_stop = {"please", "thanks", "thank"}
    temp_tokens2 = []
    for t in tokens:
        if t not in sw and t not in extra_stop:
            temp_tokens2.append(t)
    tokens = temp_tokens2
    
    # Step 6 - POS-aware lemmatization
    lemm_tokens = []
    pos_tags = nltk.pos_tag(tokens)
    for word,tag in pos_tags:
        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word,wn_pos)
        lemm_tokens.append(lemma)
    tokens = lemm_tokens

    # Step 7 - Final cleanup and join
    clean_tokens = []
    for t in tokens:
        if t and len(t.strip()) > 0:
            clean_tokens.append(t)
    cleaned = " ".join(clean_tokens)
    cleaned_texts.append(cleaned)

In [118]:
df_demo = pd.DataFrame({"raw": texts, "cleaned": cleaned_texts})
df_demo

Unnamed: 0,raw,cleaned
0,Do you have chesecake?,cheesecake


# Vectorization 

In [119]:
data_path = r"C:\Users\User\anaconda3\envs\Chatbot\intents.csv"

In [120]:
text_col = "text"
intent_col = "intent"

In [121]:
!pip install scikit-learn



In [122]:
from pathlib import Path
import json
import random, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [123]:
# load dataset
path = Path(data_path)
if not path.exists():
    raise FileNotFoundError(f"Dataset not found at {path}")
if path.suffix.lower() in ['.xlsx','.xls']:
    df = pd.read_excel(path)
else:
    df = pd.read_csv(path)

if text_col not in df.columns or intent_col not in df.columns:
    raise RuntimeError(f"Dataset must contain columns '{text_col}' and '{intent_col}'. Found: {list(df.columns)}")

In [124]:
# Drop rows where text or intent is NaN
df = df.dropna(subset=["text", "intent"]).reset_index(drop=True)
print(df.head())

        text    intent
0      hello  greeting
1         hi  greeting
2        hey  greeting
3   hi there  greeting
4  hello bot  greeting


In [125]:
X = df["text"]
y = df["intent"]

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [126]:
# Build pipeline: TF-IDF + LinearSVC
# model = Pipeline([
#     ("tfidf", TfidfVectorizer(
#         ngram_range=(1, 2),      
#         stop_words="english",   
#         max_df=0.9,              
#         min_df=2                
#     )),
#     ("clf", LinearSVC())
# ])                            # Gave less accuracy of 75

model = Pipeline([
    ("tfidf", TfidfVectorizer(
    ngram_range=(1,2),
    min_df=1,
    max_df=0.95,
    sublinear_tf=True
)),
    ("clf", LinearSVC())
])

In [127]:
# Train
model.fit(X_train, y_train)

In [128]:
# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8481012658227848
                      precision    recall  f1-score   support

            ambience       0.73      0.89      0.80         9
chef_recommendations       1.00      0.60      0.75         5
 contact_information       1.00      1.00      1.00         5
     dietary_options       0.83      1.00      0.91         5
             goodbye       1.00      0.89      0.94         9
            greeting       0.80      0.67      0.73         6
            location       0.83      1.00      0.91         5
        menu_inquiry       0.67      0.67      0.67         6
       opening_hours       0.67      0.80      0.73         5
       price_inquiry       1.00      1.00      1.00         5
      special_offers       0.83      1.00      0.91         5
   table_reservation       1.00      0.60      0.75         5
           wait_time       0.89      0.89      0.89         9

            accuracy                           0.85        79
           macro avg       0.87      0.

In [129]:
model_path = r"C:\Users\User\anaconda3\envs\Chatbot\restaurant_intent_classifier.joblib"
joblib.dump(model, model_path)
print("Model saved to:", model_path)
print("Has idf_?:", hasattr(model.named_steps['tfidf'], 'idf_'))

Model saved to: C:\Users\User\anaconda3\envs\Chatbot\restaurant_intent_classifier.joblib
Has idf_?: True
