
# NLP Dialogue Chatbot using Classical NLP & DistilBERT

**Topic:** Natural Language Processing (NLP)  
**By:** Swayam Sodha

In [9]:

import pandas as pd
import numpy as np
import re
import nltk
import torch

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from transformers import DistilBertTokenizer, DistilBertModel

# Fetch the NLTK resources this notebook relies on further down: tokenizer
# data (punkt / punkt_tab), the stop-word lists and the WordNet corpus.
# The last call is left as the cell's final expression, so its return value
# is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:

# Load the tab-separated dialogue pairs. The file carries no header row, so
# the two columns are named here: the prompt ("context") and its reply.
df = pd.read_csv("dialogues.txt", sep="\t", header=None, names=["context", "response"])

df.head()


Unnamed: 0,context,response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [11]:

def clean_text(text):
    """Normalise raw text: lower-case it, drop punctuation, collapse whitespace.

    Args:
        text: The value to normalise. ``None`` and float NaN (what pandas
            uses for a missing cell) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The lower-cased text with every character other than ASCII letters,
        digits and whitespace removed, runs of whitespace collapsed to a
        single space, and no leading or trailing whitespace. Note that
        apostrophes are removed too, so "i'm" becomes "im".
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Store a normalised copy of both text columns next to the originals.
df["clean_context"] = df["context"].map(clean_text)
df["clean_response"] = df["response"].map(clean_text)

df.head()


Unnamed: 0,context,response,clean_context,clean_response
0,"hi, how are you doing?",i'm fine. how about yourself?,hi how are you doing,im fine how about yourself
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,im fine how about yourself,im pretty good thanks for asking
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,im pretty good thanks for asking,no problem so how have you been
3,no problem. so how have you been?,i've been great. what about you?,no problem so how have you been,ive been great what about you
4,i've been great. what about you?,i've been good. i'm in school right now.,ive been great what about you,ive been good im in school right now


In [12]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Tokenize text, drop English stop words and lemmatize what remains.

    Args:
        text: Text to process; the notebook passes the output of clean_text.

    Returns:
        The surviving lemmatized tokens joined by single spaces. The result
        is an empty string when every token is a stop word.
    """
    # NOTE(review): clean_text has already stripped apostrophes, so
    # contractions arrive here as e.g. "im" / "ive"; the outputs shown in
    # this notebook indicate those forms are not removed by the stop-word
    # filter - confirm whether that is intended.
    kept = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Stop-word-filtered, lemmatized version of the cleaned context; this is the
# text the models below are fitted on.
df["processed_context"] = df["clean_context"].map(preprocess)

df.head()

Unnamed: 0,context,response,clean_context,clean_response,processed_context
0,"hi, how are you doing?",i'm fine. how about yourself?,hi how are you doing,im fine how about yourself,hi
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,im fine how about yourself,im pretty good thanks for asking,im fine
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,im pretty good thanks for asking,no problem so how have you been,im pretty good thanks asking
3,no problem. so how have you been?,i've been great. what about you?,no problem so how have you been,ive been great what about you,problem
4,i've been great. what about you?,i've been good. i'm in school right now.,ive been great what about you,ive been good im in school right now,ive great


In [13]:

# Targets: every distinct response string acts as its own class label.
y = df["response"]

# Features: TF-IDF over unigrams and bigrams of the processed context, with
# the vocabulary capped at 5000 entries.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(df["processed_context"])


In [14]:

# Naive Bayes baseline: TF-IDF features of the context -> response string.
nb_model = MultinomialNB()
nb_model.fit(X=X, y=y)


In [15]:

def chatbot_classical(user_input):
    """Reply using the TF-IDF + Naive Bayes baseline.

    The input goes through the same clean_text -> preprocess pipeline as the
    training contexts before it is vectorized.

    Args:
        user_input: Raw user message.

    Returns:
        The response label predicted by nb_model for the message.
    """
    cleaned = clean_text(user_input)
    features = vectorizer.transform([preprocess(cleaned)])
    prediction = nb_model.predict(features)
    return prediction[0]

chatbot_classical("i think it may rain")


np.str_('what do you mean?')

In [16]:

# Pretrained uncased DistilBERT encoder and its matching tokenizer, both
# loaded from the same checkpoint name.
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [17]:

def encode_bert(texts, batch_size=32):
    """Embed texts with DistilBERT using attention-masked mean pooling.

    Padding positions are excluded from the average, so a text gets the same
    embedding whether it is encoded on its own or inside a padded batch. A
    plain mean over the sequence axis would mix padding vectors into every
    text shorter than the longest one in its batch, which makes corpus
    embeddings and single-query embeddings incomparable.

    Args:
        texts: A string or a sequence of strings to embed.
        batch_size: Number of texts sent through the model per forward pass;
            must be a positive integer. Batching keeps memory bounded when
            the whole corpus is encoded.

    Returns:
        A float tensor with one row per input text and one column per hidden
        dimension of the model. An empty input yields a tensor with zero rows.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return torch.empty((0, bert_model.config.dim))

    pooled = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        with torch.no_grad():
            outputs = bert_model(**inputs)
        hidden = outputs.last_hidden_state
        # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden axis.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against division by zero
        pooled.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled, dim=0)

X_bert = encode_bert(df["processed_context"].tolist())


In [18]:

# Logistic regression on the DistilBERT sentence embeddings; as with the
# Naive Bayes baseline, each response string is a class label.
bert_clf = LogisticRegression(max_iter=1000)
bert_clf.fit(X=X_bert, y=df["response"])


In [19]:

def chatbot_bert(user_input):
    """Reply using the DistilBERT-embedding + logistic-regression classifier.

    Args:
        user_input: Raw user message; it is cleaned and preprocessed the same
            way as the training contexts before being embedded.

    Returns:
        The response label predicted by bert_clf for the message.
    """
    cleaned = clean_text(user_input)
    features = encode_bert([preprocess(cleaned)])
    prediction = bert_clf.predict(features)
    return prediction[0]

chatbot_bert("hi, how are you doing?")
#chatbot_bert("i really wish it wasn't so hot every day")

'now a stamp is 42 cents.'

# Improved Model

In [20]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

# X_bert (computed earlier) already holds encode_bert() of exactly these
# processed contexts, so reuse it rather than running the whole corpus
# through DistilBERT a second time.
context_embeddings = X_bert

In [21]:
def chatbot_semantic(user_input, threshold=0.6):
    """Reply by retrieving the response of the most similar stored context.

    The message is cleaned, preprocessed and embedded, then compared to every
    row of context_embeddings by cosine similarity.

    Args:
        user_input: Raw user message.
        threshold: Minimum cosine similarity the best match must reach for
            its response to be used.

    Returns:
        The "response" value of the best-matching row of df, or a fixed
        rephrase request when the best similarity is below threshold.
    """
    query = preprocess(clean_text(user_input))
    scores = cosine_similarity(encode_bert([query]), context_embeddings)[0]

    top = scores.argmax()
    if scores[top] < threshold:
        return "I'm not sure about that. Can you rephrase?"

    best_row = df.iloc[top]
    return best_row["response"]


In [22]:
!pip install transformers torch



In [30]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer for the generative model; the end-of-sequence token doubles as
# the padding token.
dg_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
dg_tokenizer.pad_token = dg_tokenizer.eos_token

# Causal language model, with the same pad = EOS convention in its config.
dg_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
dg_model.config.pad_token_id = dg_model.config.eos_token_id

In [36]:
def chatbot_generative(user_input, max_retries=2, max_new_tokens=150):
    """Generate a free-form reply with DialoGPT, retrying on empty output.

    Each retry samples with a higher temperature and a larger top-k so that a
    repeated empty generation becomes less likely.

    Args:
        user_input: Raw user message, inserted into the conversation prompt.
        max_retries: Number of additional sampling attempts after the first.
        max_new_tokens: Upper bound on the number of generated tokens. The
            budget is counted separately from the prompt, so a long message
            cannot use up the room reserved for the reply.

    Returns:
        The generated reply with surrounding whitespace removed, or
        "[No response generated]" when every attempt produced empty text.
    """
    # Add conversation start conditioning
    prompt = f"The following is a friendly conversation.\nUser: {user_input}\nBot:"

    inputs = dg_tokenizer(
        prompt,
        return_tensors="pt",
        padding=True
    )
    # generate() returns prompt + continuation; remember where the prompt ends.
    prompt_length = inputs["input_ids"].shape[-1]

    for attempt in range(max_retries + 1):
        output_ids = dg_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7 + 0.3 * attempt,
            top_k=50 + 50 * attempt,
            top_p=0.9,
            pad_token_id=dg_tokenizer.eos_token_id
        )

        # Decode only the newly generated tokens instead of re-parsing the prompt.
        decoded = dg_tokenizer.decode(
            output_ids[0][prompt_length:],
            skip_special_tokens=True
        )

        # If the model wrote further "Bot:" markers itself, keep the text
        # after the last one (same rule as before).
        if "Bot:" in decoded:
            decoded = decoded.split("Bot:")[-1]

        decoded = decoded.strip()

        if decoded:
            return decoded

    return "[No response generated]"


In [25]:
small_talk_keywords = [
    "hi", "hello", "hey", "how are you",
    "what's up", "how's it going"
]

def is_small_talk(text):
    """Return True when text contains a small-talk keyword as a whole word or phrase.

    Matching is case-insensitive and respects word boundaries, so "hi" does
    not fire on "think" or "this", and "hey" does not fire on "they".

    Args:
        text: Raw user message.

    Returns:
        True if any entry of small_talk_keywords occurs in text delimited by
        non-word characters (or the string edges), otherwise False.
    """
    text = text.lower()
    # The lookarounds require that the keyword is not glued to other word
    # characters on either side; a plain substring test matched inside words.
    return any(
        re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text)
        for keyword in small_talk_keywords
    )


In [26]:
def chatbot_hybrid(user_input):
    """Route a message to the generative or the retrieval chatbot.

    Args:
        user_input: Raw user message.

    Returns:
        The reply from chatbot_generative when is_small_talk flags the
        message, otherwise the reply from chatbot_semantic.
    """
    responder = chatbot_generative if is_small_talk(user_input) else chatbot_semantic
    return responder(user_input)

In [41]:
# Greeting example: "hello" is one of the small-talk keywords, so this call
# is answered by chatbot_generative rather than chatbot_semantic.
chatbot_hybrid("hello there")

'Hello.'