In [20]:
import json
import numpy as np
import math
import pickle
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# EDA

## Load JSON file & show details about the file

In [2]:
print("Loading data...")
# Read the intents file as UTF-8 text and parse the JSON document into `data`.
with open('chatbotdata.json', 'r', encoding='utf-8') as source:
    data = json.loads(source.read())

Loading data...


In [3]:
# Flatten the intents file into two parallel lists: `patterns` holds one
# training sentence per entry and `labels` holds the name of its intent.
patterns = []
labels = []
intent_counts = {}

for intent in data['intents']:
    intent_name = intent['name']

    # Blank / whitespace-only patterns are dropped. The per-intent count is
    # taken AFTER this filter so the distribution printed below always adds up
    # to the "Loaded N patterns" total (previously it counted raw entries).
    kept = [pattern for pattern in intent['patterns'] if pattern.strip()]
    intent_counts[intent_name] = len(kept)

    patterns.extend(kept)
    labels.extend([intent_name] * len(kept))

print(f"\n Loaded {len(patterns)} patterns from {len(data['intents'])} intents")
print("\n\n Intent distribution:")
# Largest intents first, to make the class imbalance obvious at a glance.
for intent_name, count in sorted(intent_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"    {intent_name}: {count} patterns")


 Loaded 6413 patterns from 11 intents


 Intent distribution:
    thanks: 2826 patterns
    goodbye: 2249 patterns
    greeting: 987 patterns
    about_owner: 74 patterns
    capabilities: 52 patterns
    who_are_you: 51 patterns
    contact: 47 patterns
    projects: 36 patterns
    compliment: 34 patterns
    skills: 33 patterns
    joke: 24 patterns


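### Note: several intents hold fewer than 1% of all patterns (strong class imbalance), so the train/test split below is stratified and the evaluation reports per-class metrics, not only accuracy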

# Apply TF-IDF

In [4]:
# Simple Tokenizer
def tokenize(text):
    """Lower-case `text` and split it on runs of whitespace.

    Returns a list of tokens; punctuation is left attached to the words.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Collect every distinct token seen in the patterns, then fix an alphabetical
# order so that each word maps to a stable column index.
vocab = sorted({token for sentence in patterns for token in tokenize(sentence)})
word2idx = dict(zip(vocab, range(len(vocab))))
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 428


In [5]:
## Calculate TF (Term Frequency) for each document
def compute_tf(document):
    """Return the term-frequency vector of `document` over the global `vocab`.

    Each vocabulary word's entry is (occurrences in the document) divided by
    (total number of tokens in the document). Tokens that are not in
    `word2idx` are ignored, and a document with no tokens gives all zeros.
    """
    words = tokenize(document)
    n_words = len(words)
    vector = np.zeros(len(vocab))

    for word, occurrences in Counter(words).items():
        position = word2idx.get(word)
        if position is not None:
            vector[position] = occurrences / n_words
    return vector

# One TF row per pattern, stacked into a single array.
tf_vectors = np.array(list(map(compute_tf, patterns)))

In [6]:
## Calculate IDF (Inverse Document Freq.)
N = len(patterns)

# Document frequency = number of patterns that contain each word.
# Every pattern is tokenized exactly once; `set(...)` makes a word that is
# repeated inside one pattern count only once for that pattern. (The previous
# version re-tokenized all patterns for every vocabulary word.)
doc_freq = Counter()
for pattern in patterns:
    doc_freq.update(set(tokenize(pattern)))

# Align the counts with the column order defined by `vocab`.
df = np.array([doc_freq[word] for word in vocab], dtype=float)

# Smoothed IDF: the "+1" inside the ratio avoids division by zero and the
# trailing "+1" keeps words that occur in every pattern from getting weight 0.
idf = np.log((1 + N) / (1 + df)) + 1

In [8]:
# Weight every TF row element-wise by the per-word IDF values.
tfidf_vectors = np.multiply(tf_vectors, idf)
print("TF-IDF matrix shape:", tfidf_vectors.shape)

TF-IDF matrix shape: (6413, 428)


In [10]:
### Verify a TF-IDF vector (sanity check on the first pattern)
first_pattern, first_vector = patterns[0], tfidf_vectors[0]
print("First pattern:", first_pattern)
print("Vector TF-IDF:", first_vector)

First pattern: nice to see you dude
Vector TF-IDF: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.  

In [12]:
## Find the most important words of a document
## (shown here for the first document)
# Sort column indices by ascending weight, reverse, and keep the first five.
ascending_order = np.argsort(tfidf_vectors[0])
top_idx = ascending_order[::-1][:5]
print([vocab[i] for i in top_idx])

['nice', 'dude', 'to', 'see', 'you']


# Train the model using LR

In [16]:
## Apply Label Encoding
# Map each intent name to an integer class id.
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

# Persist the fitted encoder so predicted ids can be mapped back to intent
# names later. The `with` block guarantees the file is flushed and closed
# (the previous bare `open(...)` left the handle open).
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)
# Note: this prints the class names themselves, not how many there are.
print(f"Num. classes: {le.classes_}")

Num. classes: ['about_owner' 'capabilities' 'compliment' 'contact' 'goodbye' 'greeting'
 'joke' 'projects' 'skills' 'thanks' 'who_are_you']


In [19]:
## Split the data ( train/test split )
# Hold out 20% for testing; stratifying on the labels keeps each intent's
# share the same in both splits, and the fixed seed makes the split repeatable.
split_parts = train_test_split(
    tfidf_vectors, y_encoded,
    test_size=0.2, random_state=42, stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (5130, 428), y_train shape: (5130,)
X_test shape: (1283, 428), y_test shape: (1283,)


In [22]:
## Train the model using Logistic Regression
model = LogisticRegression(max_iter=500, random_state=42)
model.fit(X_train, y_train)

# Save the fitted classifier. The `with` block guarantees the file is flushed
# and closed (the previous bare `open(...)` left the handle open).
# NOTE(review): `vocab` / `idf` are not saved in this section; whatever loads
# this model must rebuild identical TF-IDF features - confirm they are
# persisted elsewhere.
with open('nlp_model_lr.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print("LR model was trained and saved on (nlp_model_lr.pkl).")

LR model was trained and saved on (nlp_model_lr.pkl).


In [24]:
## Evaluate the model on the held-out test split

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model acc. on data test set: {accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the original intent names.
print("\n\nDetailed clasification:")
report = classification_report(
    y_test,
    y_pred,
    target_names=le.classes_,
    zero_division=0,
)
print(report)

Model acc. on data test set: 0.9914


Detailed clasification:
              precision    recall  f1-score   support

 about_owner       1.00      0.87      0.93        15
capabilities       1.00      0.60      0.75        10
  compliment       1.00      1.00      1.00         7
     contact       1.00      0.89      0.94         9
     goodbye       1.00      1.00      1.00       450
    greeting       1.00      1.00      1.00       198
        joke       1.00      0.80      0.89         5
    projects       0.75      0.86      0.80         7
      skills       0.83      0.71      0.77         7
      thanks       1.00      1.00      1.00       565
 who_are_you       0.71      1.00      0.83        10

    accuracy                           0.99      1283
   macro avg       0.94      0.88      0.90      1283
weighted avg       0.99      0.99      0.99      1283

