In [1]:
import json
import numpy as np
import math
import pickle
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# EDA

## Load JSON file & show details about the file

In [2]:
print("Loading data...")
with open('chatbotdata.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

Loading data...


In [3]:
patterns = []
labels = []
intent_counts = {}

for intent in data['intents']:
    intent_name = intent['name']
    intent_counts[intent_name] = len(intent['patterns'])

    for pattern in intent['patterns']:
        if pattern.strip():
            patterns.append(pattern)
            labels.append(intent_name)

print(f"\n Loaded {len(patterns)} patterns from {len(data['intents'])} intents")
print("\n\n Intent distribution:")
for intent_name, count in sorted(intent_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"    {intent_name}: {count} patterns")


 Loaded 1159 patterns from 20 intents


 Intent distribution:
    goodbye: 107 patterns
    thanks: 105 patterns
    greeting: 104 patterns
    about_owner: 74 patterns
    price_website: 67 patterns
    pred_types: 67 patterns
    price_ml: 64 patterns
    predictions: 63 patterns
    about_model: 59 patterns
    ask_weather: 54 patterns
    capabilities: 52 patterns
    who_are_you: 51 patterns
    motivation: 48 patterns
    contact: 47 patterns
    education: 40 patterns
    projects: 36 patterns
    compliment: 34 patterns
    skills: 33 patterns
    ask_time: 30 patterns
    joke: 24 patterns


# Apply TF-IDF

In [4]:
# Simple Tokenizer
def tokenize(text):
    return text.lower().split()

vocab = set()
for pattern in patterns:
    vocab.update(tokenize(pattern))

vocab = sorted(vocab)
word2idx = {word: i for i, word in enumerate(vocab)}
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 754


In [5]:
## Calculate TF (Term Frequency) for each document
def compute_tf(document):
    tokens = tokenize(document)
    tf_counter = Counter(tokens)
    total_count = len(tokens)
    tf_vector = np.zeros(len(vocab))

    for token, count in tf_counter.items():
        if token in word2idx:
            tf_vector[word2idx[token]] = count / total_count
    return tf_vector

tf_vectors = np.array([compute_tf(p) for p in patterns])

In [6]:
## Calculate IDF (Inverse Document Freq. )
N = len(patterns)
df = np.zeros(len(vocab))

for i, word in enumerate(vocab):
    df[i] = sum(1 for pattern in patterns if word in tokenize(pattern))

idf = np.log((1 + N) / (1 + df)) + 1

In [7]:
tfidf_vectors = tf_vectors * idf
print("TF-IDF matrix shape:", tfidf_vectors.shape)

TF-IDF matrix shape: (1159, 754)


In [8]:
### Verify an TF-IDF vector
print("First pattern:", patterns[0])
print("Vector TF-IDF:", tfidf_vectors[0])

First pattern: Hello
Vector TF-IDF: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.      

In [9]:
## Find most important words for each document
## Let's say.. we want for the first document
top_idx = tfidf_vectors[0].argsort()[::-1][:5]
print([vocab[i] for i in top_idx])

['hello', 'â€”', 'generation.', 'gator', 'game?']


# Train the model using LR

In [10]:
## Apply Label Encoding
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

pickle.dump(le, open('label_encoder.pkl', 'wb'))
print(f"Num. classes: {le.classes_}")

Num. classes: ['about_model' 'about_owner' 'ask_time' 'ask_weather' 'capabilities'
 'compliment' 'contact' 'education' 'goodbye' 'greeting' 'joke'
 'motivation' 'pred_types' 'predictions' 'price_ml' 'price_website'
 'projects' 'skills' 'thanks' 'who_are_you']


In [11]:
## Split the data ( train/test split )
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (927, 754), y_train shape: (927,)
X_test shape: (232, 754), y_test shape: (232,)


In [12]:
## Train the model using Logistic Regression
model = LogisticRegression(max_iter=500, random_state=42)
model.fit(X_train, y_train)

pickle.dump(model, open('nlp_model_lr.pkl', 'wb'))
print("LR model was trained and saved on (nlp_model_lr.pkl).")

LR model was trained and saved on (nlp_model_lr.pkl).


In [13]:
## Evaluate the model

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model acc. on data test set: {accuracy:.4f}")

print("\n\nDetailed clasification:")
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))

Model acc. on data test set: 0.8836


Detailed clasification:
               precision    recall  f1-score   support

  about_model       0.71      1.00      0.83        12
  about_owner       0.94      1.00      0.97        15
     ask_time       1.00      1.00      1.00         6
  ask_weather       1.00      0.91      0.95        11
 capabilities       0.57      0.40      0.47        10
   compliment       1.00      0.71      0.83         7
      contact       1.00      1.00      1.00         9
    education       1.00      0.62      0.77         8
      goodbye       0.94      0.81      0.87        21
     greeting       1.00      0.95      0.98        21
         joke       1.00      1.00      1.00         5
   motivation       0.91      1.00      0.95        10
   pred_types       0.87      1.00      0.93        13
  predictions       0.92      0.92      0.92        13
     price_ml       0.63      0.92      0.75        13
price_website       0.80      0.92      0.86        13
  

In [14]:
pickle.dump(vocab, open('vocab.pkl', 'wb'))
pickle.dump(word2idx, open('word2idx.pkl', 'wb'))
pickle.dump(idf, open('idf.pkl', 'wb'))
print("Vocabulary and IDF saved!")

Vocabulary and IDF saved!
