In [3]:
import json
import numpy as np
import nltk
import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data used later in the notebook: the stop-word lists read
# through stopwords.words(...) and the 'punkt_tab' tokenizer data
# (presumably what nltk.word_tokenize loads — confirm for the installed NLTK).
# Per the log output, packages that are already up to date are not fetched again.
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Load intents file
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform default (e.g. cp1252 on Windows).
with open("intents.json", encoding="utf-8") as file:
    data = json.load(file)

# Initialize lists
# Parallel lists: patterns[i] will hold a training phrase and tags[i] the
# intent tag that phrase belongs to.
patterns = []
tags = []


In [5]:
# Extract patterns and tags
# Build both lists from scratch instead of appending to existing ones, so
# re-running this cell cannot duplicate the training examples.
labelled_patterns = [
    (pattern, intent['tag'])
    for intent in data['intents']
    for pattern in intent['patterns']
]
# Keep the two lists index-aligned: one (phrase, tag) pair per position.
patterns = [phrase for phrase, _label in labelled_patterns]
tags = [label for _phrase, label in labelled_patterns]


In [6]:

# Preprocessing function
def preprocess(text):
    """Normalise a phrase into a space-separated string of stemmed tokens.

    Steps: lowercase, delete every character found in ``string.punctuation``,
    tokenize with ``nltk.word_tokenize``, drop English stop words, and
    Porter-stem the remaining tokens.

    Args:
        text: Raw phrase (str).

    Returns:
        The surviving stems joined by single spaces; an empty string when
        no token survives the filtering.
    """
    # One C-level pass removes all punctuation characters.
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    # Load the stop-word list once per call, as a set, rather than once per
    # token: membership tests become O(1) and the list is not rebuilt in the loop.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    )

# Run every training phrase through the normalisation pipeline
processed_patterns = list(map(preprocess, patterns))

# Fit TF-IDF on the normalised phrases and keep the features as a dense array
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(processed_patterns).toarray()

# Turn the tag strings into integer class ids; `le` is kept so predictions
# can be mapped back to tag names later
le = LabelEncoder()
encoded_tags = le.fit_transform(tags)
y = np.array(encoded_tags)

In [7]:
# Step 3: Model Training with Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split data into train and test sets
# 20% of the rows are held out; random_state pins the shuffle so the split
# (and therefore the reported metrics) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# The split is not stratified, so a class can be absent from y_test or from
# y_pred, which makes its precision/recall undefined. zero_division=0 reports
# 0.0 for those entries (the value shown before) without emitting an
# UndefinedMetricWarning for every affected metric.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       0.00      0.00      0.00         1
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         0
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1

    accuracy                           0.80         5
   macro avg       0.67      0.67      0.67         5
weighted avg       0.80      0.80      0.80         5

Confusion Matrix:
 [[1 0 0 0 0 0]
 [0 0 0 1 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Step 4: Prediction and Bot Response
import random
import json

# Load intents again (if in a separate script)
# Explicit UTF-8 keeps the read independent of the platform default encoding
# (JSON text is UTF-8 by specification).
with open("intents.json", encoding="utf-8") as file:
    intents = json.load(file)

# Define preprocessing function again for user input
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# Single stemmer instance reused by every preprocess_input call
stemmer = PorterStemmer()

def preprocess_input(text):
    """Normalise a user message the same way the training phrases were.

    Applies the same steps as ``preprocess``: lowercase, delete every
    character in ``string.punctuation``, tokenize with ``nltk.word_tokenize``,
    drop English stop words, and stem with the module-level ``stemmer``.
    Keeping the steps identical matters because the result is fed to the
    TF-IDF vectorizer that was fitted on ``preprocess`` output.

    Args:
        text: Raw user message (str).

    Returns:
        The surviving stems joined by single spaces; an empty string when
        no token survives the filtering.
    """
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    # Load the stop-word list once per call, as a set, rather than once per token.
    stop_words = set(stopwords.words('english'))
    return ' '.join(
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    )

def get_response(user_input):
    """Return a bot reply for one user message.

    The message is normalised with ``preprocess_input``, vectorised with the
    fitted ``tfidf``, classified by ``model``, and the predicted class index
    is mapped back to its tag through ``le``. A random entry from the matching
    intent's ``responses`` list is returned.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        A response string. The fallback prompt is returned when the input
        contains no term known to the vectorizer, or when no intent carries
        the predicted tag.
    """
    fallback = "I'm not sure I understand. Could you rephrase that?"

    processed = preprocess_input(user_input)
    vect_input = tfidf.transform([processed]).toarray()
    # An all-zero vector means nothing in the input is in the TF-IDF
    # vocabulary (empty text, stop words only, or unseen words). The
    # classifier would still output some class for it, so ask the user to
    # rephrase instead of answering with an unrelated intent.
    if not vect_input.any():
        return fallback

    pred_tag_index = model.predict(vect_input)[0]
    tag = le.inverse_transform([pred_tag_index])[0]

    for intent in intents['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])

    return fallback

# Example interaction
# Reads messages until the user types 'exit' or 'quit' (case-insensitive,
# surrounding whitespace ignored) or the input stream ends.
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the chat cleanly instead of
        # leaving a traceback in the cell output.
        print("\nBot: Goodbye!")
        break
    if user_input.lower() in ('exit', 'quit'):
        print("Bot: Goodbye!")
        break
    print("Bot:", get_response(user_input))


Bot: Goodbye!


In [12]:
import os
import pickle

# open(..., "wb") does not create missing parent directories, so make sure
# the output folder exists before writing into it.
os.makedirs("package", exist_ok=True)

# Persist the three fitted objects needed to serve predictions: the
# classifier, the TF-IDF vectorizer and the label encoder.
with open("package/model.pkl", "wb") as f:
    pickle.dump(model, f)
with open("package/vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)
with open("package/label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)
