In [103]:
# Importing necessary libraries
import random
import pickle
import numpy as np
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import nltk
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [104]:
# Create the lemmatizer object used when cleaning text
lemmatizer = WordNetLemmatizer()

# Load the intents JSON file. The context manager closes the file handle
# deterministically (a bare open(...).read() leaves that to the garbage
# collector), and the encoding is pinned to UTF-8 so that parsing does not
# depend on the platform's default locale encoding.
with open('intents.json', encoding='utf-8') as intents_file:
    intents = json.load(intents_file)

# Containers for the cleaned pattern strings and their matching intent tags
text_data = []
labels = []
# English stopwords as a set for fast membership tests
stopwords = set(nltk.corpus.stopwords.words('english'))
# Tokens that are dropped outright during cleaning
ignore_letters = ['?', '!', '.', ',','']

In [129]:
# Build the training corpus: tokenize every pattern, drop stopwords and
# punctuation, lemmatize what remains, and pair the cleaned text with its tag.

# Start from empty lists so that re-running this cell rebuilds the corpus
# instead of appending a second copy of every example to the existing lists.
text_data = []
labels = []

for intent in intents['intents']:
    for example in intent['patterns']:
        tokens = nltk.word_tokenize(example.lower())
        filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords and token not in ignore_letters]
        # Skip patterns that are empty after filtering; they carry no features
        if filtered_tokens:
            # Store the cleaned tokens of this pattern as one space-joined string
            text_data.append(' '.join(filtered_tokens))
            # Store the tag of the intent this pattern belongs to (same index as its text)
            labels.append(intent['tag'])

In [106]:
# Sanity check: the two counts printed below should be equal,
# because each cleaned pattern is appended together with its tag
print(len(text_data))
print(len(labels))

# Create the pickle files. Each file is opened in a `with` block so the handle
# is flushed and closed as soon as the dump finishes, rather than being leaked.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(text_data, words_file)
with open('labels.pkl', 'wb') as labels_file:
    pickle.dump(labels, labels_file)

23
23


In [107]:
# Turn the cleaned pattern strings into TF-IDF feature vectors (X);
# the intent tags collected alongside them are the targets (y)
vectorizer = TfidfVectorizer()
X, y = vectorizer.fit_transform(text_data), labels

In [None]:
# Display the vocabulary learned by the fitted TF-IDF vectorizer
vectorizer.vocabulary_

In [109]:
# Using grid search CV to find best model among these
def find_best_model(X, y, test_size=0.2):
    """Grid-search several classifiers and return the best one, refit on all data.

    The data is split once into train and test parts. For every candidate
    model a ``GridSearchCV`` (3-fold) is fitted on the training part, and the
    tuned estimator is scored by accuracy on the held-out test part. The
    candidate with the highest test accuracy wins; on ties the earliest
    candidate in the list is kept because the comparison is strict.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and the estimators.
        y: Target labels aligned with the rows of ``X``.
        test_size: Fraction of the data held out for scoring (default 0.2).

    Returns:
        The winning estimator itself, after being refit on the full ``X``/``y``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=100)

    # Each entry: (display name, unfitted estimator, hyper-parameter grid)
    models = [
        ('Logistic Regression', LogisticRegression(), {
            'penalty': ['l2'],
            'C': [0.1, 1.0, 10.0],
            'solver': ['liblinear'],
            'max_iter': [100, 1000, 10000]
        }),
        ('Multinomial Naive Bayes', MultinomialNB(), {'alpha': [0.1, 0.5, 1.0]}),
        ('Linear SVC', LinearSVC(), {
            'penalty': ['l2'],
            'loss': ['hinge', 'squared_hinge'],
            'C': [0.1, 1, 10],
            'max_iter': [100, 1000, 10000]
        }),
        ('Decision Tree', DecisionTreeClassifier(), {
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy']
        }),
        ('Random Forest', RandomForestClassifier(), {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        })
    ]
    best_model_info = None
    best_score = -float('inf')
    for name, model, param_grid in models:
        grid = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(f'{name}: {score:.4f} (best parameters: {grid.best_params_})')
        # Strict '>' keeps the first candidate when several share the top score
        if score > best_score:
            best_score = score
            best_model_info = (name, grid.best_estimator_)

    best_model_name, best_model = best_model_info
    print(f'\nBest model: {best_model_name}')

    # Fit the best model to the full dataset
    best_model.fit(X, y)
    # Return the estimator itself. It was already unpacked from the
    # (name, estimator) tuple above, so indexing it again with [1] would
    # either raise TypeError or, for an ensemble, hand back one sub-estimator.
    return best_model
    

In [110]:
# Run the model search on the TF-IDF features and keep the result for prediction
best_model = find_best_model(X, y)

Logistic Regression: 0.2000 (best parameters: {'C': 10.0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'})
Multinomial Naive Bayes: 0.2000 (best parameters: {'alpha': 0.1})
Linear SVC: 0.2000 (best parameters: {'C': 1, 'loss': 'hinge', 'max_iter': 100, 'penalty': 'l2'})
Decision Tree: 0.2000 (best parameters: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2})
Random Forest: 0.2000 (best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300})

Best model: Logistic Regression


In [123]:
# Cleaning user input by tokenizing and lemmatizing
def clean_sentence(input):
    """Clean a raw user sentence the same way the training patterns are cleaned.

    The sentence is lower-cased and tokenized; stopwords and the tokens listed
    in ``ignore_letters`` are dropped; the remaining tokens are lemmatized.
    Stopwords are filtered before lemmatization, matching the training-time
    pipeline, so that a lemmatized stopword can never leak into the features.

    Args:
        input: The raw sentence typed by the user. (The name shadows the
            builtin only inside this function; it is kept for compatibility.)

    Returns:
        A one-element list holding the cleaned, space-joined sentence, which
        is the shape the vectorizer's ``transform`` expects.
    """
    tokens = nltk.word_tokenize(input.lower())
    cleaned = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopwords and token not in ignore_letters
    ]
    return [' '.join(cleaned)]


In [127]:
# Using the best model to predict
def predict_output(input):
    """Predict the intent of a user sentence and pick one of its responses.

    Args:
        input: The raw sentence typed by the user.

    Returns:
        A response string chosen at random from the predicted intent's
        ``responses``. If no intent carries the predicted tag, a generic
        fallback sentence is returned instead of failing on an unbound name.
    """
    x = clean_sentence(input)
    # predict() yields one label per input row; exactly one row is passed in,
    # so take that single label rather than comparing tags against the whole array.
    predicted_intent = best_model.predict(vectorizer.transform(x))[0]
    for intent in intents['intents']:
        if intent['tag'] == predicted_intent:
            return random.choice(intent['responses'])

    # No intent matched the predicted tag
    return "Sorry, I didn't understand that."

In [128]:
# Interactive chat session: answer every line the user types until they enter "quit"
print('Hello! I am a chatbot. How can I help you today? Type "quit" to exit.')
chatting = True
while chatting:
    user_input = input("")
    if user_input.lower() != 'quit':
        # Any other input is classified and answered
        response = predict_output(user_input)
        print(response)
    else:
        # Exit word (case-insensitive): say goodbye and end the loop
        print("Comeback again :)")
        chatting = False

Hello! I am a chatbot. How can I help you today? Type "quit" to exit.
The Big Bang theory is the prevailing cosmological model for the universe. It states that the universe began with a very hot, dense state and has been expanding and cooling ever since.
I'm in jouful mood today!
Hello


In [130]:
import os


# Make sure the output directory exists. exist_ok=True makes this a single
# call that succeeds whether or not the directory is already there, instead
# of a separate existence check followed by creation.
os.makedirs('model', exist_ok=True)

# Save the trained model
with open('model/chatbot_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save the vectorizer (needed at inference time to build matching features)
with open('model/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
