### 1 - Importing necessary libraries and loading the data

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import joblib

In [2]:
# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download("wordnet")

In [3]:
df = pd.read_csv("../data/raw/prompts_v1.csv")
df.head()

Unnamed: 0,prompt,cluster,sub_class
0,How can I integrate a chatbot into my website ...,Communication,Chatbots and Virtual Assistants
1,What are the best practices for designing a co...,Communication,Chatbots and Virtual Assistants
2,"Can chatbots handle complex queries, or are th...",Communication,Chatbots and Virtual Assistants
3,What platforms are available for building cust...,Communication,Chatbots and Virtual Assistants
4,How do I ensure my chatbot understands user in...,Communication,Chatbots and Virtual Assistants


### 2 - Data Preprocessing

#### Preprocessing the prompts

In [4]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


# Preprocessing function
def preprocess_text(text):

    text = text.lower()
    # Removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenizing text
    tokens = nltk.word_tokenize(text)

    # Removing stopwords and applying lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Joining tokens back to string
    processed_text = " ".join(tokens)

    return processed_text

In [5]:
df["processed_prompt"] = df["prompt"].apply(preprocess_text)
df.head()

Unnamed: 0,prompt,cluster,sub_class,processed_prompt
0,How can I integrate a chatbot into my website ...,Communication,Chatbots and Virtual Assistants,integrate chatbot website customer support
1,What are the best practices for designing a co...,Communication,Chatbots and Virtual Assistants,best practice designing conversational flow vi...
2,"Can chatbots handle complex queries, or are th...",Communication,Chatbots and Virtual Assistants,chatbots handle complex query better simple task
3,What platforms are available for building cust...,Communication,Chatbots and Virtual Assistants,platform available building custom chatbots
4,How do I ensure my chatbot understands user in...,Communication,Chatbots and Virtual Assistants,ensure chatbot understands user intent accurately


#### Encoding the labels

In [6]:
from sklearn.preprocessing import LabelEncoder

le_cluster = LabelEncoder()
le_sub_class = LabelEncoder()

df["cluster_encoded"] = le_cluster.fit_transform(df["cluster"])
df["sub_class_encoded"] = le_sub_class.fit_transform(df["sub_class"])

#### Feature Extraction

In [7]:
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df["processed_prompt"])

In [8]:
y_cluster = df["cluster_encoded"]
y_sub_class = df["sub_class_encoded"]

### 3 - Modelling

#### Clusters

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cluster, test_size=0.2, stratify=y_cluster
)

# Define models to evaluate
cluster_models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
}

best_model = None
best_acc = 0

for name, model in cluster_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_model = model
    print(
        f"{name} - Accuracy: {accuracy_score(y_test, y_pred)}, F1 Score: {f1_score(y_test, y_pred, average='weighted')}"
    )
    print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}\n")

Logistic Regression - Accuracy: 0.96875, F1 Score: 0.9678485576923076
Confusion Matrix:
 [[13  0  0  0]
 [ 0 19  0  0]
 [ 0  0 19  0]
 [ 0  1  1 11]]

SVM - Accuracy: 0.953125, F1 Score: 0.9510495052954291
Confusion Matrix:
 [[13  0  0  0]
 [ 0 19  0  0]
 [ 0  0 19  0]
 [ 0  1  2 10]]

Naive Bayes - Accuracy: 0.9375, F1 Score: 0.9376003556910568
Confusion Matrix:
 [[11  1  1  0]
 [ 0 19  0  0]
 [ 0  1 18  0]
 [ 0  1  0 12]]



#### Saving the best performing model

In [15]:
best_model, best_acc

(LogisticRegression(), 0.96875)

In [24]:
joblib.dump(best_model, "../models/ML Models/cluster_LR_model.pkl")

['../models/ML Models/cluster_LR_model.pkl']

#### Sub Classes

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_sub_class, test_size=0.2, stratify=y_sub_class
)

# Define models to evaluate
subclass_models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
}

best_model = None
best_acc = 0

for name, model in cluster_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_model = model
    print(
        f"{name} - Accuracy: {accuracy_score(y_test, y_pred)}, F1 Score: {f1_score(y_test, y_pred, average='weighted')}"
    )
    print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}\n")

Logistic Regression - Accuracy: 0.984375, F1 Score: 0.9842948717948719
Confusion Matrix:
 [[6 0 0 0 0 0 0 0 0 0]
 [0 6 0 1 0 0 0 0 0 0]
 [0 0 6 0 0 0 0 0 0 0]
 [0 0 0 7 0 0 0 0 0 0]
 [0 0 0 0 7 0 0 0 0 0]
 [0 0 0 0 0 6 0 0 0 0]
 [0 0 0 0 0 0 6 0 0 0]
 [0 0 0 0 0 0 0 7 0 0]
 [0 0 0 0 0 0 0 0 6 0]
 [0 0 0 0 0 0 0 0 0 6]]

SVM - Accuracy: 0.953125, F1 Score: 0.9539658258408258
Confusion Matrix:
 [[6 0 0 0 0 0 0 0 0 0]
 [0 6 0 1 0 0 0 0 0 0]
 [0 0 6 0 0 0 0 0 0 0]
 [0 0 0 7 0 0 0 0 0 0]
 [0 0 0 0 7 0 0 0 0 0]
 [0 0 1 0 0 5 0 0 0 0]
 [0 0 0 0 0 0 6 0 0 0]
 [0 0 1 0 0 0 0 6 0 0]
 [0 0 0 0 0 0 0 0 6 0]
 [0 0 0 0 0 0 0 0 0 6]]

Naive Bayes - Accuracy: 0.921875, F1 Score: 0.9113782051282051
Confusion Matrix:
 [[6 0 0 0 0 0 0 0 0 0]
 [0 6 0 1 0 0 0 0 0 0]
 [3 0 2 0 1 0 0 0 0 0]
 [0 0 0 7 0 0 0 0 0 0]
 [0 0 0 0 7 0 0 0 0 0]
 [0 0 0 0 0 6 0 0 0 0]
 [0 0 0 0 0 0 6 0 0 0]
 [0 0 0 0 0 0 0 7 0 0]
 [0 0 0 0 0 0 0 0 6 0]
 [0 0 0 0 0 0 0 0 0 6]]



In [26]:
best_model, best_acc

(LogisticRegression(), 0.984375)

In [27]:
joblib.dump(best_model, "../models/ML Models/subclass_LR_model.pkl")

['../models/ML Models/subclass_LR_model.pkl']

#### Saving the vectorizer and label encoders

In [28]:
joblib.dump(vectorizer, "../models/ML Models/tfidf_vectorizer.pkl")

['../models/ML Models/tfidf_vectorizer.pkl']

In [38]:
joblib.dump(le_cluster, "../models/ML Models/cluster_label_encoder.pkl")
joblib.dump(le_sub_class, "../models/ML Models/subclass_label_encoder.pkl")

['../models/ML Models/subclass_label_encoder.pkl']

### 4 - Evaluating the models

In [29]:
cluster_LR = joblib.load("../models/ML Models/cluster_LR_model.pkl")
subclass_LR = joblib.load("../models/ML Models/subclass_LR_model.pkl")

In [30]:
def predict_cluster_and_subclass(new_prompt):

    processed_prompt = preprocess_text(new_prompt)
    X_new = vectorizer.transform([processed_prompt])

    predicted_cluster = cluster_LR.predict(X_new)
    predicted_sub_class = subclass_LR.predict(X_new)

    # Convert the predicted cluster from its numerical label back to the original string label
    predicted_cluster_label = le_cluster.inverse_transform(predicted_cluster)
    predicted_sub_class_label = le_sub_class.inverse_transform(predicted_sub_class)

    return predicted_cluster_label[0], predicted_sub_class_label[0]

In [31]:
predict_cluster_and_subclass("How to use OpenAI's API within Streamlit?")

('Programming and Development', 'API Integration')

#### Printing the confidence scores

In [39]:
loaded_vector = joblib.load("../models/ML Models/tfidf_vectorizer.pkl")
cluster_encoder = joblib.load("../models/ML Models/cluster_label_encoder.pkl")
subclass_encoder = joblib.load("../models/ML Models/subclass_label_encoder.pkl")

In [42]:
def predict_cluster_and_subclass(new_prompt):
    # Preprocess the new prompt
    processed_prompt = preprocess_text(new_prompt)

    # Transform the processed prompt using the loaded vectorizer
    X_new = loaded_vector.transform([processed_prompt])

    # Predict the cluster
    cluster_probabilities = cluster_LR.predict_proba(X_new)
    subclass_probabilities = subclass_LR.predict_proba(X_new)

    # Get the index of the highest probability (the predicted class)
    cluster_predicted_index = cluster_probabilities.argmax(axis=1)
    subclass_predicted_index = subclass_probabilities.argmax(axis=1)

    # Get the confidence score (highest probability)
    cluster_confidence_score = cluster_probabilities[0][cluster_predicted_index]
    subclass_confidence_score = subclass_probabilities[0][subclass_predicted_index]

    # Convert the predicted cluster from its numerical label back to the original string label
    predicted_cluster_label = cluster_encoder.inverse_transform(cluster_predicted_index)
    predicted_subclass_label = subclass_encoder.inverse_transform(
        subclass_predicted_index
    )

    return (
        predicted_cluster_label[0],
        cluster_confidence_score,
        predicted_subclass_label[0],
        subclass_confidence_score,
    )

In [48]:
predict_cluster_and_subclass("What is digital marketing?")

('Music and Audio', array([0.36561371]), 'Music Creation', array([0.16015151]))

In [49]:
predict_cluster_and_subclass("How to cook a pizza?")

('Music and Audio',
 array([0.31907863]),
 'Coding and Programming Assistance',
 array([0.12500426]))

In [54]:
predict_cluster_and_subclass("What can I do to improve my mental health?")

('Communication', array([0.79150815]), 'Mental Health', array([0.64666375]))

In [56]:
predict_cluster_and_subclass("What can I do to improve my business?")

('Business and Productivity',
 array([0.29935747]),
 'Presentation Creation',
 array([0.2522915]))