### 1 - Importing necessary libraries

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from LearningAlgorithms import ClassificationAlgorithms
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

### 2 - Feature Engineering

In [2]:
df = pd.read_csv("../data/raw/prompts_v1.csv")
df.head()

Unnamed: 0,prompt,cluster,sub_class
0,How can I integrate a chatbot into my website ...,Communication,Chatbots and Virtual Assistants
1,What are the best practices for designing a co...,Communication,Chatbots and Virtual Assistants
2,"Can chatbots handle complex queries, or are th...",Communication,Chatbots and Virtual Assistants
3,What platforms are available for building cust...,Communication,Chatbots and Virtual Assistants
4,How do I ensure my chatbot understands user in...,Communication,Chatbots and Virtual Assistants


In [3]:
X = df["prompt"]
y_sub_class = df["sub_class"]
y_cluster = df["cluster"]

##### Encoding the data

In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder_sub_class = LabelEncoder()
label_encoder_cluster = LabelEncoder()
y_sub_class = label_encoder_sub_class.fit_transform(y_sub_class)
y_cluster = label_encoder_cluster.fit_transform(y_cluster)

In [5]:
y_sub_class

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

In [6]:
y_cluster

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### Splitting the data

In [7]:
(
    X_train,
    X_test,
    y_sub_class_train,
    y_sub_class_test,
    y_cluster_train,
    y_cluster_test,
) = train_test_split(X, y_sub_class, y_cluster, test_size=0.2, random_state=42)

In [15]:
y_sub_class_train, y_cluster_train = pd.DataFrame(y_sub_class_train), pd.DataFrame(
    y_cluster_train
)

#### Vectorizing the prompts

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [9]:
X_train_vectors

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2529 stored elements and shape (252, 733)>

In [10]:
X_test_vectors

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 541 stored elements and shape (64, 733)>

### 3 - Training the model

In [11]:
learner = ClassificationAlgorithms()

#### KNN Algorithm

In [16]:
(
    pred_sub_class_train,
    pred_sub_class_test,
    prob_sub_class_train,
    prob_sub_class_test,
) = learner.k_nearest_neighbor(X_train_vectors, y_sub_class_train, X_test_vectors)

In [18]:
pred_sub_class_test

array([6, 3, 7, 5, 5, 8, 1, 1, 3, 5, 2, 2, 7, 0, 9, 2, 7, 7, 7, 3, 1, 5,
       9, 8, 4, 1, 2, 0, 5, 4, 4, 9, 3, 0, 8, 8, 6, 1, 1, 9, 2, 6, 5, 5,
       3, 3, 1, 6, 4, 7, 1, 5, 5, 2, 5, 3, 4, 3, 4, 9, 8, 7, 0, 0])

In [19]:
performance_test_knn = accuracy_score(y_sub_class_test, pred_sub_class_test)

In [20]:
performance_test_knn

0.8125

In [24]:
prob_sub_class_test.iloc[0].max()

0.3

In [25]:
# 5. Function to Predict for a Custom Sentence
def predict_custom_sentence(
    sentence,
    sub_class_model,
    cluster_model,
    vectorizer,
    label_encoder_sub_class,
    label_encoder_cluster,
):
    """Predicts sub-class and cluster for a custom sentence."""

    # 1. Vectorize the sentence
    sentence_vectorized = vectorizer.transform([sentence])

    # 2. Predict Sub-class
    pred_sub_class = sub_class_model.predict(sentence_vectorized)[0]
    prob_sub_class = sub_class_model.predict_proba(sentence_vectorized)[0]
    confidence_sub_class = prob_sub_class.max()

    # 3. Predict Cluster
    pred_cluster = cluster_model.predict(sentence_vectorized)[0]
    prob_cluster = cluster_model.predict_proba(sentence_vectorized)[0]
    confidence_cluster = prob_cluster.max()

    # 4. Decode Predictions
    decoded_sub_class = label_encoder_sub_class.inverse_transform([pred_sub_class])[0]
    decoded_cluster = label_encoder_cluster.inverse_transform([pred_cluster])[0]

    return decoded_sub_class, confidence_sub_class, decoded_cluster, confidence_cluster


# 6. Example Usage
custom_sentence = "Write me a Python script to automate sending emails."
sub_class, sub_class_conf, cluster, cluster_conf = predict_custom_sentence(
    custom_sentence,
    learner.k_nearest_neighbor,  # Replace with your trained sub-class model
    learner.k_nearest_neighbor,  # Replace with your trained cluster model
    vectorizer,
    label_encoder_sub_class,
    label_encoder_cluster,
)

print(f"Predicted Sub-class: {sub_class} (Confidence: {sub_class_conf:.4f})")
print(f"Predicted Cluster: {cluster} (Confidence: {cluster_conf:.4f})")

AttributeError: 'function' object has no attribute 'predict'

In [26]:
df

Unnamed: 0,prompt,cluster,sub_class
0,How can I integrate a chatbot into my website ...,Communication,Chatbots and Virtual Assistants
1,What are the best practices for designing a co...,Communication,Chatbots and Virtual Assistants
2,"Can chatbots handle complex queries, or are th...",Communication,Chatbots and Virtual Assistants
3,What platforms are available for building cust...,Communication,Chatbots and Virtual Assistants
4,How do I ensure my chatbot understands user in...,Communication,Chatbots and Virtual Assistants
...,...,...,...
311,Draft an email to promote an upcoming sale to ...,Business and Productivity,Email Generation
312,Write an email to request a meeting with a pot...,Business and Productivity,Email Generation
313,Compose an email to express appreciation to a ...,Business and Productivity,Email Generation
314,Generate an email to solicit feedback from emp...,Business and Productivity,Email Generation


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   prompt     316 non-null    object
 1   cluster    316 non-null    object
 2   sub_class  316 non-null    object
dtypes: object(3)
memory usage: 7.5+ KB


In [28]:
###

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [32]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to C:\Users\win
[nltk_data]     11\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\win
[nltk_data]     11\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\win
[nltk_data]     11\AppData\Roaming\nltk_data...


True

In [30]:
df = pd.read_csv("../data/raw/prompts_v1.csv")
df.head()

Unnamed: 0,prompt,cluster,sub_class
0,How can I integrate a chatbot into my website ...,Communication,Chatbots and Virtual Assistants
1,What are the best practices for designing a co...,Communication,Chatbots and Virtual Assistants
2,"Can chatbots handle complex queries, or are th...",Communication,Chatbots and Virtual Assistants
3,What platforms are available for building cust...,Communication,Chatbots and Virtual Assistants
4,How do I ensure my chatbot understands user in...,Communication,Chatbots and Virtual Assistants


In [33]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [36]:
# Preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back to string
    processed_text = " ".join(tokens)

    return processed_text

In [37]:
df["processed_prompt"] = df["prompt"].apply(preprocess_text)

In [38]:
df

Unnamed: 0,prompt,cluster,sub_class,processed_prompt
0,How can I integrate a chatbot into my website ...,Communication,Chatbots and Virtual Assistants,integrate chatbot website customer support
1,What are the best practices for designing a co...,Communication,Chatbots and Virtual Assistants,best practice designing conversational flow vi...
2,"Can chatbots handle complex queries, or are th...",Communication,Chatbots and Virtual Assistants,chatbots handle complex query better simple task
3,What platforms are available for building cust...,Communication,Chatbots and Virtual Assistants,platform available building custom chatbots
4,How do I ensure my chatbot understands user in...,Communication,Chatbots and Virtual Assistants,ensure chatbot understands user intent accurately
...,...,...,...,...
311,Draft an email to promote an upcoming sale to ...,Business and Productivity,Email Generation,draft email promote upcoming sale customer base
312,Write an email to request a meeting with a pot...,Business and Productivity,Email Generation,write email request meeting potential investor
313,Compose an email to express appreciation to a ...,Business and Productivity,Email Generation,compose email express appreciation mentor guid...
314,Generate an email to solicit feedback from emp...,Business and Productivity,Email Generation,generate email solicit feedback employee new p...


In [39]:
# Feature Extraction

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df["processed_prompt"])

In [40]:
# Label Encoding
le_cluster = LabelEncoder()
le_sub_class = LabelEncoder()

df["cluster_encoded"] = le_cluster.fit_transform(df["cluster"])
df["sub_class_encoded"] = le_sub_class.fit_transform(df["sub_class"])

# The labels you will use for training
y_cluster = df["cluster_encoded"]
y_sub_class = df["sub_class_encoded"]

In [43]:
# Cluster model

X_train, X_test, y_train, y_test = train_test_split(
    X, y_cluster, test_size=0.2, stratify=y_cluster
)

# Define models to evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(
        f"{name} - Accuracy: {accuracy_score(y_test, y_pred)}, F1 Score: {f1_score(y_test, y_pred, average='weighted')}"
    )
    print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}\n")

Logistic Regression - Accuracy: 0.9375, F1 Score: 0.9358251463210703
Confusion Matrix:
 [[13  0  0  0]
 [ 0 19  0  0]
 [ 0  1 18  0]
 [ 0  1  2 10]]

SVM - Accuracy: 0.9375, F1 Score: 0.9356929347826086
Confusion Matrix:
 [[12  1  0  0]
 [ 0 19  0  0]
 [ 0  0 19  0]
 [ 0  1  2 10]]

Naive Bayes - Accuracy: 0.875, F1 Score: 0.8732541235937976
Confusion Matrix:
 [[ 9  4  0  0]
 [ 0 19  0  0]
 [ 0  1 18  0]
 [ 0  1  2 10]]



In [45]:
kfold = StratifiedKFold(n_splits=5)
for name, model in models.items():
    cv_results = cross_val_score(model, X, y_cluster, cv=kfold, scoring="f1_weighted")
    print(
        f"{name} - Cross-Validated F1 Score: {cv_results.mean()} ({cv_results.std()})"
    )

Logistic Regression - Cross-Validated F1 Score: 0.9369231245582176 (0.046766422800450236)
SVM - Cross-Validated F1 Score: 0.9328057211693563 (0.06001816352209688)
Naive Bayes - Cross-Validated F1 Score: 0.9002699927649486 (0.07742751425161094)


In [49]:
log_reg = models["Logistic Regression"]
loaded_vector = vectorizer

In [73]:
# 3. Making predictions on new data
def predict_cluster_and_subclass(new_prompt):
    # Preprocess the new prompt
    processed_prompt = preprocess_text(new_prompt)

    # Transform the processed prompt using the loaded vectorizer
    X_new = loaded_vector.transform([processed_prompt])

    # Predict the cluster
    probabilities = log_reg.predict_proba(X_new)

    # Get the index of the highest probability (the predicted class)
    predicted_index = probabilities.argmax(axis=1)

    # Get the confidence score (highest probability)
    confidence_score = probabilities[0][predicted_index]

    # Convert the predicted cluster from its numerical label back to the original string label
    predicted_cluster_label = le_cluster.inverse_transform(predicted_index)

    return predicted_cluster_label[0], confidence_score, probabilities, predicted_index

In [74]:
x, y, z, a = predict_cluster_and_subclass("How to create an API?")

In [75]:
x, y

('Programming and Development', array([0.71459811]))

In [72]:
z

array([[0.08806466, 0.06255716, 0.13478007, 0.71459811]])

In [64]:
a

array([3], dtype=int64)