In [2]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting gdown>=4.0.0 (from nlpaug)
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown, nlpaug
Successfully installed gdown-5.2.0 nlpaug-1.1.11


In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nlpaug.augmenter.word as naw
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Step 1: Load the dataset
# Assuming the dataset is in the same directory as your notebook
df = pd.read_excel('chatbot_5class_dataset.xlsx')

In [5]:
# Display the first few rows and class distribution
print("Dataset Overview:")
print(df.head())
print("\nClass Distribution:")
print(df['merged_category'].value_counts())

Dataset Overview:
                                                text merged_category
0  can you make an llm to talk to my cat tibbles ...        Services
1  i am looking to integrate the openai api into ...        Services
2  do you build intelligent automation tools yes ...        Services
3  im concerend about ai dont worry ai is here to...        Services
4  pouvezvous intgrer lintelligence artificielle ...        Services

Class Distribution:
merged_category
General       207
Info          152
Support        80
Services       76
Engagement     53
Name: count, dtype: int64


In [6]:
# Step 2: Data Preprocessing and Augmentation
# We'll augment the data to balance the classes by generating paraphrased examples for underrepresented categories

# Initialize the SBERT model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to augment text data using synonym replacement
aug = naw.SynonymAug(aug_src='wordnet')

def augment_text(text, num_augmentations=2):
    augmented_texts = [aug.augment(text) for _ in range(num_augmentations)]
    return [text] + [t[0] for t in augmented_texts]

In [7]:
# Augment underrepresented classes (e.g., those with fewer than 40 samples)
min_samples = 40
augmented_texts = []
augmented_labels = []

for category in df['merged_category'].unique():
    category_df = df[df['merged_category'] == category]
    num_samples = len(category_df)
    
    if num_samples < min_samples:
        # Calculate how many new samples we need
        num_to_augment = min_samples - num_samples
        # Select texts to augment
        texts_to_augment = category_df['text'].sample(n=min(num_samples, num_to_augment), replace=True)
        for text in texts_to_augment:
            new_texts = augment_text(text, num_augmentations=2)
            augmented_texts.extend(new_texts)
            augmented_labels.extend([category] * len(new_texts))
    else:
        augmented_texts.extend(category_df['text'].tolist())
        augmented_labels.extend(category_df['merged_category'].tolist())

# Create a new DataFrame with augmented data
augmented_df = pd.DataFrame({'text': augmented_texts, 'merged_category': augmented_labels})

# Display new class distribution
print("\nClass Distribution After Augmentation:")
print(augmented_df['merged_category'].value_counts())


Class Distribution After Augmentation:
merged_category
General       207
Info          152
Support        80
Services       76
Engagement     53
Name: count, dtype: int64


In [8]:
# Step 3: Feature Engineering with SBERT
# Encode the text data into embeddings
embeddings = model.encode(augmented_df['text'].tolist(), show_progress_bar=True)

# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(augmented_df['merged_category'])



Batches:   0%|          | 0/18 [00:00<?, ?it/s]

In [9]:
# Step 4: Train-Test Split
# Use a 90%-10% split to give the model more training data
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.1, random_state=42, stratify=y)

In [10]:

# Step 5: Model Training with SVM
# Use SVM with RBF kernel instead of Logistic Regression
svm = SVC(kernel='rbf', probability=True, class_weight='balanced')

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 1]
}
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)


Best Parameters: {'C': 10, 'gamma': 'scale'}
Best Cross-Validation Accuracy: 0.7280982295830954


In [11]:
# Step 6: Evaluation on Test Set
# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("\nTest Set Accuracy:", accuracy)

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))



Test Set Accuracy: 0.7192982456140351

Classification Report:
              precision    recall  f1-score   support

  Engagement       0.67      0.40      0.50         5
     General       0.81      0.81      0.81        21
        Info       0.72      0.87      0.79        15
    Services       0.56      0.62      0.59         8
     Support       0.67      0.50      0.57         8

    accuracy                           0.72        57
   macro avg       0.68      0.64      0.65        57
weighted avg       0.72      0.72      0.71        57



In [12]:
# Step 7: Cross-Validation for Robustness
# Perform 5-fold cross-validation on the entire dataset
cv_scores = cross_val_score(best_model, embeddings, y, cv=5, scoring='accuracy')
print("\n5-Fold Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


5-Fold Cross-Validation Scores: [0.57894737 0.69298246 0.74561404 0.76106195 0.54867257]
Mean CV Accuracy: 0.6654556745846918
Standard Deviation: 0.08653973326406365


In [13]:

# Step 8: Handling Uncertainty
# Flag predictions with low confidence for manual review
proba = best_model.predict_proba(X_test)
max_proba = np.max(proba, axis=1)
uncertain_indices = np.where(max_proba < 0.6)[0]
print("\nNumber of Uncertain Predictions (confidence < 60%):", len(uncertain_indices))
if len(uncertain_indices) > 0:
    print("Uncertain Predictions (indices):", uncertain_indices)


Number of Uncertain Predictions (confidence < 60%): 17
Uncertain Predictions (indices): [ 5 12 14 17 19 20 21 22 24 33 37 39 40 48 49 50 56]


In [14]:
# Step 9: Save the Model (Optional)
# You can save the model for deployment
import joblib
joblib.dump(best_model, 'svm_chatbot_classifier.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(model, 'sbert_model.pkl')

print("\nModel, label encoder, and SBERT model saved for deployment.")


Model, label encoder, and SBERT model saved for deployment.
