In [1]:
import json
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler # type: ignore
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora this script asks for (nltk.download is called once
# per resource, in the same order as before).
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

# Load the JSON training data.
# The rest of the script reads training_data["intents"], where each entry
# provides a "tag" and a list of "patterns".
json_file_path = "intent.json"
try:
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(json_file_path, "r", encoding="utf-8") as file:
        training_data = json.load(file)
except FileNotFoundError:
    print(f"File not found at {json_file_path}. Please check the path.")
    # Stop here: continuing without training_data would only fail later with
    # a confusing NameError that hides the real cause.
    raise
else:
    print("JSON file loaded successfully.")

# Shared text-preprocessing resources used by preprocess_text():
# a set of English stop words (for fast membership tests) and a
# WordNet lemmatizer instance.
stop_words = set(
    stopwords.words("english")
)
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Return *text* lower-cased with everything but ASCII letters and whitespace removed.

    Digits, punctuation and non-ASCII characters are deleted (not replaced by
    a space), so "don't" becomes "dont". Whitespace is left untouched.
    """
    lowered = text.lower()
    # Delete every character that is neither an ASCII letter nor whitespace.
    return re.sub(r"[^a-zA-Z\s]", "", lowered)

def preprocess_text(text):
    """Tokenize *text* and return its lemmatized, non-stop-word tokens as a list.

    The text is lower-cased, split into runs of word characters, filtered
    against the module-level ``stop_words`` set, and each surviving token is
    passed through the module-level ``lemmatizer``.
    """
    kept_tokens = []
    for token in re.findall(r'\b\w+\b', text.lower()):
        if token in stop_words:
            continue  # stop words are dropped before lemmatization
        kept_tokens.append(lemmatizer.lemmatize(token))
    return kept_tokens

# Flatten the training data into two parallel lists: one example pattern per
# entry in `patterns`, and the tag of the intent it belongs to at the same
# index in `intents`.
labelled_patterns = [
    (example, entry["tag"])
    for entry in training_data["intents"]
    for example in entry["patterns"]
]
patterns = [example for example, _ in labelled_patterns]
intents = [tag for _, tag in labelled_patterns]

# Normalise every pattern: strip non-letters first, then tokenize, drop stop
# words, lemmatize, and re-join into a single space-separated string.
cleaned_patterns = list(map(clean_text, patterns))
tokenized_patterns = [
    ' '.join(preprocess_text(cleaned)) for cleaned in cleaned_patterns
]

# Encode the intent tags as integer class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(intents)

# Build the bag-of-words matrix; `words` is the learned vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tokenized_patterns)
words = vectorizer.get_feature_names_out()

# Split data into training, validation, and test sets (80% / 10% / 10%).
# The split is done BEFORE oversampling: RandomOverSampler duplicates rows, so
# resampling first would put copies of the same example into both the training
# and the held-out sets and make the reported accuracies overly optimistic.
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# An intent whose examples all fell into the held-out sets cannot be learned;
# surface that instead of silently training without it.
unseen_labels = sorted(set(y) - set(y_train_raw))
if unseen_labels:
    unseen_tags = list(label_encoder.inverse_transform(unseen_labels))
    print(f"Warning: intents missing from the training split: {unseen_tags}")

# Handle imbalanced data using oversampling (training split only).
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_balanced, y_balanced
print(f"After balancing: {Counter(y_balanced)}")

# Fit a multinomial Naive Bayes model on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Score the model on the validation split.
y_val_pred = nb_classifier.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Score the model on the test split.
y_test_pred = nb_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Prediction function
def predict_intent(query):
    """Return the predicted intent tag for a free-text *query*.

    The query goes through the same pipeline as the training patterns
    (``clean_text`` followed by ``preprocess_text``) so that digits and
    punctuation are handled identically at training and prediction time.
    It is then vectorized with the fitted ``vectorizer`` and classified by
    ``nb_classifier``; the integer class is mapped back to its tag with
    ``label_encoder``.

    Args:
        query: Raw user text.

    Returns:
        The tag of the predicted intent.
    """
    # Training patterns are cleaned before tokenization; do the same here.
    query_tokens = preprocess_text(clean_text(query))
    query_bow = vectorizer.transform([' '.join(query_tokens)])
    predicted_class = nb_classifier.predict(query_bow)
    predicted_intent = label_encoder.inverse_transform(predicted_class)
    return predicted_intent[0]

# Prediction with probabilities
def predict_intent_with_probabilities(query):
    """Return a mapping of intent tag -> predicted probability for *query*.

    The query goes through the same pipeline as the training patterns
    (``clean_text`` followed by ``preprocess_text``) before being vectorized.

    Args:
        query: Raw user text.

    Returns:
        A dict whose keys are intent tags and whose values are the class
        probabilities reported by ``nb_classifier.predict_proba``. Only
        intents the classifier was actually fitted on appear in the result.
    """
    # Training patterns are cleaned before tokenization; do the same here.
    query_tokens = preprocess_text(clean_text(query))
    query_bow = vectorizer.transform([' '.join(query_tokens)])
    class_probabilities = nb_classifier.predict_proba(query_bow)[0]
    # predict_proba columns follow nb_classifier.classes_, which can be a
    # subset of label_encoder.classes_ if some intent never reached the
    # training split; derive the labels from the classifier so each
    # probability is paired with the right tag.
    intent_labels = label_encoder.inverse_transform(nb_classifier.classes_)
    probabilities = dict(zip(intent_labels, class_probabilities))
    return probabilities

# Try the classifier on a sample query.
user_query = "vintage watch"
predicted_intent = predict_intent(user_query)
print(f"Predicted intent for the query '{user_query}': {predicted_intent}")

# Rank the per-intent probabilities from most to least likely.
query_probabilities = predict_intent_with_probabilities(user_query)
sorted_probabilities = dict(
    sorted(
        query_probabilities.items(),
        key=lambda tag_and_prob: tag_and_prob[1],
        reverse=True,
    )
)

# Bar chart of the ranked probabilities.
plt.figure(figsize=(10, 6))
plt.bar(
    sorted_probabilities.keys(),
    sorted_probabilities.values(),
    color='skyblue',
)
plt.title(f'Intent Probability Distribution for Query: "{user_query}"', fontsize=16)
plt.xlabel('Intents', fontsize=14)
plt.ylabel('Probability', fontsize=14)
plt.xticks(rotation=45, fontsize=12)  # tilt the tags so long names stay readable
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'imblearn'