In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the cleaned dataset
data = pd.read_csv("Movie_Questions_Base.csv")

# Define the features (questions) and labels (categories)
X = data['question']
y = data['category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train an SVM classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.9840425531914894
Classification Report:
                precision    recall  f1-score   support

       Factual       0.96      1.00      0.98        52
    Multimedia       0.98      1.00      0.99        42
Recommendation       1.00      1.00      1.00        42
     Unrelated       1.00      0.94      0.97        52

      accuracy                           0.98       188
     macro avg       0.98      0.99      0.98       188
  weighted avg       0.98      0.98      0.98       188



In [9]:
import torch

# Save the trained SVM model and vectorizer
torch.save({
    'svm_model': svm_model,
    'vectorizer': vectorizer
}, "svm_question_classifier.pth")






In [10]:
import torch
# Load the model and vectorizer
checkpoint = torch.load("svm_question_classifier.pth")
loaded_model = checkpoint['svm_model']
loaded_vectorizer = checkpoint['vectorizer']

  checkpoint = torch.load("svm_question_classifier.pth")


In [None]:
# Use the loaded model for inference
new_questions = ["Show me Cillian Murphy"]
new_questions_tfidf = loaded_vectorizer.transform(new_questions)
predictions = loaded_model.predict(new_questions_tfidf)
print(predictions)

['Unrelated']


In [48]:
from enum import Enum
import re

EMBEDDING_REL_MAPPING = {
    "director": ["director", "directed", "directs", "direct"],
    "award": ["award", "oscar", "prize"],
    'publication date': ['release', 'date', 'released', 'releases','release date', 'publication', 'launch', 'broadcast','launched'],
    'executive producer': ['showrunner', 'executive producer'],
    'screenwriter': ['screenwriter', 'scriptwriter', 'writer', 'story'],
    'film editor': ['editor', 'film editor'],
    'box office': ['box', 'office', 'funding', 'box office'],
    'cost': ['budget', 'cost'],
    'nominated for': ['nomination', 'award', 'finalist', 'shortlist', 'selection', 'nominated for'],
    'production company': ['company', 'company of production', "produced", 'production company'],
    'country of origin': ['origin', 'country', 'country of origin'],
    'cast member' :['actor', 'actress', 'cast', 'cast member'],
    'genre': ['type', 'kind', 'genre'],
}


class QuestionType(Enum):
    FACTUAL = "Factual"
    RECOMMENDATION = "Recommendation"
    MULTIMEDIA = "Multimedia"
    UNRELATED = "Unrelated"
    
def _get_question_type(user_query) -> QuestionType:

    fall_back_type = QuestionType.FACTUAL

    # Use the loaded model for inference
    new_questions = [user_query]
    new_questions_tfidf = loaded_vectorizer.transform(new_questions)
    predictions = loaded_model.predict(new_questions_tfidf)
    
    if not predictions:
        return fall_back_type
    
    type = predictions[0]
    
    match type:
        case "Factual": return QuestionType.FACTUAL
        case "Recommendation": return QuestionType.RECOMMENDATION
        case "Multimedia": return QuestionType.MULTIMEDIA
        case "Unrelated": return _double_check_question_type_for_unRelated(user_query)

def _double_check_question_type_for_unRelated(user_query) -> QuestionType:
    user_query = re.sub(r'[^a-zA-Z0-9 ]', '', user_query.lower().strip())

    factual_keywords = {"language", "mpaa"}
    for key, keywords in EMBEDDING_REL_MAPPING.items():
        factual_keywords.update(keywords)

    # Keywords for multimedia-related queries
    multimedia_keywords = {"show", "display", "view", "present", "see"}
    # Keywords for recommendation-related queries
    recommendation_keywords = {"recommend", "suggest", "advise", "offer", "favor", "i like", "i like"}
    
    if any(keyword in user_query for keyword in factual_keywords):
        return QuestionType.FACTUAL
    
    if any(keyword in user_query for keyword in multimedia_keywords):
        return QuestionType.MULTIMEDIA
    
    if any(keyword in user_query for keyword in recommendation_keywords):
        return QuestionType.RECOMMENDATION

    return QuestionType.UNRELATED



In [None]:
# Use the loaded model for inference
user_query = "Who is the executive producer of X-Men: First Class? "
new_questions = [user_query]
new_questions_tfidf = loaded_vectorizer.transform(new_questions)
predictions = loaded_model.predict(new_questions_tfidf)
print(predictions)
_get_question_type(user_query)

['Unrelated']


<QuestionType.FACTUAL: 'Factual'>