In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Sample data with more examples to improve model performance
summaries = [
    "A thrilling sci-fi adventure in space", "A romantic love story between two strangers",
    "An action-packed journey of a superhero", "A horror tale that will chill your bones",
    "A hilarious comedy about a group of friends", "A dramatic story of a family reunion",
    "A documentary on the wonders of nature", "A fantasy world with wizards and dragons",
    "A crime thriller about a detective solving mysteries", "A musical journey through different cultures"
]
genres = ["Sci-Fi", "Romance", "Action", "Horror", "Comedy", "Drama", "Documentary", "Fantasy", "Thriller", "Musical"]

# Vectorization
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(summaries)
y = np.array(genres)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classification
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion Matrix and Classification Report with zero_division parameter
unique_classes = np.unique(y)
conf_matrix = confusion_matrix(y_test, predictions, labels=unique_classes)
class_report = classification_report(y_test, predictions, labels=unique_classes, target_names=unique_classes, zero_division=1)

print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Accuracy: 0.00%

Confusion Matrix:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]]

Classification Report:
              precision    recall  f1-score   support

      Action       1.00      1.00      1.00       0.0
      Comedy       0.00      1.00      0.00       0.0
 Documentary       1.00      1.00      1.00       0.0
       Drama       0.00      1.00      0.00       0.0
     Fantasy       1.00      1.00      1.00       0.0
      Horror       1.00      1.00      1.00       0.0
     Musical       1.00      1.00      1.00       0.0
     Romance       1.00      0.00      0.00       1.0
      Sci-Fi       1.00      1.00      1.00       0.0
    Thriller       1.00      0.00      0.00       1.0

   micro avg       0.00      0.00      0.00       2.0
   macro avg       0.80      0.80      0.60       2.0
weight

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random

# Function to augment text by replacing some words with synonyms
def augment_text(text, synonyms_dict, replace_prob=0.3):
    words = text.split()
    new_words = [
        synonyms_dict.get(word, word) if random.random() < replace_prob else word
        for word in words
    ]
    return " ".join(new_words)

# Example synonyms dictionary for basic augmentation
synonyms = {
    "adventure": "journey",
    "romantic": "love",
    "thrilling": "exciting",
    "horror": "terrifying",
    "hilarious": "funny",
    "dramatic": "emotional",
    "mysteries": "secrets",
}

# Original sample data
summaries = [
    "A thrilling sci-fi adventure in space", "A romantic love story between two strangers",
    "An action-packed journey of a superhero", "A horror tale that will chill your bones",
    "A hilarious comedy about a group of friends", "A dramatic story of a family reunion",
    "A documentary on the wonders of nature", "A fantasy world with wizards and dragons",
    "A crime thriller about a detective solving mysteries", "A musical journey through different cultures"
]
genres = ["Sci-Fi", "Romance", "Action", "Horror", "Comedy", "Drama", "Documentary", "Fantasy", "Thriller", "Musical"]

# Data augmentation: Create augmented summaries using synonym replacement
augmented_summaries = [augment_text(summary, synonyms) for summary in summaries]
augmented_summaries.extend(summaries)  # Add original and augmented data together
genres = genres * 2  # Double the genres list to match augmented data

# Vectorization with TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(augmented_summaries)
y = np.array(genres)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classifier Ensemble: Combining Naive Bayes and Support Vector Classifier
nb_classifier = MultinomialNB()
svm_classifier = SVC(kernel='linear', probability=True)

# Ensemble with Voting Classifier
ensemble_classifier = VotingClassifier(
    estimators=[('nb', nb_classifier), ('svm', svm_classifier)],
    voting='soft'
)

# Training the ensemble model
ensemble_classifier.fit(X_train, y_train)
predictions = ensemble_classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion Matrix and Classification Report with zero_division parameter
unique_classes = np.unique(y)
conf_matrix = confusion_matrix(y_test, predictions, labels=unique_classes)
class_report = classification_report(y_test, predictions, labels=unique_classes, target_names=unique_classes, zero_division=1)

print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Accuracy: 0.00%

Confusion Matrix:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]

Classification Report:
              precision    recall  f1-score   support

      Action       0.00      1.00      0.00       0.0
      Comedy       1.00      1.00      1.00       0.0
 Documentary       1.00      1.00      1.00       0.0
       Drama       1.00      0.00      0.00       1.0
     Fantasy       1.00      0.00      0.00       1.0
      Horror       1.00      1.00      1.00       0.0
     Musical       1.00      1.00      1.00       0.0
     Romance       1.00      0.00      0.00       1.0
      Sci-Fi       1.00      0.00      0.00       1.0
    Thriller       0.00      1.00      0.00       0.0

   micro avg       0.00      0.00      0.00       4.0
   macro avg       0.80      0.60      0.40       4.0
weight