In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Parse the raw training file into (title, genre, plot) tuples.
# Each usable line is expected to look like: "index ::: title ::: genre ::: plot".
data = []
with open('train_data.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Split on at most three separators: a plot that itself contains
        # " ::: " then stays in one piece instead of producing extra fields
        # and causing the whole record to be discarded by the length check.
        parts = line.strip().split(" ::: ", 3)
        if len(parts) == 4:  # Skip lines that do not carry all four fields
            # The leading index field is not needed downstream.
            _, title, genre, plot = parts
            data.append((title, genre, plot))


In [4]:
# Assemble the parsed records into a frame, then keep only the model input
# (plot text) and the target (genre), discarding rows with missing values.
df = pd.DataFrame(data=data, columns=['title', 'genre', 'plot'])
df = df.loc[:, ['plot', 'genre']].dropna()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['plot'],
    df['genre'],
    test_size=0.2,
    random_state=42,
)


In [5]:
# Convert plot text to TF-IDF features: English stop words are removed and
# the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
# The vocabulary and IDF weights are learned from the training split only;
# the test split is projected with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
# Naive Bayes Classifier: fit on the TF-IDF training features and score on
# the held-out split.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))
# Some genres are never predicted, which leaves their precision undefined.
# zero_division=0 reports those entries as 0.0 (the same value shown before)
# without emitting an UndefinedMetricWarning for each affected average.
print(classification_report(y_test, nb_pred, zero_division=0))


Naive Bayes Accuracy: 0.4784961549416121
              precision    recall  f1-score   support

      action       1.00      0.02      0.04        96
       adult       0.00      0.00      0.00        35
   adventure       0.00      0.00      0.00        58
   animation       0.00      0.00      0.00        31
   biography       0.00      0.00      0.00        15
      comedy       0.54      0.35      0.42       464
       crime       0.00      0.00      0.00        32
 documentary       0.54      0.89      0.67       846
       drama       0.42      0.82      0.55       893
      family       0.00      0.00      0.00        45
     fantasy       0.00      0.00      0.00        25
   game-show       0.00      0.00      0.00        12
     history       0.00      0.00      0.00        12
      horror       0.79      0.10      0.18       148
       music       0.00      0.00      0.00        48
     musical       0.00      0.00      0.00        14
     mystery       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Logistic Regression Classifier: fit on the TF-IDF training features and
# score on the held-out split. max_iter is raised to 200 to give the solver
# more iterations than the library default.
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
# Some genres are never predicted, which leaves their precision undefined.
# zero_division=0 reports those entries as 0.0 (the same value shown before)
# without emitting an UndefinedMetricWarning for each affected average.
print(classification_report(y_test, lr_pred, zero_division=0))

Logistic Regression Accuracy: 0.5243520364568499
              precision    recall  f1-score   support

      action       0.38      0.05      0.09        96
       adult       0.00      0.00      0.00        35
   adventure       0.80      0.07      0.13        58
   animation       0.00      0.00      0.00        31
   biography       0.00      0.00      0.00        15
      comedy       0.47      0.54      0.50       464
       crime       0.00      0.00      0.00        32
 documentary       0.60      0.86      0.71       846
       drama       0.47      0.74      0.58       893
      family       0.00      0.00      0.00        45
     fantasy       0.00      0.00      0.00        25
   game-show       1.00      0.33      0.50        12
     history       0.00      0.00      0.00        12
      horror       0.72      0.36      0.48       148
       music       0.76      0.27      0.40        48
     musical       0.00      0.00      0.00        14
     mystery       0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Support Vector Machine (SVM) Classifier: fit on the TF-IDF training
# features and score on the held-out split.
# The seed matches the one used for the train/test split so that repeated
# runs of the notebook use the same solver randomness.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)
svm_pred = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
# Some genres are never predicted, which leaves their precision undefined.
# zero_division=0 reports those entries as 0.0 (the same value shown before)
# without emitting an UndefinedMetricWarning for each affected average.
print(classification_report(y_test, svm_pred, zero_division=0))

SVM Accuracy: 0.5046995158074623
              precision    recall  f1-score   support

      action       0.41      0.22      0.29        96
       adult       0.38      0.17      0.24        35
   adventure       0.45      0.17      0.25        58
   animation       0.17      0.06      0.09        31
   biography       0.00      0.00      0.00        15
      comedy       0.44      0.51      0.47       464
       crime       0.20      0.03      0.05        32
 documentary       0.64      0.77      0.70       846
       drama       0.50      0.62      0.55       893
      family       0.20      0.09      0.12        45
     fantasy       0.00      0.00      0.00        25
   game-show       0.62      0.42      0.50        12
     history       0.50      0.08      0.14        12
      horror       0.57      0.47      0.52       148
       music       0.46      0.33      0.39        48
     musical       0.00      0.00      0.00        14
     mystery       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Step 4: Define a Function to Predict Genre for New Plot Summaries

def predict_genre(plot_summary, model=None, vectorizer=None):
    """Predict the genre label for a single plot summary.

    Args:
        plot_summary: Plot text to classify.
        model: Fitted classifier exposing ``predict``. When omitted, the
            notebook-level ``lr_model`` is used.
        vectorizer: Fitted vectorizer exposing ``transform``. When omitted,
            the notebook-level ``tfidf`` is used. The model is expected to
            have been trained on features from this same vectorizer.

    Returns:
        The first (and only) element of the model's prediction for the
        summary.
    """
    # Defaults are resolved at call time so the function always picks up the
    # most recently fitted notebook-level objects.
    if model is None:
        model = lr_model
    if vectorizer is None:
        vectorizer = tfidf

    # The vectorizer works on a collection of documents, so wrap the single
    # summary in a list.
    plot_summary_tfidf = vectorizer.transform([plot_summary])

    # One document in, one prediction out: unwrap it.
    genre_prediction = model.predict(plot_summary_tfidf)
    return genre_prediction[0]

In [16]:
# Two sample plot summaries. Adjacent string literals are concatenated by
# Python, so each summary is still a single string.
new_plot = (
    "In the rugged frontier town of Dusty Ridge, a former outlaw seeks redemption by taking on the role of sheriff. "
    "When a ruthless gang of bandits, led by his old partner, threatens the town, he must confront his dark past to protect the people he now calls his own. "
    "Alongside a local rancher and a quick-drawing drifter, he rides out to face the gang in a showdown. "
    "Amidst gunfights, betrayal, and the harsh landscape of the Old West, he struggles to bring justice and earn the trust of the townsfolk, ultimately proving that even the most hardened men can find redemption."
)

nn = (
    "A middle-aged woman, struggling with the recent death of her husband, moves back to her hometown to start over. "
    "There, she reconnects with old friends, rekindles her relationship with her estranged daughter, and begins volunteering at a community center. "
    "As she helps others in her town confront their challenges, she slowly finds healing and purpose in her own life. "
    "Through moments of sorrow, joy, and personal growth, she learns to accept her loss and discovers inner strength she never knew she had."
)

# Only the second summary (`nn`) is classified here; `new_plot` is defined
# but not passed to the model in this cell.
predicted_genre = predict_genre(nn)
print(f"Predicted genre for the input plot: {predicted_genre}")


Predicted genre for the input plot: drama
