In [4]:
# Install necessary libraries
!pip install pandas scikit-learn nltk




In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import string

# Download NLTK data
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Load dataset from GitHub (update the URL to your dataset's GitHub link)
url = "https://raw.githubusercontent.com/shashankvmaiya/Movie-Genre-Multi-Label-Text-Classification/refs/heads/master/Data/movies_genres.csv"
data = pd.read_csv(url, quotechar='"', delimiter="\t", encoding="utf-8", on_bad_lines="skip")

# Inspect the data
print(data.head())


                                               title  \
0                               "#7DaysLater" (2013)   
1       "#BlackLove" (2015) {Crash the Party (#1.9)}   
2  "#BlackLove" (2015) {Making Lemonade Out of Le...   
3      "#BlackLove" (2015) {Miss Independent (#1.5)}   
4     "#BlackLove" (2015) {Sealing the Deal (#1.10)}   

                                                plot  Action  Adult  \
0   #7dayslater is an interactive comedy series f...       0      0   
1   With just one week left in the workshops, the...       0      0   
2   All of the women start making strides towards...       0      0   
3   All five of these women are independent and s...       0      0   
4   Despite having gone through a life changing p...       0      0   

   Adventure  Animation  Biography  Comedy  Crime  Documentary  ...  \
0          0          0          0       1      0            0  ...   
1          0          0          0       0      0            0  ...   
2          0          0

In [7]:
# Preprocess Text
def preprocess_text(text):
    if not isinstance(text, str):  # Handle missing or non-string values
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = " ".join(word for word in text.split() if word not in stopwords.words('english'))  # Remove stopwords
    return text

# Apply preprocessing
data['cleaned_plot'] = data['plot'].apply(preprocess_text)


In [8]:
# Get all genre columns (binary genre indicators)
genre_columns = [col for col in data.columns if col not in ['title', 'plot', 'plot_lang', 'cleaned_plot']]

# Check the available genres
print("Available genres:", genre_columns)

# Multilabel target format (optional, for better understanding)
data['genres'] = data[genre_columns].apply(lambda x: [genre_columns[i] for i, val in enumerate(x) if val == 1], axis=1)
print(data[['cleaned_plot', 'genres']].head())


Available genres: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']
                                        cleaned_plot        genres
0  dayslater interactive comedy series featuring ...      [Comedy]
1  one week left workshops women consider idea on...  [Reality-TV]
2  women start making strides towards finding ver...  [Reality-TV]
3  five women independent strong willed theyve fa...  [Reality-TV]
4  despite gone life changing process past ten we...  [Reality-TV]


In [9]:
# Features (TF-IDF from cleaned plots) and Targets (binary genre columns)
X = data['cleaned_plot']
y = data[genre_columns]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [10]:
# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [11]:
# Multilabel classification using OneVsRest and Naive Bayes
model = OneVsRestClassifier(MultinomialNB())
model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = model.predict(X_test_tfidf)



In [12]:
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=genre_columns))


Accuracy: 0.22385767310892102

Classification Report:
               precision    recall  f1-score   support

      Action       0.88      0.19      0.32      2451
       Adult       0.00      0.00      0.00        11
   Adventure       0.72      0.14      0.23      2025
   Animation       0.83      0.18      0.29      2238
   Biography       0.00      0.00      0.00       291
      Comedy       0.86      0.35      0.50      6815
       Crime       0.84      0.40      0.54      3094
 Documentary       0.73      0.28      0.41      2397
       Drama       0.77      0.72      0.74      9234
      Family       0.84      0.09      0.16      3096
     Fantasy       0.82      0.09      0.16      1397
   Game-Show       0.87      0.33      0.47       421
     History       0.76      0.09      0.16       498
      Horror       1.00      0.01      0.02       530
       Music       0.89      0.23      0.37       588
     Musical       0.00      0.00      0.00       102
     Mystery       0.77   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
test_plot = "A thrilling adventure where a young hero discovers the power of friendship."
test_tfidf = tfidf.transform([preprocess_text(test_plot)])
test_prediction = model.predict(test_tfidf)

# Map predictions to genre names
predicted_genres = [genre for genre, value in zip(genre_columns, test_prediction[0]) if value == 1]
print("Predicted Genres:", predicted_genres)


Predicted Genres: []
