<a href="https://colab.research.google.com/github/RuchitaBhoir/codsoft/blob/main/movie_genre_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Movie Genre Classification**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
import nltk
import zipfile
import io

# Fetch the NLTK stopword corpus into the local nltk_data directory.
# NOTE(review): nothing in the visible code reads this corpus -- the TF-IDF
# step passes stop_words='english', which is scikit-learn's built-in list.
# Confirm whether this download is still needed.
nltk.download('stopwords')

# 1. Load data
# Location of the archive and of the training file stored inside it.
zip_file_path = "/content/archive (2).zip"
file_in_zip = "Genre Classification Dataset/train_data.txt"

# Pull the member out of the archive and decode it once; the handles are
# closed before any parsing starts.
with zipfile.ZipFile(zip_file_path, 'r') as zf:
    with zf.open(file_in_zip) as f:
        raw_text = f.read().decode('utf-8')

# The field separator is longer than one character, which only the python
# parsing engine supports, so request that engine explicitly instead of
# relying on the fallback (and its ParserWarning). The pattern also consumes
# any whitespace surrounding ':::' so that titles, genres and plots are not
# left with leading/trailing padding that would end up inside the labels.
df = pd.read_csv(
    io.StringIO(raw_text),
    sep=r'\s*:::\s*',
    engine='python',
    names=['id', 'title', 'genres', 'plot'],
)

print("DataFrame head after loading:")
display(df.head())

# 2. Preprocessing
# Rows lacking either the plot text or the genre label cannot be used.
df.dropna(subset=['plot', 'genres'], inplace=True)
# Turn each comma-separated genre string into a list of labels. Every token
# is stripped (and empty tokens dropped) so that padded values such as
# ' drama ' and 'drama' are treated as the same class.
df['genres'] = df['genres'].apply(
    lambda raw: [token.strip() for token in raw.split(',') if token.strip()]
)

# 3. Text and target
X = df['plot']
y = df['genres']

# 4. Encode target (multi-label): one indicator column per genre.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y)

# 5. Split: hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# 6. Build pipeline: TF-IDF features feeding one logistic regression per genre.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])

# 7. Train model
pipeline.fit(X_train, Y_train)

# 8. Evaluate
# Predictions must be made from the held-out plot texts (X_test); the label
# matrix Y_test is only the reference the predictions are compared against.
Y_pred = pipeline.predict(X_test)
# zero_division=0 reports 0.0 for genres that were never predicted instead of
# emitting an undefined-metric warning for each of them.
print(classification_report(
    Y_test, Y_pred, target_names=mlb.classes_, zero_division=0
))

# 9. Test on new summary
def predict_genres(plot):
    """Predict the genre labels for a single plot summary.

    Args:
        plot: The plot text to classify.

    Returns:
        The result of mapping the model's one-row indicator prediction back
        to genre names with the fitted label binarizer.
    """
    # The pipeline expects a collection of documents, so wrap the single text.
    indicator_rows = pipeline.predict([plot])
    genre_labels = mlb.inverse_transform(indicator_rows)
    return genre_labels

# Example: classify one hand-written summary and show the predicted labels.
example_plot = "A superhero saves the world from alien invasion."
print(f"Predicted genres: {predict_genres(example_plot)}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


               precision    recall  f1-score   support

      action        0.74      0.05      0.10       263
       adult        0.82      0.08      0.15       112
   adventure        0.67      0.03      0.06       139
   animation        0.00      0.00      0.00       104
   biography        0.00      0.00      0.00        61
      comedy        0.74      0.28      0.40      1443
       crime        0.00      0.00      0.00       107
 documentary        0.81      0.67      0.73      2659
       drama        0.69      0.45      0.55      2697
      family        1.00      0.03      0.05       150
     fantasy        0.00      0.00      0.00        74
   game-show        1.00      0.20      0.33        40
     history        0.00      0.00      0.00        45
      horror        0.81      0.31      0.45       431
       music        0.69      0.23      0.34       144
     musical        0.00      0.00      0.00        50
     mystery        0.00      0.00      0.00        56
        n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# New Section