In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Load the movie table from 'movies_initial.csv' and reduce it to what the
# model needs: only the genre and plot columns, no rows where either one is
# missing, and each ', '-separated genre string turned into a list of labels.
data_s = (
    pd.read_csv('movies_initial.csv')[['genre', 'fullplot']]
    .dropna(subset=['genre', 'fullplot'])
    .assign(genre=lambda frame: frame['genre'].str.split(', '))
)

# Continue with a 70% random sample of the cleaned rows; the fixed seed keeps
# the sample identical across runs.
data = data_s.sample(frac=0.7, random_state=42)
print(data.head())
    

                  genre                                           fullplot
33876  [Drama, Romance]  Ronnie's (Miley Cyrus) and her younger brother...
20454          [Comedy]  A middle-aged man's conservative life is distu...
25431   [Action, Drama]  In Los Angeles, an ex-con takes the undergroun...
3451          [Western]  After Confederate officer Blayde Hollister's h...
36880        [Thriller]  An engaged couple's backpacking trip in the Ca...


In [2]:
# Encode every movie's list of genres as one row of a binary indicator
# matrix (one column per distinct genre found in the data).
genre_lists = data['genre']
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(genre_lists)

# Hold out 20% of the plots, together with their label rows, for testing.
# The seed is fixed so the split is reproducible.
plots = data['fullplot']
train_data, test_data, y_train, y_test = train_test_split(
    plots,
    y,
    test_size=0.2,
    random_state=42,
)

# Preview the label rows of the first five training examples.
print(y_train[:5])

[[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]


In [3]:

# Bag-of-words model: English stop words are removed and the vocabulary is
# capped at 5000 features.
vectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary is learned from the training plots only; the test plots are
# then encoded with that same fitted vocabulary.
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

# Preview the entries of the first five training rows.
print("X_train", X_train[:5])

X_train   (0, 410)	1
  (0, 1955)	1
  (0, 1916)	1
  (0, 3044)	1
  (0, 3275)	1
  (0, 4955)	1
  (0, 3912)	2
  (0, 1695)	1
  (0, 1923)	1
  (0, 720)	1
  (0, 4904)	1
  (0, 1477)	1
  (0, 3353)	1
  (0, 2678)	1
  (0, 1579)	1
  (0, 2435)	1
  (0, 217)	1
  (0, 1455)	1
  (0, 1451)	1
  (0, 4416)	1
  (0, 2752)	1
  (0, 2106)	1
  (0, 660)	1
  (0, 2960)	1
  (0, 3523)	1
  :	:
  (4, 1288)	1
  (4, 802)	1
  (4, 1289)	1
  (4, 3233)	1
  (4, 4868)	1
  (4, 2684)	1
  (4, 4132)	1
  (4, 111)	1
  (4, 913)	2
  (4, 880)	1
  (4, 4788)	1
  (4, 2661)	1
  (4, 4987)	1
  (4, 570)	1
  (4, 4511)	3
  (4, 860)	1
  (4, 1657)	1
  (4, 2301)	1
  (4, 1911)	1
  (4, 746)	1
  (4, 3375)	1
  (4, 1816)	1
  (4, 3983)	1
  (4, 2388)	1
  (4, 4646)	1


In [None]:
# Multi-label model: a linear-kernel SVM wrapped in a one-vs-rest classifier,
# fitted on the bag-of-words matrix and the genre indicator matrix.
# n_jobs=-1 asks scikit-learn to use all available CPU cores.
# NOTE(review): scikit-learn documents SVC fit time as growing at least
# quadratically with the number of samples; if this cell is too slow,
# LinearSVC may be worth evaluating -- confirm before switching, since it is
# a different solver and loss.
base_estimator = SVC(kernel='linear')
svm_classifier = OneVsRestClassifier(base_estimator, n_jobs=-1)
svm_classifier.fit(X_train, y_train)

# Genre predictions for the held-out plots.
y_pred = svm_classifier.predict(X_test)

# NOTE(review): y_test is a multilabel indicator matrix, so accuracy_score
# presumably reports exact-match (subset) accuracy here -- confirm that this
# is the intended headline metric.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Per-genre report for the test predictions; rows are labelled with the
# genre names learned by the label binarizer.
print("Classification Report:")
report_text = classification_report(y_test, y_pred, target_names=mlb.classes_)
print(report_text)