In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from sklearn.metrics import make_scorer, f1_score
import numpy as np
from skmultilearn.problem_transform import LabelPowerset
from sklearn.model_selection import train_test_split, GridSearchCV

In [7]:
# Read the movie table from 'movies_initial.csv', keep only the genre and
# full-plot columns, and discard rows where either of the two is missing.
data = (
    pd.read_csv('movies_initial.csv')[['genre', 'fullplot']]
    .dropna(subset=['genre', 'fullplot'])
)
print(data)

                          genre  \
0            Documentary, Short   
1                         Short   
2      Animation, Comedy, Short   
3            Documentary, Short   
4            Documentary, Short   
...                         ...   
46004                     Drama   
46006               Documentary   
46007                    Horror   
46008  Comedy, Fantasy, Romance   
46010                    Sci-Fi   

                                                fullplot  
0      Performing on what looks like a small wooden s...  
1      A stationary camera looks at a large anvil wit...  
2      One night, Arlequin come to see his lover Colo...  
3      A man (Edison's assistant) takes a pinch of sn...  
4      A man opens the big gates to the Lumi�re facto...  
...                                                  ...  
46004  A post modern theater adaptation of a classic ...  
46006  Musician Jonny Greenwood travels to Rajasthan,...  
46007  A cash strapped student who starts workin

In [8]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Each 'genre' cell is a ', '-separated string. Turn it into a list of genre
# names, then encode those lists as a binary indicator matrix for multi-label
# classification. The binarizer learns its classes from the training rows only
# and is merely applied to the test rows.
train_genres = train_data['genre'].str.split(', ')
test_genres = test_data['genre'].str.split(', ')

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_genres)
y_test = mlb.transform(test_genres)


In [13]:
# Load pre-trained GloVe embeddings into a {word: float32 vector} dict.
# The path below points at the 100-dimensional 'glove.6B.100d.txt'; download
# that file (or another GloVe variant and adjust the path) into the working
# directory before running this cell.
glove_file = 'glove.6B.100d.txt'
word_vectors = {}
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a stray empty line at the end of the
            # file) instead of failing with IndexError on values[0].
            continue
        # First field is the token, the remaining fields are its components.
        word = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        word_vectors[word] = vectors

# Create a function to get the average vector for a document
def get_average_vector(doc, vectors, dim=None):
    """Return the mean embedding of the whitespace-separated words in ``doc``.

    Parameters
    ----------
    doc : str
        Text to embed; it is tokenised with ``str.split()``.
    vectors : mapping of str -> sequence of float
        Word-to-embedding lookup. Words that are not in it are ignored.
        The lookup is exact: no lower-casing or punctuation stripping is done,
        so a token such as ``"Stage,"`` only matches an identical key.
        NOTE(review): if the embedding vocabulary is lower-case only, consider
        normalising ``doc`` before calling this function.
    dim : int, optional
        Length of the zero vector returned when no word of ``doc`` is found.
        When omitted it is inferred from the first embedding in ``vectors``;
        if that is not possible (empty table, or an object without
        ``.values()``) the historical default of 100 is used.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched word vectors, or a zero vector of
        length ``dim`` when nothing matched.
    """
    # Use a separate name so the lookup table passed in as ``vectors`` is not
    # shadowed by the list of matches.
    matched = [vectors[word] for word in doc.split() if word in vectors]
    if matched:
        return np.mean(matched, axis=0)

    if dim is None:
        try:
            # Match the zero vector to the embedding size actually in use so
            # that every document yields a row of the same length.
            dim = len(next(iter(vectors.values())))
        except (AttributeError, StopIteration):
            dim = 100  # previous hard-coded size, kept as the fallback
    return np.zeros(dim)



In [14]:

# Embed every plot as the average of its word vectors, giving one feature row
# per movie for the training split and for the testing split.
X_train, X_test = (
    np.array([get_average_vector(plot, word_vectors) for plot in split['fullplot']])
    for split in (train_data, test_data)
)

# Wrap a random forest (100 trees, fixed seed) in the Label Powerset
# problem transformation and fit it on the training features and labels.
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
classifier = LabelPowerset(classifier=base_forest)
classifier.fit(X_train, y_train)


In [15]:

# Predict the labels for the test set with the fitted classifier.
# The result is compared against y_test below, so its columns are expected to
# follow the same genre order as mlb.classes_.
y_pred = classifier.predict(X_test)

In [16]:

# Display the per-genre precision / recall / F1 report.
# Some genres never get predicted (or have almost no test samples), which makes
# their precision/recall undefined. zero_division=0 reports those scores as
# 0.00 without emitting an undefined-metric warning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

      Action       0.60      0.24      0.34      1105
       Adult       0.00      0.00      0.00         1
   Adventure       0.66      0.10      0.18       752
   Animation       0.77      0.10      0.18       350
   Biography       1.00      0.06      0.12       372
      Comedy       0.54      0.36      0.43      2523
       Crime       0.56      0.21      0.31      1025
 Documentary       0.47      0.62      0.53       727
       Drama       0.59      0.79      0.68      4009
      Family       0.96      0.06      0.12       430
     Fantasy       0.82      0.09      0.16       415
   Film-Noir       0.00      0.00      0.00        85
     History       0.78      0.05      0.09       297
      Horror       0.55      0.21      0.30       860
       Music       0.72      0.09      0.16       255
     Musical       0.80      0.02      0.04       189
     Mystery       0.93      0.08      0.14       509
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:

# Evaluate the performance.
# y_test comes from the MultiLabelBinarizer, i.e. it is a multi-label indicator
# matrix. For such targets scikit-learn documents accuracy_score as subset
# accuracy: a movie counts as correct only when its whole predicted genre set
# matches the true set exactly, so this figure is stricter than the per-genre
# scores in the classification report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.20
