In [2]:
import pandas as pd
# from tqdm import tqdm

In [3]:
# load the plots_and_genres.csv file into a dataframe
df = pd.read_csv('./data/plots_and_genres.csv')
df

FileNotFoundError: [Errno 2] No such file or directory: './data/plots_and_genres.csv'

In [4]:
# count nan values
df.isna().sum()

movie     0
plot      0
genres    0
dtype: int64

In [5]:

genres_to_consider = [
    'Drama',
    'Comedy',
    'Thriller',
    'Romance',
    'Action',
    'Family',
    'Horror',
    'Crime',
    'Adventure',
    'Animation',
    'Fantasy',
    'Sci-Fi',
    'Mystery',
    'Biography',
    'Music',
    'History',
    'War',
    'Western',
    'Sport',
    'Musical',
]

In [6]:
# filter the dataframe to only include the genres we want to consider

def check(row):
    sublist = row['genres']
    # remove quotes from each item in the list
    sublist = [item.strip().strip("'") for item in sublist.strip('][').split(',')]

    # # if even one of the genres in the list is in the genres_to_consider list, return True
    # for genre in sublist:
    #     if genre in genres_to_consider:
    #         return True
    
    # check intersection of two lists
    if len(set(sublist).intersection(set(genres_to_consider))) > 0:
        # set genres to the intersection of the two lists
        row['genres'] = list(set(sublist).intersection(set(genres_to_consider)))
        return True
    
    return False

In [7]:

filtered_df = df[df.apply(check, axis=1)]

In [8]:
# count the number of movies in each genre
genre_counts = {}
for row in filtered_df['genres']:
    for genre in row:
        if genre in genre_counts:
            genre_counts[genre] += 1
        else:
            genre_counts[genre] = 1
genre_counts

{'Comedy': 80780,
 'Action': 27517,
 'Horror': 21274,
 'Drama': 134609,
 'Family': 22423,
 'Sci-Fi': 15520,
 'Romance': 31967,
 'Biography': 14315,
 'Crime': 20968,
 'Music': 13872,
 'Adventure': 19720,
 'War': 8057,
 'Sport': 6636,
 'Animation': 16935,
 'Fantasy': 16442,
 'Mystery': 15237,
 'Thriller': 31244,
 'History': 13341,
 'Musical': 6386,
 'Western': 8005}

In [9]:
# filter out plots that are more than 200 words long
filtered_df = filtered_df[filtered_df['plot'].apply(lambda x: len(x.split(' ')) < 200)]
filtered_df

filtered_df.to_csv('./data/filtered_plots_and_genres.csv', index=False)

Unnamed: 0,movie,plot,genres
0,"""#7DaysLater"" (2013)",#7dayslater is an interactive comedy series fe...,[Comedy]
1,"""#Cake"" (2015)",#CAKE is a hour-long serial narrative comedy a...,[Comedy]
2,"""#DaddyLeaks"" (????)",The life of four close friends in their late t...,[Comedy]
3,"""#Elmira"" (2014)",#Elmira follows the story of a bunch of strang...,[Comedy]
4,"""#Fuga"" (2016)","Months after an apocalyptic event, a group of ...","[Action, Horror, Drama]"
...,...,...,...
347529,� deux (2017),When 2 Musicians throw coins in the same wishi...,"[Musical, Romance]"
347531,�a go�te le ciel (2014),The 'Sky's the Limit' is the story of an 11 ye...,"[Family, Fantasy, Drama]"
347532,� solo un nastro che gira (2017),A woman in crisis finds in a peculiar bar the ...,"[Comedy, Romance, Drama]"
347533,�X-Driver the Movie (2002) (V),"The squeal of smoking tires, the roar of the e...","[Action, Animation]"


In [10]:
# use multi-label binarizer to convert the list of genres into a binary vector
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

# split the data into X and y
X = filtered_df['plot']
y = filtered_df['genres']

# transform the y data
y = mlb.fit_transform(y)
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]])

In [11]:
# train a naive bayes classifier with label powerset
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorize the plot data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X)

In [12]:
X

<259557x784600 sparse matrix of type '<class 'numpy.float64'>'
	with 10260828 stored elements in Compressed Sparse Row format>

In [13]:

# split the data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [14]:
# train the classifier
from sklearn.multiclass import OneVsRestClassifier
classifier = OneVsRestClassifier(GaussianNB(), n_jobs=1)

# train on a subset of the data
n = X_train.shape[0] // 2000
for i in tqdm(range(1999)):
    if i == 0:
        classifier.fit(X_train[:n].toarray(), y_train[:n])
    else:
        classifier.partial_fit(X_train[i*n: (i+1)*n].toarray(), y_train[i*n: (i+1)*n])


In [16]:

# predict the test data

predictions = classifier.predict(X_test[:30])


In [17]:

# evaluate the model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss

print('Accuracy: ', accuracy_score(y_test[:30], predictions))
print('Precision: ', precision_score(y_test[:30], predictions, average='micro'))
print('Recall: ', recall_score(y_test[:30], predictions, average='micro'))
print('F1: ', f1_score(y_test[:30], predictions, average='micro'))
print('Hamming Loss: ', hamming_loss(y_test[:30], predictions))

Accuracy:  0.2
Precision:  0.4166666666666667
Recall:  0.2830188679245283
F1:  0.33707865168539325
Hamming Loss:  0.09833333333333333
