In [38]:
import pandas as pd
import numpy as np
import joblib
from nltk.corpus import stopwords
stop = stopwords.words("english")
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import json

import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import BinaryRelevance

<h3>Function: Cleaning the corpus</h3>

In [39]:
def clean_text(text, stop_words=None):
    """Normalise a raw plot string into a space-separated string of tokens.

    Steps, in order: lower-case the text; turn tabs and newlines into
    spaces; replace every character that is neither a word character nor
    whitespace with a space; blank out tokens that consist solely of
    digits; split on whitespace; drop stopwords. Stemming is deliberately
    not applied.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``stop`` list
        is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # A set gives O(1) membership tests; scanning the stopword list for
    # every token was the dominant cost on long plots.
    stop_words = set(stop if stop_words is None else stop_words)

    text = text.lower()

    # Tabs and newlines become plain spaces
    text = text.replace("\t", " ").replace("\n", " ")

    # Punctuation -> space (underscores survive because \w matches them)
    text = re.sub(r"[^\w\s]", " ", text)

    # Only stand-alone digit runs are removed; "abc123" is left untouched
    text = re.sub(r"\b\d+\b", " ", text)

    # str.split() already collapses runs of whitespace, so no separate
    # "squeeze multiple spaces" pass is needed before tokenising.
    tokens = [token for token in text.split() if token not in stop_words]

    return " ".join(tokens)

<h3>Clean the corpus</h3>

In [40]:
# NOTE: `movie_subset` is only created in the ten-fold cross-validation cell
# further down, so on a fresh kernel that cell has to run before this one.
# Clean every plot in one pass: Series.map calls clean_text once per value and
# avoids both the per-row Series that iterrows() builds and the repeated
# scalar .loc writes.
movie_subset.loc[:, 'plot'] = movie_subset['plot'].map(clean_text)

<h3>Read the dataset</h3>

In [41]:
# Load the tab-separated movie table; the trailing bare expression makes the
# notebook display its (rows, columns) shape.
movie = pd.read_csv(filepath_or_buffer="../scripts/movie_cleaned.csv", sep="\t")
movie.shape

(22559, 4)

<h3>Binarize labels</h3>

In [42]:
# Split each comma-separated `genres` string into a list of genre names, then
# one-hot encode them: `labels` gets one row per movie and one column per
# genre, with columns ordered as in `mlb.classes_`.
# A comprehension over the column replaces the iterrows() loop, which built a
# full Series per row and reused the name `labels` for two different things
# (a per-row list, then the final matrix).
mlb = MultiLabelBinarizer()
labels_list = [genre_field.split(",") for genre_field in movie['genres']]
labels = mlb.fit_transform(labels_list)

<h3>Sanity check</h3>

In [43]:
# Genre names, in the same order as the columns of `labels`
all_genres = list(mlb.classes_)

# Eyeball the encoding: matrix shape, genre vocabulary, then the first movie's
# encoded row next to its raw genre list.
for sanity_item in (labels.shape, all_genres, labels[0], labels_list[0]):
    print(sanity_item)

(22559, 16)
['Action Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family Film', 'Fantasy', 'Horror', 'Mystery', 'Period piece', 'Romance', 'Science Fiction', 'Thriller', 'War film', 'Western', 'World cinema']
[1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0]
['Action Adventure', 'Science Fiction', 'Thriller', 'Horror']


<h3>Ten-fold cross validation</h3>

In [44]:
# Work on the rows labelled 0..19999 (label-based slice, end inclusive).
# .copy() detaches the subset from `movie`, so later writes to
# `movie_subset` cannot leak back into `movie` or trip pandas'
# chained-assignment warning (which this notebook silences globally).
movie_subset = movie.loc[:19999].copy()

# shuffle=False keeps the folds contiguous and deterministic
kf = KFold(n_splits=10, shuffle=False)
indices = np.array(movie_subset.index)

list_train_index = []
list_test_index = []

# KFold.split yields positional (train, test) index arrays, one pair per fold.
# They match the row labels only while `movie_subset` keeps its default
# 0..n-1 index.
for train_index, test_index in kf.split(indices):
    list_train_index.append(train_index)
    list_test_index.append(test_index)

# Sanity check: one train/test pair per requested fold
assert len(list_train_index) == len(list_test_index) == kf.get_n_splits()

<h3>Train individual logistic regression model for each genre</h3>

In [45]:
# Per-fold outputs, appended in fold order
predicted_results, ground_truth = [], []

for i in range(10):

    # Row selections and label matrices for this fold
    training_data_index, testing_data_index = list_train_index[i], list_test_index[i]
    in_training_fold = movie_subset.index.isin(training_data_index)
    in_testing_fold = movie_subset.index.isin(testing_data_index)
    training_data = movie_subset.loc[in_training_fold]
    testing_data = movie_subset.loc[in_testing_fold]
    training_labels = labels[training_data_index]
    testing_labels = labels[testing_data_index]

    # tf-idf is fitted on the training plots only; the test plots are
    # projected with that same fitted vectorizer.
    tfidf_vectorizer = TfidfVectorizer(lowercase=True, min_df=0.005, ngram_range=(1, 1), max_df=0.9)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(training_data['plot'])
    tfidf_matrix_test = tfidf_vectorizer.transform(testing_data['plot'])

    # Densify both matrices before handing them to the classifier
    training_features = tfidf_matrix_train.toarray()
    testing_features = tfidf_matrix_test.toarray()

    # Fit the binary-relevance wrapper around a class-balanced logistic regression
    print("running on fold ", i+1)
    BinaryClassifier = BinaryRelevance(classifier=LogisticRegression(class_weight='balanced'))
    BinaryClassifier.fit(training_features, training_labels)

    # Probabilities -> hard 0/1 decisions at a 0.5 cut-off (dtype preserved)
    predicted_probs = BinaryClassifier.predict_proba(testing_features).toarray()
    predicted_labels = (predicted_probs >= 0.5).astype(predicted_probs.dtype)

    # Keep this fold's predictions alongside the matching ground truth
    predicted_results.append(predicted_labels)
    ground_truth.append(testing_labels)

running on fold  1
running on fold  2
running on fold  3
running on fold  4
running on fold  5
running on fold  6
running on fold  7
running on fold  8
running on fold  9
running on fold  10


<h3>Calculate the precision/recall/f1 score</h3>

In [48]:
# Collect one record per (fold, genre) and build the DataFrame once at the end.
# Growing a frame with DataFrame.append inside the loop re-copies it on every
# iteration, and the method was removed in pandas 2.0.
score_records = []

# Fold and genre counts come from the data rather than hard-coded 10 / 16
for fold_no, (predicted_labels, true_labels) in enumerate(zip(predicted_results, ground_truth)):

    # Both arrays are indexed [sample, genre]; columns follow `all_genres`
    for genre_index, genre_name in enumerate(all_genres):

        # Column slice = every test sample's 0/1 value for this genre
        per_genre_predicted_label = predicted_labels[:, genre_index]
        per_genre_true_label = true_labels[:, genre_index]

        # Binary metrics for this genre on this fold
        precision = precision_score(per_genre_true_label, per_genre_predicted_label)
        recall = recall_score(per_genre_true_label, per_genre_predicted_label)
        f1 = f1_score(per_genre_true_label, per_genre_predicted_label)

        score_records.append({
            'genre': genre_name,
            'fold_no': fold_no,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        })

# Explicit `columns` keeps the column order (and an empty frame's schema) stable
results = pd.DataFrame(score_records, columns=['genre', 'fold_no', 'precision', 'recall', 'f1'])

In [51]:
# For every genre, print its name followed by the mean precision, recall and
# F1 (in that order) over all folds, each rounded to four decimals.
# NOTE(review): the saved output beneath this cell shows the same three numbers
# for every genre, which looks inconsistent with this per-genre filter --
# likely stale output; re-run to confirm.
for genre in all_genres:
    print(genre)
    results_per_genre = results[results['genre'] == genre]
    for metric in ('precision', 'recall', 'f1'):
        print(np.round(np.mean(results_per_genre[metric]), 4))

Action Adventure
0.7419
0.7519
0.7466
Animation
0.7419
0.7519
0.7466
Comedy
0.7419
0.7519
0.7466
Crime
0.7419
0.7519
0.7466
Drama
0.7419
0.7519
0.7466
Family Film
0.7419
0.7519
0.7466
Fantasy
0.7419
0.7519
0.7466
Horror
0.7419
0.7519
0.7466
Mystery
0.7419
0.7519
0.7466
Period piece
0.7419
0.7519
0.7466
Romance
0.7419
0.7519
0.7466
Science Fiction
0.7419
0.7519
0.7466
Thriller
0.7419
0.7519
0.7466
War film
0.7419
0.7519
0.7466
Western
0.7419
0.7519
0.7466
World cinema
0.7419
0.7519
0.7466
