# Multi-Label Classification

In [None]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

%matplotlib inline
pd.set_option('display.max_colwidth', 300)

# Dataset
We will be using the CMU Movie Summary Corpus open dataset for this notebook. This dataset contains a list of movies and their genres. We can exploit movie summaries for predicting movie genres. 

In [None]:
# Folder holding the corpus files. Later cells append file names to this string
# directly, so the trailing slash must stay.
data_dir = '/dsa/data/DSA-8410/MovieSummaries/'

In [None]:

# The metadata file is tab-separated and is read without a header row,
# so the columns come back numbered until they are renamed.
metadata_path = data_dir + "movie.metadata.tsv"
meta = pd.read_csv(metadata_path, sep='\t', header=None)
meta.head()

## Set the proper column name for the dataframe.

In [None]:
# give the nine numbered metadata columns readable names
meta_column_names = [
    "movie_id", "freebase_movie_id", "movie_name",
    "release_date", "revenue", "runtime",
    "languages", "countries", "genre",
]
meta.columns = meta_column_names
meta.head()

# Load movie plots
The movie plot is in a different file. We need to load the plot separately.

In [None]:
# Read the tab-separated plot file; each row is [movie id, plot text].
# The encoding is stated explicitly so the read does not depend on the platform
# default, and newline='' is what the csv module asks for when it is handed a
# file object.
# NOTE(review): assumes the corpus file is UTF-8 — confirm.
with open(data_dir + "plot_summaries.txt", 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, dialect='excel-tab')
    plots = [row for row in tqdm(reader)]

# extract movie ids and plot summaries
movie_id = [row[0] for row in plots]
plot = [row[1] for row in plots]

# create dataframe
movies = pd.DataFrame({'movie_id': movie_id, 'plot': plot})
movies.head()

# Data Exploration and Pre-processing
Now add the meta information to the movies dataframe. 

In [None]:

# the plot file was read through csv.reader, so its movie ids are strings;
# cast the metadata ids to str so the join keys have the same type
meta['movie_id'] = meta['movie_id'].astype(str)

# join movie name and genre onto the plots by movie id (default inner join)
meta_subset = meta[['movie_id', 'movie_name', 'genre']]
movies = movies.merge(meta_subset, on='movie_id')

movies.head()

In [None]:

# look at the raw genre value of the row labelled 0
movies.loc[0, 'genre']


The genre tags are stored as JSON strings. We need to convert each JSON string to a Python list.

In [None]:
# every 'genre' cell is a JSON object; parse it and keep only its values,
# giving one list of genre tags per movie
genres = [list(json.loads(raw_genre).values()) for raw_genre in movies['genre']]

# add to 'movies' dataframe
movies['genre_new'] = genres

In [None]:
# preview the frame, which now carries the parsed 'genre_new' lists
movies.head()

# T1. Drop movies which don't have any genre information

Dropping the movies which don't have any information about tags.

In [None]:
# remove samples with 0 genre tags (rows whose `genre_new` list is empty)


# List all genres

In [None]:
# get all genre tags in one flat list
# (a nested comprehension is linear in the number of tags, whereas
#  sum(genres, []) copies the growing list again on every addition)
all_genres = [tag for movie_tags in genres for tag in movie_tags]
len(set(all_genres))

There are around 363 genres. This is too many. To reduce the computing load, we will use only the top 50 genres for prediction.


In [None]:
# count how often each genre tag occurs
all_genres = nltk.FreqDist(all_genres)

# create dataframe
all_genres_df = pd.DataFrame({'Genre': list(all_genres.keys()),
                              'Count': list(all_genres.values())})

# keep the 50 most frequent genres
g = all_genres_df.nlargest(columns="Count", n = 50)

plt.figure(figsize=(12,15))
ax = sns.barplot(data=g, x= "Count", y = "Genre")
# genres are listed along the y axis and their counts run along x
ax.set(xlabel = 'Count', ylabel = 'Genre')
plt.show()

In [None]:
selected_genre = list(g['Genre'])
# built once, outside the loop, so each membership test below is O(1)
selected_genre_set = set(selected_genre)

# For every movie keep only the tags that are among the selected genres.
# dict.fromkeys drops repeated tags while keeping their original order, so the
# stored lists come out the same on every run (iteration order of a set of
# strings can differ between Python processes).
tmp_genres = [
    list(dict.fromkeys(tag for tag in tags if tag in selected_genre_set))
    for tags in movies['genre_new']
]

# add to 'movies' dataframe
movies['chosen_genre'] = tmp_genres

movies.head()

# T2. Drop rows that don't have any top-50 genres

We dropped the genres which are not in the top 50 list. So some movies now don't belong to any of these genres. We need to drop these movies. 

In [None]:
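# remove samples with 0 genre tags (rows whose `chosen_genre` list is empty)
# NOTE: the cells below read from a frame named `movies_new`, so store the filtered result under that name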


# T3. Clean the movie plot

This function drops the unnecessary characters from the movie plots. We will learn about regular expression later in this course. Use this function as a black box. 


In [None]:
# function for text cleaning
def clean_text(text):
    """Normalise a plot string into lowercase, space-separated letter runs.

    Apostrophes are deleted outright (so "don't" becomes "dont"), every other
    character that is not an ASCII letter is turned into a space, runs of
    whitespace collapse to a single space, and the result is lowercased.
    """
    # "\'" in a Python string literal is just "'", so this strips apostrophes
    no_apostrophes = re.sub("\'", "", text)
    # anything that is not a-z / A-Z becomes a space
    letters_only = re.sub("[^a-zA-Z]"," ",no_apostrophes)
    # split/join collapses whitespace runs and trims both ends
    collapsed = ' '.join(letters_only.split())
    return collapsed.lower()

Now, apply the `clean_text` function to the dataframe to clean the plots.

In [None]:

# TODO(student): build the cleaned column by applying clean_text to every plot.
# NOTE(review): movies_new is not assigned in any cell above this one; presumably
# it is the filtered frame that tasks T1/T2 are meant to produce — confirm.
movies_new['clean_plot'] = <write your code>

# Check the clean plots now.

In [None]:
# preview the frame with the new 'clean_plot' column
movies_new.head()

# Plot a frequency distribution of words in all the plots and identify the most frequent words. 

In [None]:

def freq_words(x, terms = 30):
    """Draw a horizontal bar chart of the most frequent words.

    x     -- iterable of strings (for example a column of cleaned plots);
             words are whatever str.split() yields after joining with spaces
    terms -- how many of the most frequent words to show
    """
    tokens = ' '.join(x).split()
    counts = nltk.FreqDist(tokens)
    words_df = pd.DataFrame({'word': list(counts.keys()),
                             'count': list(counts.values())})

    # keep the `terms` most frequent words
    top_words = words_df.nlargest(columns="count", n = terms)

    # visualize words and frequencies
    plt.figure(figsize=(12,15))
    ax = sns.barplot(data=top_words, x= "count", y = "word")
    ax.set(ylabel = 'Word')
    plt.show()

# plot the 100 most frequent words
freq_words(movies_new['clean_plot'], 100)

All the top words are the stopwords, which won't help in predicting the movie tags. So we need to drop them. A python package named `nltk` has a stop words remover. We will use that to drop all the stopwords from the plots.

# Remove stop words

Most of the frequent words are stop words. We will download the list of stop words from `nltk` library and remove them from plots. 

In [None]:
# fetch the stopword corpus
nltk.download('stopwords')

from nltk.corpus import stopwords
# a set gives constant-time membership checks in remove_stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return text with every whitespace-separated token found in stop_words removed."""
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# overwrite the cleaned plots with their stopword-free versions
movies_new['clean_plot'] = movies_new['clean_plot'].apply(remove_stopwords)

# Inspect the plots after removing the stopwords.

In [None]:
# preview 'clean_plot' now that stopwords have been removed
movies_new.head()

# Encoding target variables

We cannot use the text tags as targets directly in the model. We need to convert the targets to multiple binary features. As we now have only 50 tags/genres, the number of target variables is 50. Each movie gets a 50-length output vector in which every value is zero except at the positions of the genres that movie belongs to (a movie can have more than one).

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()

# learn the genre vocabulary and encode the target variable in one call:
# one row per movie, one 0/1 column per genre
y = multilabel_binarizer.fit_transform(movies_new['chosen_genre'])

In [None]:
# (number of movies, number of genre columns)
y.shape

# Convert text to feature vector

We can't train the model directly on the text. We need to convert it to a numeric feature vector. To do this, we will use sklearn's `TfidfVectorizer`, which converts text data to numeric vectors.

In [None]:
# max_df=0.8 ignores terms that occur in more than 80% of the documents;
# max_features=100 caps the vocabulary at the 100 most frequent remaining terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=100)


# T4. Create train (80%) and test (20%) split

In [None]:
# TODO(student): split the cleaned plots and the label matrix y into 80% train / 20% validation
# (train_test_split is already imported in the first cell).
xtrain, xval, ytrain, yval = <write your code> 

# create TF-IDF features
# the vectorizer is fitted on the training split only; the validation split is
# transformed with that already-fitted vocabulary
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# T5. Multi-label Model Training

As we have multiple outputs (i.e., genres) for each movie, we will be using `MultiOutputClassifier` as it can learn multiple targets simultaneously. Internally it learns a model (aka base model) for every target. Let's use a decision tree classifier as a base model.

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier

# TODO(student): wrap a DecisionTreeClassifier in MultiOutputClassifier and fit it on the training features and labels
clf = <write your code>
# TODO(student): predict the label matrix for the validation features
y_pred = <write your code>

# T6. Measure Accuracy

In [None]:
# TODO(student): compare the validation labels with y_pred
acc = <write your code>
# ':.2' with no type code means 2 significant digits, not 2 decimal places
print(f"Acc: {acc:.2}")

# T7. Qualitative evaluation: randomly pick 10 plots, show their text, true genres, and predicted genres.

In [None]:
def infer_tags(q):
    # Predict genre tags for one raw plot string q.
    # Apply the same preprocessing the training plots went through.
    q = clean_text(q)
    q = remove_stopwords(q)
    # TODO(student): turn the single cleaned text into TF-IDF features with the fitted tfidf_vectorizer
    q_vec = <write your code>
    # TODO(student): predict the label row with the trained classifier
    q_pred = <write your code>
    # map the predicted label row(s) back to genre names
    return multilabel_binarizer.inverse_transform(q_pred)



# TODO(student): each pass should pick one validation plot at random and show its text,
# its true genres and infer_tags' prediction
for i in range(10): 
    <write your code>

# Save your notebook, then `File > Close and Halt`