# Data Preparation

### Import the necessary libraries.

In [1]:
from __future__ import print_function
import pandas as pd
from scipy import misc
from keras.preprocessing.image import img_to_array

Using TensorFlow backend.


### Configure settings.

In [2]:
# Size handed to the poster resize step, and the matching sub-folder code of the
# source images under ./posters/ (presumably (height, width) -- TODO confirm).
image_size = (138, 92)
image_size_code = 'w92'

# Requested sample sizes; not referenced by any cell visible in this notebook.
train_observations, test_observations = 5000, 1000

In [3]:
# Read in the input data sets.
# The label file holds, per row, a movie id, its genre ids and genre names
# (both stored as list-like strings) and the overview text -- see the preview below.
movie_dataset = pd.read_csv('y_labels_multilabel_overview.csv')

In [4]:
# Preview the first rows to check the column names and the list-like genre strings.
movie_dataset.head()

Unnamed: 0.1,Unnamed: 0,movie_id,genre_id,genre_name,Overview
0,0,297761,"[28, 53]","[u'Action', u'Thriller']","From DC Comics comes the Suicide Squad, an ant..."
1,1,209112,[28],[u'Action'],Fearing the actions of a god-like Super Hero l...
2,2,271110,[28],[u'Action'],"Following the events of Age of Ultron, the col..."
3,3,329865,"[18, 28, 53]","[u'Drama', u'Action', u'Thriller']",Taking place after alien crafts land around th...
4,4,284052,[28],[u'Action'],"After his career is destroyed, a brilliant but..."


In [5]:
def _parse_genre_ids(raw):
    # Turns the bracketed id string from the label file (e.g. "[28, 53]") into
    # a list of id strings without surrounding whitespace or empty entries.
    tokens = raw.replace('[', '').replace(']', '').split(',')
    return [token.strip() for token in tokens if token.strip()]


def load_poster_data_multilabel(dataset, image_size, source_size = 'w92', verbose = False):
    """Load the poster image, genres, overview and movie id for each movie.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Label table; the columns 'movie_id', 'genre_id', 'genre_name' and
        'Overview' are read from every row.
    image_size : tuple
        Target size passed straight to ``misc.imresize``.
    source_size : str
        Name of the sub-folder of ``./posters/`` holding the source images.
    verbose : bool
        When true, print a message for every poster file that cannot be read.

    Returns
    -------
    pandas.DataFrame
        One row per successfully loaded three-channel poster, with the columns
        'genre_id', 'genre_name', 'movie_id', 'overview' and 'poster'. The
        columns are present even when no poster could be loaded.
    """
    image_path = './posters/' + source_size + '/'
    columns = ['genre_id', 'genre_name', 'movie_id', 'overview', 'poster']

    # Collect plain records and build the frame once at the end; appending to a
    # DataFrame inside the loop copies the whole frame on every iteration.
    records = []
    for _, movie in dataset.iterrows():
        movie_id = movie['movie_id']
        try:
            # NOTE(review): misc.imread / misc.imresize were deprecated in
            # SciPy 1.0 and are missing from later releases; this cell needs an
            # environment that still provides them.
            image = misc.imread(image_path + str(movie_id) + '.jpg')
            image_resize = img_to_array(misc.imresize(image, image_size))
        except IOError:
            # Missing or unreadable poster file: skip the movie (best effort).
            if verbose:
                print('Unable to load poster for movie #', movie_id)
            continue

        # Keep only three-channel images so every stored poster has the same depth.
        if image_resize.shape[2] != 3:
            continue

        records.append({'movie_id' : movie_id,
                        'genre_id' : _parse_genre_ids(movie['genre_id']),
                        'genre_name' : movie['genre_name'],
                        'overview' : movie['Overview'],
                        'poster' : image_resize})

    posters = pd.DataFrame(records, columns = columns)
    print('Loaded', posters.shape[0], 'posters.')
    return posters

In [6]:
# Build the poster table from the label file using the configured size settings.
poster_data = load_poster_data_multilabel(
    movie_dataset,
    image_size,
    source_size = image_size_code,
    verbose = False)

Loaded 6824 posters.


In [7]:
def convert_str(s):
    """Convert a stored genre-name string into a list of genre names.

    The label file stores the names as the text of a Python list, e.g.
    "[u'Action', u'Thriller']". The text is parsed as a literal, so a name is
    never altered by character stripping (a name ending in 'u' keeps its 'u')
    and an empty list "[]" gives [] instead of [''].

    Parameters
    ----------
    s : str
        List-like string of genre names.

    Returns
    -------
    list
        The genre names, without empty entries.
    """
    import ast  # local import keeps this cell self-contained

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # Not a valid literal (e.g. unquoted names): fall back to splitting on
        # commas and removing brackets, a leading u-prefix and quotes per token.
        parsed = []
        for token in s.replace('[', '').replace(']', '').split(','):
            token = token.strip()
            if token[:2] in ("u'", 'u"'):
                token = token[1:]
            parsed.append(token.strip('\'"'))

    # A single scalar literal is treated as a one-element list.
    if not isinstance(parsed, (list, tuple)):
        parsed = [parsed]
    return [name for name in parsed if name != '']
# Parse each stored genre-name string into a list of genre names.
poster_data['genre_name_list'] = poster_data['genre_name'].apply(convert_str)

In [8]:
# Apply multilabel to the responses: a poster can carry several genres, so the
# genre lists are encoded as a binary indicator matrix with one column per genre.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(poster_data['genre_name_list'])
# The columns of genre_matrix follow the order of mlb.classes_; keep the names
# so the indicator columns can be mapped back to genres later.
classes = list(mlb.classes_)

### Split the data into stratified training and test datasets.

In [18]:
# Hold out 25% of the posters for testing. Stratifying on the stringified genre
# list keeps every genre combination in similar proportion in both splits.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # scikit-learn releases before 0.18 only ship the legacy module.
    from sklearn.cross_validation import train_test_split

train_x, test_x, train_y, test_y = train_test_split(poster_data, genre_matrix,
                                                    test_size = 0.25,
                                                    # Fixed seed so a fresh run reproduces the same split.
                                                    random_state = 42,
                                                    stratify = poster_data['genre_name_list'].astype(str))

In [19]:
# Sanity check: features and labels must have the same row count within each split.
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape, sep='\n')

(5120, 6)
(5120, 8)
(1704, 6)
(1704, 8)


In [20]:
# Bundle the full table, the label matrix with its class names, and both splits
# into a single object so they can be saved and reloaded together.
dataset = dict(
    full_data=poster_data,
    genre_matrix=genre_matrix,
    labels=classes,
    train_x=train_x,
    train_y=train_y,
    test_x=test_x,
    test_y=test_y)

### Save the dataset for future use.

In [21]:
import pickle
# Persist the whole bundle in one binary file; the `with` block closes the handle.
# HIGHEST_PROTOCOL selects the newest pickle format of the running interpreter,
# so the file may not load on older Python versions.
# NOTE: only unpickle this file from a trusted location -- loading a pickle can
# execute arbitrary code.
with open('full_dataset.pickle', 'wb') as handle:
    pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)