In [1]:
import json
from itertools import groupby
import random
import pandas as pd

import os
import re

import urllib.request

import numpy as np

from keras.preprocessing.image import img_to_array, load_img
from IPython.display import display
from PIL import Image

Using TensorFlow backend.


# Files and where to find them

### Pre-processed data for CNN (y are images)
* https://s3.amazonaws.com/cs109b-data/test_X_array.npy

* https://s3.amazonaws.com/cs109b-data/test_y_array.npy

* https://s3.amazonaws.com/cs109b-data/Y_array.npy

* https://s3.amazonaws.com/cs109b-data/X_array.npy

### JSON format of the data, cleaned
* https://s3.amazonaws.com/cs109b-data/testing_4-29.json

* https://s3.amazonaws.com/cs109b-data/training_4-29.json

### Whole data set, used to create both test and train
* https://s3.amazonaws.com/cs109b-data/themoviedb-4-18-2017.json

### Original, uncleaned train set
* https://s3.amazonaws.com/cs109b-data/themoviedb-sample-4-17-2017.json

In [2]:
def merge_two_dicts(x, y):
    """Return a new dict holding the entries of ``x`` overlaid with those of ``y``.

    Neither argument is modified. The copy is shallow, and when a key occurs
    in both inputs the value from ``y`` is the one kept.
    """
    merged = x.copy()
    merged.update(y)
    return merged

# Load data

In [3]:
# Every record of the full TMDb dump; the test candidates are drawn from this list.
all_data = []
with open('../../data/raw/themoviedb-4-18-2017.json') as dump_file:
    all_data += json.load(dump_file)

In [5]:
# Records of the original training sample.
training_data = []
with open('../../data/raw/themoviedb-sample-4-17-2017.json') as sample_file:
    training_data += json.load(sample_file)

# Filter out all movies in the training set

In [6]:
# Ids of all movies in the training sample, as a set for fast membership tests.
training_movie_ids = set(record['id'] for record in training_data)

In [7]:
## Candidate pool for the test set: every movie whose id is not in the training sample
total_test_data = [
    candidate
    for candidate in all_data
    if candidate['id'] not in training_movie_ids
]

In [8]:
# Number of candidate movies left after removing the training ids.
len(total_test_data)

90110

In [9]:
# Keep only movies that carry at least one genre. An empty `genres` list is
# falsy, so truthiness is enough; comparing a length to 0 with `is not` tests
# object identity rather than value and raises a SyntaxWarning on Python 3.8+.
total_test_data = [movie for movie in total_test_data if movie['genres']]

In [10]:
# Count again after dropping movies without genres (unchanged in the recorded output).
len(total_test_data)

90110

# Pre-process train set

In [None]:
# Build the (title, genre, poster_url) table for the training sample.
# NOTE(review): the load cell above reads this file from ../../data/raw/ —
# confirm which location is current.
df = pd.read_json('../../data/themoviedb-sample-4-17-2017.json')
# Movies without a poster cannot be used as image inputs. `.copy()` detaches
# the filtered frame so adding a column below does not trigger
# SettingWithCopyWarning.
df = df[df['poster_path'].notnull()].copy()
# Full URL of the 92-pixel-wide poster; poster_path already starts with '/'.
df['poster_url'] = 'http://image.tmdb.org/t/p/w92' + df['poster_path'].astype(str)
df = df[['title', 'genre', 'poster_url']]
print(df.shape)
df.head()

#### Downloading all the posters
Here, we download them at the smallest size (width = 92 pixels). 

In [None]:
# Download each distinct poster once, skipping files that are already on disk.
# The target is the directory the flattening cell reads from (../../data/img).
img_dir = '../../data/img'
os.makedirs(img_dir, exist_ok=True)
# List the directory once up front instead of once per URL.
already_downloaded = set(os.listdir(img_dir))
for url in df['poster_url'].unique():
    # The file name is everything after the 'w92/' size segment of the URL.
    file_name = re.findall(r'(?<=w92\/)(.*)', url)[0]
    if file_name not in already_downloaded:
        # On Python 3 urlretrieve lives in urllib.request; urllib.urlretrieve
        # raises AttributeError.
        urllib.request.urlretrieve(url, '{}/{}'.format(img_dir, file_name))

#### Flattening into numpy arrays
Resizing images to `(138, 92, 3)`. I expect this will lose us some precision later.

In [None]:
# Per-image pixel arrays and, in the same order, the genres attached to each poster.
images = []
y = []

for image_file in os.listdir('../../data/img'):
    # Let the loader resample the poster to 138x92 (height, width). np.resize
    # does not scale an image: it repeats or truncates the flattened pixel
    # buffer, which scrambles any poster that is not already exactly 138x92.
    img = load_img('../../data/img/{}'.format(image_file), target_size=(138, 92))
    images.append(img_to_array(img))
    # Every genre row that points at this poster (presumably one row per
    # movie/genre pair — verify against the source JSON).
    poster_url = 'http://image.tmdb.org/t/p/w92/{}'.format(image_file)
    y.append(np.array(list(df.loc[df['poster_url'] == poster_url, 'genre'].values)))

## $y$: Multi-label

In [None]:
# Stack the per-poster arrays into one array and rescale by 1/255.
X = np.array(images) / 255.

# Cache the training inputs on disk for later use.
np.save('../../data/X_array.npy', X)

input_shape = X.shape
print(input_shape)

In [None]:
# One row per entry of `y` and one 0/1 indicator column per genre. The row
# count comes from `y` itself so the frame always matches the image array.
all_genres = pd.DataFrame(index=range(len(y)))

# Add the genre columns in order of first appearance.
for row in y:
    for genre in row:
        if genre not in all_genres:
            all_genres[genre] = 0

for index, row in enumerate(y):
    for genre in row:
        # A single .loc call writes into the frame itself; the chained form
        # `all_genres[genre].loc[index] = 1` may assign to a temporary copy.
        all_genres.loc[index, genre] = 1

print(all_genres.shape)
all_genres.head()

In [None]:
# Save Y_array: the genre indicator frame `all_genres` converted to a plain array.
Y = np.array(all_genres)
np.save('../../data/Y_array.npy', Y)

# Sample data for test set
Follow the same pre-processing steps used for the training set.

In [11]:
# Draw the 2000 test movies with a dedicated, seeded generator so that
# re-running the notebook selects the same sample; the unseeded global
# generator picks a different one on every kernel restart. The seed value
# itself is arbitrary.
sample_test_data = random.Random(109).sample(total_test_data, 2000)

In [44]:
# Build the (title, genre, poster_url) table for the sampled test movies.
df = pd.DataFrame(sample_test_data)
# Movies without a poster cannot be used as image inputs. `.copy()` detaches
# the filtered frame so adding a column below does not trigger
# SettingWithCopyWarning.
df = df[df['poster_path'].notnull()].copy()
# Full URL of the 92-pixel-wide poster; poster_path already starts with '/'.
df['poster_url'] = 'http://image.tmdb.org/t/p/w92' + df['poster_path'].astype(str)
df = df[['title', 'genre', 'poster_url']]
print(df.shape)
df.head()

(2000, 3)


Unnamed: 0,title,genre,poster_url
0,The Silent Force,Thriller,http://image.tmdb.org/t/p/w92/52lwYVVeq52AVqR1...
1,Chicken Little,Family,http://image.tmdb.org/t/p/w92/iLMALbInUmbNn1tH...
2,Shake Hands With the Devil,History,http://image.tmdb.org/t/p/w92/o2NBY3deN6NIBCIb...
3,Closer,Romance,http://image.tmdb.org/t/p/w92/jZINusxU1VUKae1s...
4,True Wolf,Documentary,http://image.tmdb.org/t/p/w92/xyAP3DcXG1Lm2a9K...


In [13]:
# Download images: fetch each distinct test poster once, skipping files already on disk.
test_img_dir = '../../data/test_img_92'
# Create the target directory on a fresh checkout instead of failing in listdir.
os.makedirs(test_img_dir, exist_ok=True)
# List the directory once up front instead of once per URL.
already_downloaded = set(os.listdir(test_img_dir))
for url in df['poster_url'].unique():
    # The file name is everything after the 'w92/' size segment of the URL.
    file_name = re.findall(r'(?<=w92\/)(.*)', url)[0]
    if file_name not in already_downloaded:
        urllib.request.urlretrieve(url, '{}/{}'.format(test_img_dir, file_name))

In [45]:
# Per-image pixel arrays and, in the same order, the genres attached to each poster.
test_images = []
test_Y = []

for image_file in os.listdir('../../data/test_img_92'):
    # Let the loader resample the poster to 138x92 (height, width). np.resize
    # does not scale an image: it repeats or truncates the flattened pixel
    # buffer, which scrambles any poster that is not already exactly 138x92.
    img = load_img('../../data/test_img_92/{}'.format(image_file), target_size=(138, 92))
    test_images.append(img_to_array(img))
    # Every genre row of `df` that points at this poster.
    poster_url = 'http://image.tmdb.org/t/p/w92/{}'.format(image_file)
    genres = list(df.loc[df['poster_url'] == poster_url, 'genre'].values)
    test_Y.append(np.array(genres))

In [46]:
# Stack the per-poster arrays into one array and rescale by 1/255.
test_X = np.array(test_images) / 255.

# Cache the test inputs on disk for later use.
np.save('../../data/test_X_array.npy', test_X)

input_shape = test_X.shape
print(input_shape)

(1957, 138, 92, 3)


In [47]:
# Peek at the first three per-image genre arrays.
test_Y[:3]

[array(['Comedy'], 
       dtype='<U6'), array(['Foreign'], 
       dtype='<U7'), array(['Action'], 
       dtype='<U6')]

# Convert to multilabel

In [None]:
# One row per entry of `test_Y` and one 0/1 indicator column per genre. The
# row count comes from `test_Y` itself so the frame always matches the image array.
all_genres = pd.DataFrame(index=range(len(test_Y)))

# Add the genre columns in order of first appearance.
for row in test_Y:
    for genre in row:
        if genre not in all_genres:
            all_genres[genre] = 0

for index, row in enumerate(test_Y):
    for genre in row:
        # A single .loc call writes into the frame itself; the chained form
        # `all_genres[genre].loc[index] = 1` may assign to a temporary copy.
        all_genres.loc[index, genre] = 1

print(all_genres.shape)
all_genres.head()

In [None]:
# NOTE(review): renders the whole indicator frame; the previous cell already
# shows `all_genres.head()`.
all_genres

In [49]:
# Rebind test_Y from the list of per-image genre arrays to the indicator frame as an array.
test_Y = np.array(all_genres)

In [50]:
# Save test_y: write the test label matrix to disk.
np.save('../../data/test_y_array.npy', test_Y)

In [51]:
# Frame of the raw training records, one column per key found in the records.
train_df = pd.DataFrame(training_data)

In [52]:
# Frame of the sampled test records, one column per key found in the records.
test_df = pd.DataFrame(sample_test_data)

In [53]:
# Columns present in both frames, kept in train_df's column order. A set
# intersection has no stable order (string hashing is randomized per process),
# so the column order could change from one kernel restart to the next.
matching_columns = [column for column in train_df.columns if column in test_df.columns]

train_df = train_df[matching_columns]

In [54]:
# Restrict the test frame to the columns it shares with the training frame.
test_df = test_df[list(matching_columns)]

In [55]:
# Row count of the training frame.
len(train_df)

9549

In [56]:
# Row count of the test frame.
len(test_df)

2000

# Save training and test set as json

In [57]:
# Write both frames as JSON arrays with one object per row (orient='records').
train_df.to_json('../../data/processed/training_4-29.json', orient='records')
test_df.to_json('../../data/processed/testing_4-29.json', orient='records')

In [58]:
# Number of label rows; compare with len(test_X) in the next cell.
len(test_Y)

1957

In [59]:
# Number of image rows; compare with len(test_Y) in the previous cell.
len(test_X)

1957

# Realized a mixup

The order of the columns was important to the `test_x_array`; the order of the labels is swapped. The cells below recalculate the arrays.

In [61]:
# NOTE(review): a bare frame as the last expression displays the whole frame;
# `train_df.head()` would keep the output short.
train_df

Unnamed: 0,release_date,tagline,vote_count,revenue,imdb_id,popularity,runtime,crew_count,original_language,backdrop_path,...,budget,title,overview,id,poster_path,original_title,vote_average,genre_ids,video,reviews
0,2015-01-02,,0,0,tt3485752,0.010669,93.0,2,en,,...,0,Bought,Modern industrialization is no longer about st...,328380,/5q3CQLHMbvsjxK4Vs5xia2pLXjo.jpg,Bought,0.0,[99],False,[]
1,2009-09-20,,2,0,tt1798146,0.000782,60.0,2,en,,...,0,Egypt Underworld,The documentary is an analysis of the fascinat...,112052,/vHZf7KA7RGMxWl2rVVl7qtu81rl.jpg,Egypt Underworld,3.8,[99],False,[]
2,2001-07-12,,0,0,,0.002123,1.0,1,en,/jWgtb7CRy3jWoXiz0IK352MG8b5.jpg,...,0,Chinese Heroes,A good-natured kung fu kid gets caught up in c...,201706,/74F1NxjpQFoXoJzGCZjng4T0jgV.jpg,Chinese Heroes,0.0,[28],False,[]
3,2002-12-24,he quake of the century... get ready to rumble...,2,0,tt0300470,0.097457,92.0,2,en,,...,0,Shakedown,In Los Angeles a deadly plague called the 'Pan...,61803,/eILw7xfKn99cPU6ngBVtW8XjQcK.jpg,Shakedown,3.0,"[28, 18, 53]",False,[]
4,2002-12-24,he quake of the century... get ready to rumble...,2,0,tt0300470,0.097457,92.0,2,en,,...,0,Shakedown,In Los Angeles a deadly plague called the 'Pan...,61803,/eILw7xfKn99cPU6ngBVtW8XjQcK.jpg,Shakedown,3.0,"[28, 18, 53]",False,[]


In [63]:
# Column order of the indicator frame — presumably the label order used when
# test_y_array.npy was first written; TODO confirm.
all_genres.columns

Index(['Comedy', 'Foreign', 'Action', 'Drama', 'Documentary', 'Thriller',
       'Horror', 'Romance', 'Crime', 'Fantasy', 'Science Fiction', 'TV Movie',
       'Family', 'Western', 'War', 'Music', 'Animation', 'Mystery',
       'Adventure', 'History'],
      dtype='object')

In [72]:
# Re-read the processed test set as a list of plain records.
test = []
with open('../../data/processed/testing_4-29.json') as records_file:
    test += json.load(records_file)

In [74]:
# Give every test record a `<genre name>: True` entry for each of its genres.
for movie in test:
    movie.update({genre['name']: True for genre in movie['genres']})

In [78]:
# Build the test frame; a record that lacks a genre key gets NaN in that
# column, which fillna turns into False.
# NOTE(review): fillna(False) is applied to every column, so missing values in
# non-genre columns are replaced by False as well — confirm this is intended.
test_df = pd.DataFrame(test).fillna(False)

In [79]:
# Re-read the processed training set as a list of plain records.
train = []
with open('../../data/processed/training_4-29.json') as records_file:
    train += json.load(records_file)

In [80]:
# Give every training record a `<genre name>: True` entry for each of its genres.
for movie in train:
    movie.update({genre['name']: True for genre in movie['genres']})

In [81]:
# Build the training frame; a record that lacks a genre key gets NaN in that
# column, which fillna turns into False.
# NOTE(review): fillna(False) is applied to every column, so missing values in
# non-genre columns are replaced by False as well — confirm this is intended.
train_df = pd.DataFrame(train).fillna(False)

In [82]:
# Overwrite the processed JSON files with the frames that now carry the
# per-genre boolean columns.
train_df.to_json('../../data/processed/training_4-29.json', orient='records')
test_df.to_json('../../data/processed/testing_4-29.json', orient='records')

In [83]:
# Spot-check the last three rows, including the new per-genre columns.
train_df.tail(3)

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,...,release_date,revenue,reviews,runtime,status,tagline,title,video,vote_average,vote_count
9546,False,False,False,True,False,False,False,False,False,False,...,2010-01-28,0,[],96.0,Released,,Baby Mama's Club,False,0.0,0
9547,True,False,False,False,False,False,False,False,False,False,...,2011-01-01,0,[],6.0,Released,,Initiation,False,0.0,0
9548,True,False,False,False,False,False,False,False,False,False,...,2011-01-01,0,[],6.0,Released,,Initiation,False,0.0,0


In [37]:
# Genre columns, in the order in which they are selected for the saved test
# label array.
columns = [
    "Drama", "Horror", "Documentary", "Romance", "Adventure",
    "Comedy", "Family", "Fantasy", "Science Fiction", "Foreign",
    "Action", "Crime", "Music", "Thriller", "TV Movie",
    "History", "Mystery", "Animation", "War", "Western",
]

In [17]:
# Reload the processed test set from disk and list its columns.
test_df = pd.read_json('../../data/processed/testing_4-29.json')
test_df.columns

Index(['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music',
       'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War',
       'Western', 'adult', 'backdrop_path', 'belongs_to_collection', 'budget',
       'cast_count', 'crew_count', 'genre', 'genre_ids', 'genres', 'homepage',
       'id', 'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'release_date', 'revenue', 'reviews',
       'runtime', 'status', 'tagline', 'title', 'video', 'vote_average',
       'vote_count'],
      dtype='object')

In [23]:
# Keep one row per movie id (the last occurrence). The bare `len(test_df)`
# that followed was not the cell's last expression, so it displayed nothing
# and has been removed.
test_df = test_df.drop_duplicates(['id'], keep='last')

# Movies without a poster cannot be used as image inputs. `.copy()` detaches
# the filtered frame so adding a column below does not trigger
# SettingWithCopyWarning.
test_df = test_df[test_df['poster_path'].notnull()].copy()
# Full URL of the 92-pixel-wide poster; poster_path already starts with '/'.
test_df['poster_url'] = 'http://image.tmdb.org/t/p/w92' + test_df['poster_path'].astype(str)
test_df = test_df[['title', 'genre', 'poster_url']]
print(test_df.shape)
test_df.head()


(1957, 3)


Unnamed: 0,title,genre,poster_url
0,The Silent Force,Thriller,http://image.tmdb.org/t/p/w92/52lwYVVeq52AVqR1...
1,Chicken Little,Family,http://image.tmdb.org/t/p/w92/iLMALbInUmbNn1tH...
2,Shake Hands With the Devil,History,http://image.tmdb.org/t/p/w92/o2NBY3deN6NIBCIb...
3,Closer,Romance,http://image.tmdb.org/t/p/w92/jZINusxU1VUKae1s...
4,True Wolf,Documentary,http://image.tmdb.org/t/p/w92/xyAP3DcXG1Lm2a9K...


In [24]:
test_images = []
test_Y = []

# Walk the rows of test_df rather than os.listdir(): a directory listing comes
# back in arbitrary order, while the label matrix written to test_y_array.npy
# is built in the row order of the de-duplicated test frame. Loading the
# images in that same row order keeps test_X[i] and test_Y[i] on the same movie.
for poster_url in test_df['poster_url']:
    # Same file-name rule as the download cell: everything after 'w92/'.
    image_file = re.findall(r'(?<=w92\/)(.*)', poster_url)[0]
    # Let the loader resample the poster to 138x92 (height, width). np.resize
    # does not scale an image: it repeats or truncates the flattened pixel
    # buffer, which scrambles any poster that is not already exactly 138x92.
    img = load_img('../../data/test_img_92/{}'.format(image_file), target_size=(138, 92))
    test_images.append(img_to_array(img))
    # Every genre row of test_df that points at this poster.
    genres = list(test_df.loc[test_df['poster_url'] == poster_url, 'genre'].values)
    test_Y.append(np.array(genres))

In [25]:
# Stack the per-poster arrays into one array and rescale by 1/255.
test_X = np.array(test_images) / 255.

# Cache the test inputs on disk for later use.
np.save('../../data/test_X_array.npy', test_X)

input_shape = test_X.shape
print(input_shape)

(1957, 138, 92, 3)


In [35]:
# Rebuild the test label matrix straight from the per-genre columns of the
# processed JSON, using the fixed genre order in `columns`.
test_df = pd.read_json('../../data/processed/testing_4-29.json')

# Keep one row per movie id (the last occurrence) before extracting labels.
# A stray `test_df[columns].astype(int)` whose result was discarded has been
# removed; it had no effect.
test_df = test_df.drop_duplicates(['id'], keep='last')
# Cast the genre flags to 0/1 integers; rows follow test_df's row order.
test_Y = np.array(test_df[columns].astype(int))
print(test_Y.shape)

(1957, 20)


In [36]:
# Overwrite the saved test labels with the recalculated matrix.
np.save('../../data/test_y_array.npy', test_Y)