In [1]:
import pandas as pd
import numpy as np
import io
import sys
import os
import urllib.request
from tqdm import tqdm
from os import listdir
from PIL import Image
import glob

In [2]:
# Absolute location of the raw MovieGenre dataset, resolved against the
# current working directory.
CSV_FILE = os.path.abspath('../data/MovieGenre.csv')

# Parse the file with the ISO-8859-1 codec and preview the first rows.
df = pd.read_csv(filepath_or_buffer=CSV_FILE, encoding='ISO-8859-1')
df.head(n=5)

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [3]:
# (row count, column count) of the freshly loaded table
(len(df.index), len(df.columns))

(40107, 6)

In [4]:
# True when at least one cell anywhere in the table is missing
check_nan_in_df = np.any(df.isnull().values)
print(check_nan_in_df)

True


In [5]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')
df.shape

(39245, 6)

In [6]:
#Method 1 to download images
# Each row's poster URL is fetched and stored as "<imdbId>.jpg" inside
# IMAGES_PATH. IDs whose download or save fails are collected in `not_found`.
IMAGES_PATH = os.path.join(os.path.abspath('../data/'), 'IMDB_IMAGES')
# makedirs(exist_ok=True) also creates missing parent folders and is safe to
# re-run when the directory already exists.
os.makedirs(IMAGES_PATH, exist_ok=True)

not_found = []
# iterrows() is a generator without a length, so pass `total` explicitly to
# let tqdm render a real progress bar and ETA.
for index, row in tqdm(df.iterrows(), total=len(df)):

    url = row['Poster']
    imdb_id = row['imdbId']

    file_path = os.path.join(IMAGES_PATH,  str(imdb_id) + ".jpg")

    try:
        # Context managers close the HTTP response and the output file even
        # when reading or writing raises.
        with urllib.request.urlopen(url) as response:
            data = response.read()
        with open(file_path, 'wb') as file:
            file.write(data)
    except Exception:
        # Best-effort download: any failure marks this poster as missing.
        # Catching Exception instead of using a bare `except` keeps
        # KeyboardInterrupt working during this long-running loop.
        not_found.append(imdb_id)

print("Number of posters not found = ", len(not_found))
print("Following IMDB_ID posters were not found::", not_found)

39245it [1:03:47, 10.25it/s]

Number of posters not found =  3864
Following IMDB_ID posters were not found:: [112302, 113189, 114057, 115012, 112697, 113347, 114814, 109950, 110299, 112499, 118002, 113149, 113010, 113537, 117002, 113247, 115734, 116483, 112579, 75314, 112373, 116606, 114808, 110647, 111055, 114015, 109093, 112462, 112541, 109508, 112857, 114558, 114781, 109635, 112899, 109579, 111797, 109758, 113028, 109771, 113538, 76759, 110367, 110538, 113808, 113870, 117169, 113948, 114151, 110413, 108394, 111161, 111280, 114852, 114888, 114857, 109655, 109040, 109484, 109830, 113305, 110091, 107472, 105226, 111756, 113173, 113827, 114047, 106226, 109068, 106489, 106505, 106519, 109443, 109480, 109783, 106880, 111712, 106918, 107004, 107076, 112966, 107151, 107207, 110197, 110265, 107413, 107468, 107497, 111689, 107818, 108000, 105032, 108065, 111201, 108162, 111418, 108333, 108358, 108399, 108515, 110259, 107002, 109403, 110363, 111709, 99785, 117381, 115509, 115956, 48473, 52572, 117705, 117104, 117774, 11327




In [7]:
# Keep only the rows whose imdbId is not listed among the failed downloads
df_2 = df.loc[~df['imdbId'].isin(not_found)]
df_2.shape

(35381, 6)

In [8]:
# Check which downloaded images are corrupt and record their paths.
# Nothing is deleted here: offending files are only collected in `bad_images`.
# NOTE(review): this scans "Posters/", while the download cell writes to
# IMAGES_PATH -- confirm which folder is the intended one.
bad_images = []
for file in glob.glob("Posters/*.jpg"):

    try:
        # The context manager releases the file handle after each check, so
        # scanning a large folder does not accumulate open files.
        with Image.open(file) as img:  # open image file
            img.verify()  # verify it's an image
    except (IOError, SyntaxError):
        print('Bad file:', file)

        bad_images.append(file)

print("Number of corrupt files:", len(bad_images))

Number of corrupt files: 0


In [None]:
# No corrupt files were found, so no further cleaning of missing or bad poster data is performed here.

In [9]:
# Preview the filtered table; the column names are visible in the header row.
# (A bare `df_2.columns` before the last expression is never displayed, so it
# was a no-op and has been dropped.)
df_2.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [65]:
# Narrow the table down to the identifier, title and genre columns

df_3 = df_2.loc[:, ['imdbId', 'Title', 'Genre']]
print(df_3.shape)
df_3.head(n=5)

(37495, 3)


Unnamed: 0,imdbId,Title,Genre
0,114709,Toy Story (1995),Animation|Adventure|Comedy
1,113497,Jumanji (1995),Action|Adventure|Family
2,113228,Grumpier Old Men (1995),Comedy|Romance
3,114885,Waiting to Exhale (1995),Comedy|Drama|Romance
4,113041,Father of the Bride Part II (1995),Comedy|Family|Romance


In [66]:
# Count the poster files on disk so the total can be compared with the
# number of imdbId rows kept in the table

image_list = list(glob.glob("Posters/*.jpg"))

print("Number of files found:", len(image_list))

Number of files found: 36918


In [67]:
# The number of rows and the number of poster files still disagree, so more
# cleaning is needed; that is done in the next script, "Clean_data".
# df_3 is exported as CSV to be used with the images in the Posters folder.

# index=False is the documented way to leave the row index out of the file.
df_3.to_csv("MovieGenre_cleaned.csv", index=False)