In [29]:
import pandas as pd
import numpy as np
import io
import sys
import os
import urllib.request
from tqdm import tqdm
from os import listdir
from PIL import Image
import glob

In [30]:
CSV_FILE = os.path.abspath('../data/MovieGenre.csv')

df = pd.read_csv(CSV_FILE,encoding='ISO-8859-1')
df.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [31]:
df.shape

(40107, 6)

In [32]:
# There exists a genre value for all images
Genre_list = df['Genre']
print(len(Genre_list))

40107


In [33]:
# Breaks "Genre" into the constituting individual genres
def find_genres(genre):
    
    start = 0
    set_of_genre = []
    
    for i in range(len(genre)):
        
        k=0
        substring = ""
        if (genre[i]=='|'):
            substring = genre[start:i]
            start = i+1
            k = 1
        
        if(i==len(genre)-1):
            substring = genre[start:i+1]
            k = 1
            
        if (k==1):
            set_of_genre.append(substring)         
    
    return (set_of_genre)

In [34]:
# Extract list of genre values for each image
all_genre = []
Genre_list = df['Genre']

for i in range (len(Genre_list)):
	
	if (isinstance(Genre_list[i], str)):
		set_of_genre = find_genres(Genre_list[i])
		for j in range (len(set_of_genre)):
			all_genre.append(set_of_genre[j])
		
uniq, counts = np.unique(all_genre, return_counts=True)
print("Number of unique genres:", len(uniq))
print("Unique genres are:", uniq)
dict(zip(uniq, counts))

Number of unique genres: 28
Unique genres are: ['Action' 'Adult' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'Film-Noir' 'Game-Show'
 'History' 'Horror' 'Music' 'Musical' 'Mystery' 'News' 'Reality-TV'
 'Romance' 'Sci-Fi' 'Short' 'Sport' 'Talk-Show' 'Thriller' 'War' 'Western']


{'Action': 5330,
 'Adult': 14,
 'Adventure': 3829,
 'Animation': 1750,
 'Biography': 1999,
 'Comedy': 12682,
 'Crime': 5270,
 'Documentary': 3882,
 'Drama': 20052,
 'Family': 2100,
 'Fantasy': 2012,
 'Film-Noir': 403,
 'Game-Show': 1,
 'History': 1426,
 'Horror': 3990,
 'Music': 1292,
 'Musical': 845,
 'Mystery': 2394,
 'News': 83,
 'Reality-TV': 2,
 'Romance': 6224,
 'Sci-Fi': 2020,
 'Short': 1066,
 'Sport': 707,
 'Talk-Show': 7,
 'Thriller': 4816,
 'War': 1173,
 'Western': 853}

In [35]:
# Prepare multi-hot-encoded-labels for the various genres
def multi_hot_encoded_labels(img_id, genre):
    
    col_names =  ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
                  'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']
    
    set_of_genre = find_genres(genre)
    
    row=[]
    row.append(img_id)
    
    for i in range(len(col_names)):
        
        found = 0
        for j in range (len(set_of_genre)):
            if (set_of_genre[j]==col_names[i]):
                found = 1
                break
        
        row.append(found)
    
    row.append(genre) #add the overall combined genre for record purposes
        
    return row  

In [36]:
# Perform the encoding of the labels and save data in the format :
# Img-ID <multi-hot-encoded-labels> overall_genre
CSV_FINAL = os.path.abspath('../data/MovieGenre_final.csv')

df = pd.read_csv(CSV_FINAL, encoding="ISO-8859-1")
all_data = []

for index, row in tqdm(df.iterrows()):
    
    path = row['Image_Paths']
    genre = row['Genre']
    row = multi_hot_encoded_labels(path, genre)
    
    all_data.append(row)

col_names =  ['Img-paths', 'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
                  'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Genre']

np.savetxt("Multi_hot_encoded_data.csv", np.asarray(all_data), fmt='%s', delimiter=" ")   
np.savetxt("Encoded_data_column_lookup.csv", np.asarray(col_names), fmt='%s', delimiter=" ")

0it [00:00, ?it/s]


In [37]:
df_encoded = pd.read_csv("Multi_hot_encoded_data.csv", delimiter=" ", 
                  names =  ['Img-paths', 'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
                  'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Genre'])

df_encoded.head()

Unnamed: 0,Img-paths,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Genre


In [38]:
# To split into train / validation / test in the ratio 80 / 15 / 5%
MHE_DATA_CSV = os.path.abspath('./Multi_hot_encoded_data.csv')

df = pd.read_csv(MHE_DATA_CSV, delimiter=" ")
random_seed = 50
train_df = df.sample(frac=0.70, random_state=random_seed) #Taking 70% of the data
tmp_df = df.drop(train_df.index)
test_df = tmp_df.sample(frac=0.1, random_state=random_seed) #Taking 20% of the remaining (after train is taken)
valid_df = tmp_df.drop(test_df.index)

print("Train_df=",len(train_df))
print("Val_df=",len(valid_df))
print("Test_df=",len(test_df))

np.savetxt("Train.csv", train_df, fmt='%s', delimiter=" ")
np.savetxt("Test.csv", test_df, fmt='%s', delimiter=" ")
np.savetxt("Valid.csv", valid_df, fmt='%s', delimiter=" ")

#Numpy method
#train, validate, test = np.split(df_encoded.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
#np.split will split at 60% of the length of the shuffled array, 
#then 80% of length (which is an additional 20% of data), thus leaving a remaining 20% of the data.

EmptyDataError: No columns to parse from file