In [1]:
# Analysis of 5000+ films registered on IMDB, for data mining

# dataset downloaded from https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset
# 5043 films with 28 variables each:
# "movie_title" "color" "num_critic_for_reviews" "movie_facebook_likes" "duration" "director_name" "director_facebook_likes" 
# "actor_3_name" "actor_3_facebook_likes" "actor_2_name" "actor_2_facebook_likes" "actor_1_name" "actor_1_facebook_likes" 
# "gross" "genres" "num_voted_users" "cast_total_facebook_likes" "facenumber_in_poster" "plot_keywords" "movie_imdb_link" 
# "num_user_for_reviews" "language" "country" "content_rating" "budget" "title_year" "imdb_score" "aspect_ratio"

from pyspark.sql import SQLContext
from pyspark.sql.types import *
from collections import Counter
import matplotlib.pyplot as plt

# Read the CSV as an RDD with one element per text line, then count the lines.
# `sc` is the SparkContext that the notebook kernel is expected to provide.
MOVIE_CSV_PATH = './movie_metadata.csv'
raw_movie_data = sc.textFile(MOVIE_CSV_PATH)
raw_movie_data.count()

5044

In [2]:
# Peek at the first three raw lines; in the recorded output the first one is
# the CSV header row and the other two are film records.
raw_movie_data.take(3)

[u'color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes',
 u'Color,James Cameron,723,178,0,855,Joel David Moore,1000,760505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar\xa0,886204,4834,Wes Studi,0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1,3054,English,USA,PG-13,237000000,2009,936,7.9,1.78,33000',
 u"Color,Gore Verbinski,302,169,563,1000,Orlando Bloom,40000,309404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End\xa0,471220,48350,Jack Davenport,0,goddess|marriage ceremony|marriage proposal|pirate|singapore,http://www.imdb.com/title/tt044908

In [3]:
# Drop the CSV header line so that only film records remain.
#
# The header is recognised by its leading column names rather than purely by
# position, so running this cell a second time does not mistake the first film
# record for the header and silently drop it.
HEADER_PREFIX = 'color,director_name,'
first_line = raw_movie_data.first()
if first_line.startswith(HEADER_PREFIX):
    header = first_line
    # Bind the header text as a default argument: the predicate then keeps
    # comparing against this exact value even if the global `header` is
    # rebound later.
    raw_movie_data = raw_movie_data.filter(lambda line, header=header: line != header)
raw_movie_data.take(2)

[u'Color,James Cameron,723,178,0,855,Joel David Moore,1000,760505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar\xa0,886204,4834,Wes Studi,0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1,3054,English,USA,PG-13,237000000,2009,936,7.9,1.78,33000',
 u"Color,Gore Verbinski,302,169,563,1000,Orlando Bloom,40000,309404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End\xa0,471220,48350,Jack Davenport,0,goddess|marriage ceremony|marriage proposal|pirate|singapore,http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1,1238,English,USA,PG-13,300000000,2007,5000,7.1,2.35,0"]

In [4]:
def split_fields(line):
    """Split one raw CSV line into its list of field strings.

    NOTE: a plain split(",") does not honour CSV quoting, so a field that
    itself contains a comma shifts every index that follows it.
    """
    return line.split(",")


movie_data = raw_movie_data.map(split_fields)
# Index 9 holds the pipe-separated genre list of a film.
genre_data = movie_data.map(lambda fields: fields[9].split('|'))
genre_data.take(5)

[[u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'],
 [u'Action', u'Adventure', u'Fantasy'],
 [u'Action', u'Adventure', u'Thriller'],
 [u'Action', u'Thriller'],
 [u'Documentary']]

In [5]:
def merge_counts(left, right):
    """Combine two Counters by adding their per-key counts."""
    return left + right


# One Counter per film (built from that film's genre list), folded into a
# single Counter of genre occurrences over all films.
genre_counts_per_film = genre_data.map(Counter)
counter_genres_json = genre_counts_per_film.reduce(merge_counts)
print(counter_genres_json)

Counter({u'Drama': 2594, u'Comedy': 1872, u'Thriller': 1411, u'Action': 1153, u'Romance': 1107, u'Adventure': 923, u'Crime': 889, u'Sci-Fi': 616, u'Fantasy': 610, u'Horror': 565, u'Family': 546, u'Mystery': 500, u'Biography': 293, u'Animation': 242, u'Music': 214, u'War': 213, u'History': 207, u'Sport': 182, u'Musical': 132, u'Documentary': 121, u'Western': 97, u'Film-Noir': 6, u'Short': 5, u'News': 3, u'Reality-TV': 2, u'Game-Show': 1})


In [6]:
# (genre, count) pairs ordered from most to least frequent.
c = counter_genres_json.most_common()
# Split the pairs into two parallel lists: names and their counts.
genres = [str(name) for name, _ in c]
g_amount = [count for _, count in c]

print(genres)

['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Adventure', 'Crime', 'Sci-Fi', 'Fantasy', 'Horror', 'Family', 'Mystery', 'Biography', 'Animation', 'Music', 'War', 'History', 'Sport', 'Musical', 'Documentary', 'Western', 'Film-Noir', 'Short', 'News', 'Reality-TV', 'Game-Show']


In [7]:
# Show the per-genre counts; positions line up with the entries of `genres`.
print(g_amount)

[2594, 1872, 1411, 1153, 1107, 923, 889, 616, 610, 565, 546, 500, 293, 242, 214, 213, 207, 182, 132, 121, 97, 6, 5, 3, 2, 1]


In [8]:
# Rescale every entry of g_amount to its percentage of the list total.
# The slice assignment overwrites the existing list object in place.
g_sum = sum(g_amount)
g_amount[:] = [float(count) / float(g_sum) * 100 for count in g_amount]

print(g_amount)

[17.884721456150025, 12.906784335355765, 9.72835079977937, 7.949531163816878, 7.632377275234418, 6.363761720904577, 6.129343629343629, 4.247104247104247, 4.205736348593492, 3.895477109762824, 3.7644787644787647, 3.4473248758963044, 2.0201323772752344, 1.6685052399338114, 1.4754550468836183, 1.4685603971318257, 1.42719249862107, 1.2548262548262548, 0.9100937672366244, 0.8342526199669057, 0.6687810259238831, 0.04136789851075565, 0.03447324875896304, 0.020683949255377827, 0.013789299503585218, 0.006894649751792609]


In [12]:
# Pie chart with one wedge per entry of g_amount, labelled with the matching
# entry of genres; autopct prints each wedge's share with two decimals.
fig = plt.figure(figsize=(15, 7))
fig.suptitle('% occurrence of genres in top 5000', fontsize=14, fontweight='bold')
# 121 = first slot of a 1-row, 2-column grid, so the pie uses the left half of the figure.
ax1 = fig.add_subplot(121)
# startangle is given in degrees; the previous value 900 is two full turns
# plus 180, so 180 produces the same rotation.
ax1.pie(g_amount, explode=None, labels=genres, autopct='%1.2f%%', shadow=True, startangle=180)

plt.show()

In [None]:
# Count how many films each director has in the dataset.
# Index 1 of every split row is the director_name field.
director_data = movie_data.map(lambda x: x[1])
# Each element here is a single string, and Counter(<string>) tallies its
# characters, so mapping Counter over the names would count letters instead of
# directors. countByValue() tallies whole elements and returns the result to
# the driver as a dict, which is wrapped in a Counter to keep most_common() etc.
counter_director_data = Counter(director_data.countByValue())
print(counter_director_data)

In [None]:
#csv_data = movie_data.map(lambda x: x.split(","))
#key_value_data = csv_data.map(lambda x: (x[11], x[8])) # x[11] = movie_title, x[8] = gross, x[9] = genres, x[21] = content_rating, x[22] = budget (imdb_score is x[25])
#key_value_data.take(5)

In [None]:
# TODO: filter on a given actor, e.g. lambda x: 'Christian Bale' in x

In [None]:
# Print every key of duration_means_by_type with its value, largest value first.
# NOTE(review): duration_means_by_type is not defined in the cells shown here
# (presumably a dict of mean durations) — it must be built before this cell runs.
# The loop iterates over the mapping's own keys: the rows of movie_data are
# lists, which are unhashable and so cannot be looked up via
# duration_means_by_type.get.
for tag in sorted(duration_means_by_type, key=duration_means_by_type.get, reverse=True):
    # One pre-formatted argument prints the same text whether print is the
    # Python 2 statement or the print function used by the other cells.
    print('%s %s' % (tag, duration_means_by_type[tag]))