In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')

from imdb import Cinemagoer

In [4]:
# Create the Cinemagoer client that the IMDb lookups further down call via imdb.get_movie(...).
imdb = Cinemagoer()

In [5]:
# Load the ratings, movies and links tables from the local ./data directory.
ratings = pd.read_csv("./data/ratings.csv")
movies = pd.read_csv("./data/movies.csv")
# links is indexed by movieId so it can later be joined onto the movieId-indexed movie table.
links = pd.read_csv("./data/links.csv", index_col = "movieId")

In [8]:
# Find the links row whose tmdbId equals 19995.
links.loc[links['tmdbId'] == 19995]

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
72998,499549,19995.0


In [9]:
# Look up the movie row with movieId 72998 (the ID returned by the tmdbId lookup).
movies.loc[movies['movieId'] == 72998]

Unnamed: 0,movieId,title,genres
7212,72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX


In [6]:
# Preview five randomly chosen rating rows (no random_state is passed, so the rows differ between runs).
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
50126,323,377,2.5,1422640386
34728,232,71464,4.0,1269465509
99409,608,7076,4.0,1117503270
13936,89,121097,4.5,1520409085
91357,592,419,3.0,837350664


In [10]:
# Summarise column dtypes, non-null counts and memory usage of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [17]:
# Preview five randomly chosen movie rows (no random_state is passed, so the rows differ between runs).
movies.sample(5)

Unnamed: 0,movieId,title,genres
213,249,Immortal Beloved (1994),Drama|Romance
7799,92198,Seeking Justice (2011),Action|Drama|Thriller
2612,3497,Max Dugan Returns (1983),Comedy
718,937,Love in the Afternoon (1957),Comedy|Romance
6692,58291,College Road Trip (2008),Comedy


In [18]:
# Summarise column dtypes, non-null counts and memory usage of the movies table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [39]:
# Reload links indexed by movieId (the same call as in the initial load cell),
# then preview five randomly chosen rows.
links = pd.read_csv("./data/links.csv", index_col = "movieId")
links.sample(5)

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
84637,377981,45772.0
5165,80057,7219.0
440,106673,11566.0
4412,69372,28650.0
3606,41716,31516.0


In [12]:
# Summarise column dtypes, non-null counts and memory usage of the links table.
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


Removing Duplicate Movies

In [19]:
# Flag rows whose movieId repeats an earlier row, then tally flagged vs. unflagged rows.
dups = pd.DataFrame(movies['movieId'].duplicated())
dups.value_counts()

movieId
False      9742
dtype: int64

In [24]:
# Flag rows whose title repeats an earlier row, keyed by movieId so the
# flagged IDs can be read from the index afterwards.
# Built directly from `movies` so this cell does not depend on `movieId`,
# which is only assigned in the cell that follows.
dups = movies.set_index('movieId')['title'].duplicated().to_frame()
dups.value_counts()

title
False    9737
True        5
dtype: int64

In [22]:
# Re-index the movie table by its movieId column.
# Note: despite its name, `movieId` is the whole DataFrame, not a single ID.
movieId = movies.set_index('movieId')

In [25]:
# Keep only the rows flagged as a repeated title; the index holds their movieIds.
dupId = dups.loc[dups['title']]
dupId

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
26958,True
64997,True
144606,True
147002,True
168358,True


In [27]:
# Fetch the movie rows flagged as repeated titles. The IDs are taken from
# dupId's index instead of a hand-copied list, so the selection stays in sync
# with the duplicate detection above.
dupList = movieId.loc[dupId.index]

In [28]:
# Select the full movie rows whose movieId was flagged as a repeated title.
dupplicates = movieId.loc[movieId.index.isin(dupId.index)]
dupplicates

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
26958,Emma (1996),Romance
64997,War of the Worlds (2005),Action|Sci-Fi
144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
147002,Eros (2004),Drama|Romance
168358,Saturn 3 (1980),Sci-Fi|Thriller


In [29]:
# Collect every row (first occurrence and repeat alike) that shares a flagged
# title, then show them ordered by title so each pair sits together.
dup_titles = movieId.loc[movieId['title'].isin(dupplicates['title'])]
dup_titles.sort_values('title')

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
838,Emma (1996),Comedy|Drama|Romance
26958,Emma (1996),Romance
32600,Eros (2004),Drama
147002,Eros (2004),Drama|Romance
2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
168358,Saturn 3 (1980),Sci-Fi|Thriller
34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
64997,War of the Worlds (2005),Action|Sci-Fi


In [32]:
# Before the repeated rows are dropped, fold their extra genres into the rows
# that are kept so no genre information is lost.
# Confessions of a Dangerous Mind (2002): row 144606 adds Romance to row 6003.
movieId.loc[6003,'genres'] = 'Comedy|Crime|Drama|Thriller|Romance'
# Eros (2004): row 147002 lists Drama|Romance while the kept row 32600 only
# lists Drama. The other three kept rows already contain every genre of their repeat.
movieId.loc[32600, 'genres'] = 'Drama|Romance'

In [36]:
# Drop the second occurrence of each repeated title. Reassigning with
# errors='ignore' (instead of inplace=True) makes the cell safe to re-run:
# a second in-place drop would raise KeyError because the IDs are already gone.
movieId = movieId.drop(index=[144606, 26958, 147002, 168358, 64997], errors='ignore')

In [None]:
# Confirm the dropped duplicate is gone (expected result: False). A membership
# test is used because movieId.loc[144606] raises KeyError once the row is removed.
144606 in movieId.index

# Getting Director Names from IMDb

In [42]:
# Attach imdbId/tmdbId to each movie by joining links on the shared movieId index.
# NOTE(review): the name `dir` shadows Python's built-in dir(); later cells
# reference it, so it is left unchanged here.
dir = movieId.join(links)

In [43]:
# Preview the first rows of the joined movie/links table.
dir.head()

Unnamed: 0_level_0,title,genres,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [44]:
# Fetch the IMDb record for ID '114709' (Toy Story's imdbId in the joined table).
ToyStory = imdb.get_movie('114709')

AttributeError: 'builtin_function_or_method' object has no attribute 'head'

In [62]:
# List, in alphabetical order, the information keys available on the fetched record.
print(sorted(ToyStory.keys()))

['akas', 'animation department', 'art department', 'art direction', 'aspect ratio', 'box office', 'camera and electrical department', 'canonical title', 'cast', 'casting department', 'casting director', 'certificates', 'color info', 'composer', 'countries', 'country codes', 'cover url', 'director', 'distributors', 'editor', 'editorial department', 'full-size cover url', 'genres', 'imdbID', 'kind', 'language codes', 'languages', 'localized title', 'long imdb canonical title', 'long imdb title', 'miscellaneous crew', 'music department', 'original air date', 'original title', 'other companies', 'plot', 'plot outline', 'producer', 'production companies', 'production manager', 'rating', 'runtimes', 'smart canonical title', 'smart long imdb canonical title', 'sound crew', 'sound mix', 'synopsis', 'title', 'top 250 rank', 'visual effects', 'votes', 'writer', 'year']


In [107]:
# Name of the first entry in the record's cast list.
# Note: despite the variable name, top_list holds a single string, not a list.
top_list = ToyStory['cast'][0]['name']
top_list

'Tom Hanks'

In [98]:
# Read the first cast member's name straight from the movie record.
# top_list is a plain string at this point, so top_list[0]['name'] would
# raise TypeError (string indices must be integers).
ToyStory['cast'][0]['name']

'Tom Hanks'

In [109]:
def get_top_actor(movie_id):
    """Return the name of the first-listed cast member of an IMDb title.

    Args:
        movie_id: IMDb ID handed to ``imdb.get_movie``.

    Returns:
        The ``'name'`` of the first ``'cast'`` entry, or ``None`` when the
        record carries no usable cast information.
    """
    theMovie = imdb.get_movie(movie_id)
    try:
        return theMovie['cast'][0]['name']
    except (KeyError, IndexError):
        # A title without cast data must not abort a whole column-wide
        # .apply() run; report it as missing instead.
        return None

In [110]:
# Try the helper on Toy Story's IMDb ID.
get_top_actor('114709')

'Tom Hanks'

In [46]:
# Print the name of every director listed on the fetched record, one per line.
for director in ToyStory['directors']:
    print(director['name'])

John Lasseter


In [112]:
def get_directors(movie_id):
    """Return the names of all directors listed for an IMDb title.

    Args:
        movie_id: IMDb ID handed to ``imdb.get_movie``.

    Returns:
        A list with the ``'name'`` of each ``'directors'`` entry; an empty
        list when the record has no director information.
    """
    theMovie = imdb.get_movie(movie_id)
    try:
        directors = theMovie['directors']
    except KeyError:
        # A title without director data must not abort a whole column-wide
        # .apply() run; report it as having no directors.
        return []
    return [director['name'] for director in directors]

In [114]:
# Disabled earlier draft of get_directors: the `return` inside the loop hands
# back only the first director's name (a string) instead of a list of names.
# def get_directorsss(movie_id):
#     theMovie = imdb.get_movie(movie_id)
#     for director in theMovie['directors']:
#          return director['name']


In [117]:
# Try the helper on a single IMDb ID.
# NOTE(review): '11709' differs from the Toy Story ID '114709' used in the
# other cells — looks like a typo; confirm it is intentional.
get_directors('11709')

['Victor Schertzinger']

In [None]:
# Add a 'Top Actor' column by calling get_top_actor once for each row's imdbId.
# NOTE(review): presumably every call is a separate IMDb request, so this may
# be slow over the full table — consider caching the result; confirm.
dir['Top Actor'] = dir['imdbId'].apply(get_top_actor)

In [None]:
# Add an 'Actors' column by applying get_actors to each row's imdbId.
# NOTE(review): get_actors is not defined in the visible part of this notebook
# — confirm it exists, otherwise this cell raises NameError.
dir['Actors'] = dir['imdbId'].apply(get_actors)

In [59]:
# Remove the 'Directors' column if it is present. errors='ignore' keeps the
# cell from raising KeyError when the column was never created or has already
# been dropped, so the cell can be re-run safely.
dir = dir.drop(columns='Directors', errors='ignore')

In [60]:
# Display the joined table (pandas elides the middle rows in the rendered output).
dir

Unnamed: 0_level_0,title,genres,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
5,Father of the Bride Part II (1995),Comedy,113041,11862.0
...,...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0
193585,Flint (2017),Drama,6397426,479308.0
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0
