# 03  Combining the Disney Films List with IMDB Data


## 03.01 Imports


### 03.01.01 Python Imports


In [1]:
import gzip
import pandas as pd

### 03.01.02 IMDB Rating and Vote Data

In [7]:
# Load the IMDB ratings table (tconst, averageRating, numVotes) from the gzipped TSV.
# The file is opened in a `with` block so the handle is closed as soon as
# pandas has finished reading it, instead of staying open for the kernel's lifetime.
with gzip.open('../Other Source Data/IMDB/title.ratings.tsv.gz', 'rb') as rt:
    df_ratings = pd.read_csv(rt, sep='\t', low_memory=False)
df_ratings.head()
# Index	tconst	averageRating	numVotes
# 0	tt0000001	5.7	1868
# 1	tt0000002	5.9	247
# 2	tt0000003	6.5	1640
# 3	tt0000004	5.8	159
# 4	tt0000005	6.2	2463

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1868
1,tt0000002,5.9,247
2,tt0000003,6.5,1640
3,tt0000004,5.8,159
4,tt0000005,6.2,2463


### 03.01.03 Import Feature Films


In [8]:
# Read the Disney films list that was previously combined with IMDB title data,
# then preview the first five rows.
Disney_IMDB = pd.read_csv(filepath_or_buffer='../Bens_Data/Combined_IMDB_Disney_453.csv')
Disney_IMDB.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,lower_title,_merge
0,0,101 Dalmatians,tt0115433,movie,101 Dalmatians,101 Dalmatians,0,1996,\N,103,"Adventure,Comedy,Crime",101 dalmatians,both
1,5,102 Dalmatians,tt0211181,movie,102 Dalmatians,102 Dalmatians,0,2000,\N,100,"Adventure,Comedy,Family",102 dalmatians,both
2,10,"20,000 Leagues Under the Sea",tt0046672,movie,"20,000 Leagues Under the Sea","20,000 Leagues Under the Sea",0,1954,\N,127,"Adventure,Drama,Family",20000 leagues under the sea,both
3,24,A Bug's Life,tt0120623,movie,A Bug's Life,A Bug's Life,0,1998,\N,95,"Adventure,Animation,Comedy",a bugs life,both
4,74,A Christmas Carol,tt1067106,movie,A Christmas Carol,A Christmas Carol,0,2009,\N,96,"Adventure,Animation,Comedy",a christmas carol,both


In [9]:
# Summarize column names, non-null counts and dtypes of the loaded film table.
Disney_IMDB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      453 non-null    int64 
 1   title           453 non-null    object
 2   tconst          453 non-null    object
 3   titleType       453 non-null    object
 4   primaryTitle    453 non-null    object
 5   originalTitle   453 non-null    object
 6   isAdult         453 non-null    int64 
 7   startYear       453 non-null    int64 
 8   endYear         453 non-null    object
 9   runtimeMinutes  453 non-null    object
 10  genres          453 non-null    object
 11  lower_title     453 non-null    object
 12  _merge          453 non-null    object
dtypes: int64(3), object(10)
memory usage: 46.1+ KB


## 03.02 Clean Up Feature Films

In [1]:
# Rename (not drop) the leftover CSV index column to 'orig_index', and mark the
# Disney Films List title column as 'DFL_title'.
column_renames = {
    'Unnamed: 0': 'orig_index',
    'title': 'DFL_title',
}
Disney_IMDB = Disney_IMDB.rename(columns=column_renames)

NameError: name 'Disney_IMDB' is not defined

In [11]:
# Check which values occur in endYear before deciding whether the column is useful.
Disney_IMDB.loc[:, 'endYear'].value_counts()

\N    453
Name: endYear, dtype: int64

In [12]:
# endYear is dropped because we aren't working with any series.
# The old _merge indicator has to go as well, since the upcoming merge creates
# a new _merge column; isAdult is removed along with them.
Disney_IMDB = Disney_IMDB.drop(columns=['_merge', 'isAdult', 'endYear'])

Pulling in the film rating from IMDB

## 03.03 Combine Feature Films and Ratings

In [13]:
# Left-join the ratings onto the film list by IMDB id (tconst): every film row
# is kept, and the `_merge` indicator records whether a rating was found.
# The optional `validate=` uniqueness check remains switched off, as before.
Disney_IMDB = Disney_IMDB.merge(
    df_ratings,
    how="left",
    on='tconst',
    indicator=True,
)

## 03.04 Explore Combined Feature Films and Ratings

### 03.04.01 Clean those missing Ratings and Votes


In [14]:
# Count how many films found a rating ("both") versus none ("left_only").
Disney_IMDB.loc[:, '_merge'].value_counts()

both          442
left_only      11
right_only      0
Name: _merge, dtype: int64

11 films are missing ratings.  Let's take a closer look at those.

In [15]:
# List the films that have no matching row in the ratings table.
Disney_IMDB.loc[Disney_IMDB['_merge'].eq('left_only')]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
79,1607,Doctor Strange in the Multiverse of Madness,tt9419884,movie,Doctor Strange in the Multiverse of Madness,Doctor Strange in the Multiverse of Madness,2022,\N,"Action,Adventure,Fantasy",doctor strange in the multiverse of madness,,,left_only
125,2250,Holes,tt15507514,movie,Holes,Holes,2021,79,Documentary,holes,,,left_only
139,2535,Inspector Gadget,tt11069282,movie,Inspector Gadget,Inspector Gadget,2026,\N,"Action,Adventure,Comedy",inspector gadget,,,left_only
154,2957,Let it Shine,tt7932304,movie,Let It Shine,Let It Shine,2018,98,Family,let it shine,,,left_only
156,2965,Lightyear,tt10298810,movie,Lightyear,Lightyear,2022,\N,"Action,Adventure,Animation",lightyear,,,left_only
159,2971,Lilo & Stitch,tt11655566,movie,Lilo & Stitch,Lilo & Stitch,2024,85,"Action,Adventure,Comedy",lilo stitch,,,left_only
199,3380,National Treasure,tt8292728,movie,National Treasure,Gui bao,1983,96,Drama,national treasure,,,left_only
209,3431,Oceans,tt1171257,movie,Oceans,Oceans,1971,\N,"Documentary,Sport",oceans,,,left_only
232,3771,Polly,tt1924347,movie,Polly,Polly,2010,50,Horror,polly,,,left_only
385,5547,The Secret of The Magic Gourd,tt6476724,movie,The Secret of the Magic Gourd,Bao hu lu de bi mi,1963,68,"Comedy,Drama,Fantasy",the secret of the magic gourd,,,left_only


In [16]:
# tt9419884 has not been released yet (expected May 2022).
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("doctor strange in the multiverse of madness")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
79,1607,Doctor Strange in the Multiverse of Madness,tt9419884,movie,Doctor Strange in the Multiverse of Madness,Doctor Strange in the Multiverse of Madness,2022,\N,"Action,Adventure,Fantasy",doctor strange in the multiverse of madness,,,left_only


In [17]:
# Remove the unreleased "Doctor Strange in the Multiverse of Madness" (tt9419884).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt9419884")]

In [18]:
# Duplicate missed in the previous file: tt15507514 should have been dropped.
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("holes")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
124,2244,Holes,tt0311289,movie,Holes,Holes,2003,117,"Adventure,Comedy,Drama",holes,7.0,85609.0,both
125,2250,Holes,tt15507514,movie,Holes,Holes,2021,79,Documentary,holes,,,left_only


In [19]:
# Remove the duplicate "Holes" entry (tt15507514, the 2021 documentary).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt15507514")]

In [20]:
# tt11069282 has not been released yet (expected 2026) and should have been dropped.
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("inspector gadget")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
138,2532,Inspector Gadget,tt0141369,movie,Inspector Gadget,Inspector Gadget,1999,78,"Action,Adventure,Comedy",inspector gadget,4.1,47777.0,both
139,2535,Inspector Gadget,tt11069282,movie,Inspector Gadget,Inspector Gadget,2026,\N,"Action,Adventure,Comedy",inspector gadget,,,left_only


In [21]:
# Remove the unreleased "Inspector Gadget" entry (tt11069282).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt11069282")]

In [2]:
# Error missed in the previous file: this is not a Disney movie. The Disney
# film, tt2165933, was actually a TV movie, so neither of these should be in the list.
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("let it shine")]

NameError: name 'Disney_IMDB' is not defined

In [23]:
# Remove the non-Disney "Let It Shine" entry (tt7932304).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt7932304")]

In [24]:
# tt10298810 has not been released yet (expected June 2022).
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("lightyear")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
156,2965,Lightyear,tt10298810,movie,Lightyear,Lightyear,2022,\N,"Action,Adventure,Animation",lightyear,,,left_only


In [25]:
# Remove the unreleased "Lightyear" entry (tt10298810).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt10298810")]

In [26]:
# tt11655566 has not been released yet (expected 2024).
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("lilo")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
158,2968,Lilo & Stitch,tt0275847,movie,Lilo & Stitch,Lilo & Stitch,2002,85,"Adventure,Animation,Comedy",lilo stitch,7.3,183336.0,both
159,2971,Lilo & Stitch,tt11655566,movie,Lilo & Stitch,Lilo & Stitch,2024,85,"Action,Adventure,Comedy",lilo stitch,,,left_only


In [27]:
# Remove the unreleased "Lilo & Stitch" entry (tt11655566).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt11655566")]

In [28]:
# tt8292728 is not a Disney title and should be dropped.
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("national")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
198,3366,National Treasure,tt0368891,movie,National Treasure,National Treasure,2004,131,"Action,Adventure,Mystery",national treasure,6.9,323124.0,both
199,3380,National Treasure,tt8292728,movie,National Treasure,Gui bao,1983,96,Drama,national treasure,,,left_only
200,3382,National Treasure: Book of Secrets,tt0465234,movie,National Treasure: Book of Secrets,National Treasure: Book of Secrets,2007,124,"Action,Adventure,Mystery",national treasure book of secrets,6.5,231520.0,both


In [29]:
# Remove the non-Disney "National Treasure" entry (tt8292728).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt8292728")]

In [30]:
# tt1171257 is not a Disney movie.
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("oceans")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
208,3425,Oceans,tt0765128,movie,Oceans,Océans,2009,104,Documentary,oceans,7.7,9609.0,both
209,3431,Oceans,tt1171257,movie,Oceans,Oceans,1971,\N,"Documentary,Sport",oceans,,,left_only


In [31]:
# Remove the non-Disney "Oceans" entry (tt1171257).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt1171257")]

In [32]:
# tt1924347 was a TV movie and should have been excluded.
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("polly")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
232,3771,Polly,tt1924347,movie,Polly,Polly,2010,50,Horror,polly,,,left_only
233,3788,Pollyanna,tt0054195,movie,Pollyanna,Pollyanna,1960,134,"Comedy,Drama,Family",pollyanna,7.4,10132.0,both


In [33]:
# Remove "Polly" (tt1924347), which was flagged as a TV movie.
Disney_IMDB.drop(Disney_IMDB[Disney_IMDB['tconst'] == "tt1924347"].index, inplace = True)
# NOTE(review): a second drop used to follow here, filtering on
# orig_index == "3787". orig_index is an int64 column, so comparing it with a
# string matched no rows and the statement never removed anything; it has been
# deleted as dead code. If the film with orig_index 3787 really needs to be
# removed, drop it by its tconst like the other entries.

In [34]:
# Error made in the previous file: tt6476724 should have been dropped.
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("the secret of the magic gourd")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
384,5546,The Secret of The Magic Gourd,tt0496386,movie,The Secret of the Magic Gourd,Bao hu lu de mi mi,2007,84,"Drama,Family",the secret of the magic gourd,5.1,707.0,both
385,5547,The Secret of The Magic Gourd,tt6476724,movie,The Secret of the Magic Gourd,Bao hu lu de bi mi,1963,68,"Comedy,Drama,Fantasy",the secret of the magic gourd,,,left_only


In [35]:
# Remove the mismatched "The Secret of the Magic Gourd" entry (tt6476724).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt6476724")]

In [36]:
# tt4872762 has not been released yet (expected 2024).
Disney_IMDB.loc[Disney_IMDB['lower_title'].str.contains("the sword in the stone")]

Unnamed: 0,orig_index,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,_merge
396,5597,The Sword in the Stone,tt0057546,movie,The Sword in the Stone,The Sword in the Stone,1963,79,"Adventure,Animation,Comedy",the sword in the stone,7.2,95874.0,both
397,5602,The Sword in the Stone,tt4872762,movie,The Sword in the Stone,The Sword in the Stone,2024,\N,"Action,Adventure,Family",the sword in the stone,,,left_only


In [37]:
# Remove the unreleased "The Sword in the Stone" entry (tt4872762).
Disney_IMDB = Disney_IMDB.loc[Disney_IMDB['tconst'].ne("tt4872762")]

In [38]:
# Re-check the row count and non-null counts after removing the films without ratings.
Disney_IMDB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 442 entries, 0 to 452
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   orig_index      442 non-null    int64   
 1   DFL_title       442 non-null    object  
 2   tconst          442 non-null    object  
 3   titleType       442 non-null    object  
 4   primaryTitle    442 non-null    object  
 5   originalTitle   442 non-null    object  
 6   startYear       442 non-null    int64   
 7   runtimeMinutes  442 non-null    object  
 8   genres          442 non-null    object  
 9   lower_title     442 non-null    object  
 10  averageRating   442 non-null    float64 
 11  numVotes        442 non-null    float64 
 12  _merge          442 non-null    category
dtypes: category(1), float64(2), int64(2), object(8)
memory usage: 45.5+ KB


In [39]:
# Confirm which title types remain in the cleaned table.
Disney_IMDB.loc[:, 'titleType'].value_counts()

movie    442
Name: titleType, dtype: int64

We've removed errors, duplicates, and anything that was not a feature release.  That leaves us with 442 Disney movies. This is the data we will use to scrape IMDB for additional features.

## 03.05 Export Feature Films

In [41]:
# Write the cleaned film/ratings table to CSV. The row index is written too
# (pandas default), so it comes back as an unnamed first column on reload.
Disney_IMDB.to_csv(path_or_buf='../Bens_Data/Updated_IMDB_Disney_442.csv')