In [1]:
"Movie Predictions"
import pandas as pd
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")

print ('retrieving basic url')
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False, compression = 'gzip')
print(' retrieved basic url')

print ('retrieving ratings url')
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False, compression = 'gzip')
print ('retrieved ratings url')

print ('retrieving akas url')
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False, compression = 'gzip')
print ('retrieved akas url')

retrieving basic url
 retrieved basic url
retrieving ratings url
retrieved ratings url
retrieving akas url
retrieved akas url


In [7]:
basics.info

<bound method DataFrame.info of              tconst  titleType               primaryTitle  \
0         tt0000001      short                 Carmencita   
1         tt0000002      short     Le clown et ses chiens   
2         tt0000003      short             Pauvre Pierrot   
3         tt0000004      short                Un bon bock   
4         tt0000005      short           Blacksmith Scene   
...             ...        ...                        ...   
10100790  tt9916848  tvEpisode              Episode #3.17   
10100791  tt9916850  tvEpisode              Episode #3.19   
10100792  tt9916852  tvEpisode              Episode #3.20   
10100793  tt9916856      short                   The Wind   
10100794  tt9916880  tvEpisode  Horrid Henry Knows It All   

                      originalTitle isAdult startYear endYear runtimeMinutes  \
0                        Carmencita       0      1894      \N              1   
1            Le clown et ses chiens       0      1892      \N              

# Replace \n with np.nan

In [10]:
akas = akas.replace({'\\N':'np.nan'})

In [9]:
basics = basics.replace({'\\N':'np.nan'})

In [11]:
ratings = ratings.replace({'\\N':'np.nan'})

In [13]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,np.nan,imdbDisplay,np.nan,0
1,tt0000001,2,Carmencita,DE,np.nan,np.nan,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,np.nan,imdbDisplay,np.nan,0
3,tt0000001,4,Καρμενσίτα,GR,np.nan,imdbDisplay,np.nan,0
4,tt0000001,5,Карменсита,RU,np.nan,imdbDisplay,np.nan,0


In [14]:
akas = akas[akas.region == 'US']

In [17]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,np.nan,imdbDisplay,np.nan,0
14,tt0000002,7,The Clown and His Dogs,US,np.nan,np.nan,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,np.nan,imdbDisplay,np.nan,0
36,tt0000005,1,Blacksmithing Scene,US,np.nan,alternative,np.nan,0
41,tt0000005,6,Blacksmith Scene #1,US,np.nan,alternative,np.nan,0


In [18]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

0            True
1            True
2           False
3           False
4            True
            ...  
10100790    False
10100791    False
10100792    False
10100793    False
10100794    False
Name: tconst, Length: 10100795, dtype: bool

In [19]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,np.nan,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,np.nan,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,np.nan,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,np.nan,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,np.nan,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10100656,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,np.nan,58,Family
10100685,tt9916620,movie,The Copeland Case,The Copeland Case,0,np.nan,np.nan,np.nan,Drama
10100723,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,np.nan,np.nan,np.nan,"Drama,Short"
10100746,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,np.nan,np.nan,Short


# Eliminate movies that are null for runtimeMinutes
# Eliminate movies that are null for genre

In [20]:
# Exclude movies that are null for genre
null_genre = basics['genres'].str.contains('np.nan',case=False)
basics = basics[~null_genre]

In [22]:
# Exclude movies that are null for runtime
null_runtime = basics['runtimeMinutes'].str.contains('np.nan',case=False)
basics = basics[~null_runtime]

In [24]:
basics.tail()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
10100488,tt9916214,short,Drown the Clown,Drown the Clown,0,2019,np.nan,8,"Drama,Short"
10100508,tt9916254,video,Big Tit Cream Pie 32,Big Tit Cream Pie 32,1,2015,np.nan,226,Adult
10100554,tt9916348,video,Ancient World Exposed,Ancient World Exposed,0,2019,np.nan,67,History
10100561,tt9916362,movie,Coven,Akelarre,0,2020,np.nan,92,"Drama,History"
10100656,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,np.nan,58,Family


# Keep only the movies

In [25]:
basics = basics[basics.titleType == 'movie']

In [26]:
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,np.nan,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,np.nan,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,np.nan,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,np.nan,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,np.nan,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
10100150,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019,np.nan,70,Documentary
10100328,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019,np.nan,97,"Comedy,Drama,Fantasy"
10100468,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,np.nan,51,Drama
10100477,tt9916190,movie,Safeguard,Safeguard,0,2020,np.nan,95,"Action,Adventure,Thriller"


In [None]:
# drop the null values from startYear so we can convert it



In [28]:
# Exclude movies that are null for genre
null_start = basics['startYear'].str.contains('np.nan',case=False)
basics = basics[~null_start]

In [29]:
# Convert the startYear column to float data type.
basics['startYear'] = basics['startYear'].astype(float)
# Confirm the datatype
basics.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics['startYear'] = basics['startYear'].astype(float)


tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [31]:
basics = basics[(basics.startYear >=2000) & (basics.startYear <=2021)]

In [32]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]


In [34]:
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1990
1,tt0000002,5.8,264
2,tt0000003,6.5,1863
3,tt0000004,5.5,177
4,tt0000005,6.2,2647
...,...,...,...
1340666,tt9916730,8.3,10
1340667,tt9916766,7.0,22
1340668,tt9916778,7.2,36
1340669,tt9916840,8.8,6


In [35]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = ratings['tconst'].isin(basics['tconst'])
keepers



0          False
1          False
2          False
3          False
4          False
           ...  
1340666    False
1340667    False
1340668    False
1340669    False
1340670    False
Name: tconst, Length: 1340671, dtype: bool

In [36]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [37]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,np.nan,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,np.nan,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,np.nan,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,np.nan,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,np.nan,100,"Comedy,Horror,Sci-Fi"


In [42]:
## Save current dataframe to file.
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [43]:
# Open saved file and preview again
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,np.nan,imdbDisplay,np.nan,0
1,tt0000002,7,The Clown and His Dogs,US,np.nan,np.nan,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,np.nan,imdbDisplay,np.nan,0
3,tt0000005,1,Blacksmithing Scene,US,np.nan,alternative,np.nan,0
4,tt0000005,6,Blacksmith Scene #1,US,np.nan,alternative,np.nan,0


In [44]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [45]:
# Open saved file and preview again
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1990
1,tt0000002,5.8,264
2,tt0000003,6.5,1863
3,tt0000004,5.5,177
4,tt0000005,6.2,2647


In [38]:
akas.info

<bound method DataFrame.info of             titleId  ordering                                      title  \
5         tt0000001         6                                 Carmencita   
14        tt0000002         7                     The Clown and His Dogs   
33        tt0000005        10                           Blacksmith Scene   
36        tt0000005         1                        Blacksmithing Scene   
41        tt0000005         6                        Blacksmith Scene #1   
...             ...       ...                                        ...   
36950627  tt9916560         1  March of Dimes Presents: Once Upon a Dime   
36950697  tt9916620         1                          The Copeland Case   
36950786  tt9916702         1              Loving London: The Playground   
36950829  tt9916756         1                   Pretty Pretty Black Girl   
36950845  tt9916764         1                                         38   

         region language        types             attri

In [39]:
basics.info

<bound method DataFrame.info of           tconst titleType                                       primaryTitle  \
0      tt0035423     movie                                     Kate & Leopold   
1      tt0062336     movie  The Tango of the Widower and Its Distorting Mi...   
2      tt0068865     movie                                Lives of Performers   
3      tt0069049     movie                         The Other Side of the Wind   
4      tt0088751     movie                                  The Naked Monster   
...          ...       ...                                                ...   
82004  tt9914942     movie                             Life Without Sara Amat   
82005  tt9915872     movie                               The Last White Witch   
82006  tt9916170     movie                                      The Rehearsal   
82007  tt9916190     movie                                          Safeguard   
82008  tt9916362     movie                                              Coven

In [40]:
ratings.info

<bound method DataFrame.info of             tconst  averageRating  numVotes
0        tt0000001            5.7      1990
1        tt0000002            5.8       264
2        tt0000003            6.5      1863
3        tt0000004            5.5       177
4        tt0000005            6.2      2647
...            ...            ...       ...
1340666  tt9916730            8.3        10
1340667  tt9916766            7.0        22
1340668  tt9916778            7.2        36
1340669  tt9916840            8.8         6
1340670  tt9916880            8.2         6

[1340671 rows x 3 columns]>