In [1]:
# imports
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine

# Title.akas.tsv.gz

- **titleId (string):** A `tconst`, an alphanumeric unique identifier of the title
- **ordering (integer):** A number to uniquely identify rows for a given `titleId`
- **title (string):** The localized title
- **region (string):** The region for this version of the title
- **language (string):** The language of the title
- **types (array):** Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
- **attributes (array):** Additional terms to describe this alternative title, not enumerated
- **isOriginalTitle (boolean):** 0: not original title; 1: origin
# Title.basics.tsv.gz

- **tconst (string):** Alphanumeric unique identifier of the title
- **titleType (string):** The type/format of the title (e.g., movie, short, tvseries, tvepisode, video, etc)
- **primaryTitle (string):** The more popular title / the title used by the filmmakers on promotional materials at the point of release
- **originalTitle (string):** Original title, in the original language
- **isAdult (boolean):** 0: non-adult title; 1: adult title
- **startYear (YYYY):** Represents the release year of a title. In the case of TV Series, it is the series start year
- **endYear (YYYY):** TV Series end year. ‘\N’ for all other title types
- **runtimeMinutes:** Primary runtime of the title, in minutes
- **genres (string array):** Includes up to three genres associated with

# Title.ratings.tsv.gz

- **tconst (string):** Alphanumeric unique identifier of the title
- **averageRating:** Weighted average of all the individual user ratings
- **numVotes:** Number of votes the title has received
 the title
al title


[Source](https://datasets.imdbws.com/)


In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [4]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
df = basics

In [6]:
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [7]:
# Reduce memory usage
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':  # for integers
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:  # for floats.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [8]:
reduce_mem_usage(df)

Mem. usage decreased to 5800.61 Mb (0.0% reduction)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10277308,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10277309,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10277310,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10277311,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [9]:
# Replace \N with nan
df = df.replace({'\\N':np.nan})
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [10]:
df = df.dropna(subset=['runtimeMinutes'])

In [11]:
if df['runtimeMinutes'].isnull().any():
    print("There are null values in the runtimeMinutes column.")
else:
    print("No null values found in the runtimeMinutes column.")

No null values found in the runtimeMinutes column.


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3096871 entries, 0 to 10277312
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 236.3+ MB


In [13]:
df = df.dropna(subset=['genres'])

In [14]:
if df['genres'].isnull().any():
    print("There are null values in the genres column.")
else:
    print("No null values found in the genres column.")

No null values found in the genres column.


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3015959 entries, 0 to 10277312
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 230.1+ MB


In [16]:
df = df[df['titleType'] == 'movie']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 391602 entries, 8 to 10277263
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          391602 non-null  object
 1   titleType       391602 non-null  object
 2   primaryTitle    391602 non-null  object
 3   originalTitle   391602 non-null  object
 4   isAdult         391602 non-null  object
 5   startYear       384948 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  391602 non-null  object
 8   genres          391602 non-null  object
dtypes: object(9)
memory usage: 29.9+ MB


In [17]:
#Check all records with year 
df = df[df['startYear'].str.contains('2000|2001|2002', na=False, regex=True, case=False)] 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11725 entries, 34800 to 10258126
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          11725 non-null  object
 1   titleType       11725 non-null  object
 2   primaryTitle    11725 non-null  object
 3   originalTitle   11725 non-null  object
 4   isAdult         11725 non-null  object
 5   startYear       11725 non-null  object
 6   endYear         0 non-null      object
 7   runtimeMinutes  11725 non-null  object
 8   genres          11725 non-null  object
dtypes: object(9)
memory usage: 916.0+ KB


In [18]:
# Exclude movies that are included in the documentary category.
is_documentary = df['genres'].str.contains('documentary',case=False)
df = df[~is_documentary]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8652 entries, 34800 to 10257684
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          8652 non-null   object
 1   titleType       8652 non-null   object
 2   primaryTitle    8652 non-null   object
 3   originalTitle   8652 non-null   object
 4   isAdult         8652 non-null   object
 5   startYear       8652 non-null   object
 6   endYear         0 non-null      object
 7   runtimeMinutes  8652 non-null   object
 8   genres          8652 non-null   object
dtypes: object(9)
memory usage: 675.9+ KB


In [19]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

mylist = []

for chunk in  pd.read_csv(akas_url, sep='\t', chunksize=20000):
    mylist.append(chunk)

akas = pd.concat(mylist, axis= 0)
del mylist

In [20]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [21]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =df['tconst'].isin(akas['titleId'])
keepers

34800       True
93927       True
100065      True
110353      True
110465      True
            ... 
10230391    True
10230557    True
10241040    True
10248169    True
10257684    True
Name: tconst, Length: 8652, dtype: bool

In [22]:
df = df[keepers]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8643 entries, 34800 to 10257684
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          8643 non-null   object
 1   titleType       8643 non-null   object
 2   primaryTitle    8643 non-null   object
 3   originalTitle   8643 non-null   object
 4   isAdult         8643 non-null   object
 5   startYear       8643 non-null   object
 6   endYear         0 non-null      object
 7   runtimeMinutes  8643 non-null   object
 8   genres          8643 non-null   object
dtypes: object(9)
memory usage: 675.2+ KB


In [26]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


['.ipynb_checkpoints']

In [27]:
## Save current dataframe to file.
df.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [28]:
# Open saved file and preview again
df = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
2,tt0102362,movie,Istota,Istota,0,2000,,80,"Drama,Romance"
3,tt0112912,movie,Dune 7,Dune 7,0,2002,,97,Adventure
4,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,,86,"Musical,Romance"
