In [None]:
import pandas as pd
import numpy as np
from os.path import exists

# IMDB DATA
DOWNLOAD: https://datasets.imdbws.com

DETAILS:  https://www.imdb.com/interfaces/


## Analyzing each of the **individual** TSV files

### title.akas.tsv.gz - Contains the following information for titles:

- titleId (string) - a tconst, an alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- title (string) – the localized title
- region (string) - the region for this version of the title
- language (string) - the language of the title
- types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
- attributes (array) - Additional terms to describe this alternative title, not enumerated
- isOriginalTitle (boolean) – 0: not original title; 1: original title

In [None]:
# IGNORED DUE TO LACK OF USEFULNESS

# try:
#         title_akas = pd.read_feather('data/title_akas.feather')
# except:
#         title_akas = pd.read_csv(
#                 "data/title_akas.tsv",
#                 sep="\t",
#                 usecols=[
#                 "titleId",
#                 "title",
#                 "region",
#                 "language",
#                 "types",
#                 "attributes",
#                 ],
#                 dtype={
#                         "titleId": "string",
#                         "title": "string",
#                         "region": "string",
#                         "language": "string",
#                         "types": "category",
#                         "attributes": "category",
#                 },
#                 )

In [None]:
# IGNORED DUE TO LACK OF USEFULNESS

# # CLEANING SECTION
# if not exists("data/title_akas.feather"):
#     cleaned_title_akas = title_akas.copy()
# else:
#     pass

In [None]:
# # IGNORED DUE TO LACK OF USEFULNESS

# # SAVING SECTION
# if not exists("data/title_akas.feather"):
#     # Clean up indexing for .feather file
#     cleaned_title_akas.reset_index(inplace=True, drop=True)
#     # Replace '\N' with proper NaN
#     cleaned_title_akas.replace('\\N', np.NaN, inplace=True)
#     # Save to Feather
#     cleaned_title_akas.to_feather("data/title_akas.feather")
# else:
#     pass

### title.basics.tsv.gz - Contains the following information for titles:
- tconst (string) - alphanumeric unique identifier of the title
- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string) - original title, in the original language
- isAdult (boolean) - 0: non-adult title; 1: adult title
- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
- runtimeMinutes – primary runtime of the title, in minutes
- genres (string array) – includes up to three genres associated with the title

In [None]:
# Load title_basics: use the cached Feather file when it exists, otherwise
# parse the raw TSV. An explicit existence check (the same one the cleaning and
# saving cells use) replaces the former bare `except:`, which also swallowed
# unrelated failures such as a corrupt cache or a keyboard interrupt.
if exists("data/title_basics.feather"):
    title_basics = pd.read_feather("data/title_basics.feather")
else:
    title_basics = pd.read_csv(
        "data/title_basics.tsv",
        sep="\t",
        usecols=[
            "tconst",
            "titleType",
            "primaryTitle",
            "originalTitle",
            "startYear",
            "runtimeMinutes",
            "genres",
        ],
        dtype={
            "tconst": "string",
            "titleType": "string",
            "primaryTitle": "string",
            "originalTitle": "string",
            # "startYear": "Int16",
            # "runtimeMinutes": "Int16",
            "genres": "category",
        },
        # parse_dates=["startYear"],
        # date_parser=lambda x: pd.to_datetime(x, format="%Y"), # See https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
    )

In [None]:
# CLEANING SECTION
# Nothing is filtered for title_basics; when no Feather cache exists yet we
# only take a working copy of the freshly parsed frame.
if not exists("data/title_basics.feather"):
    cleaned_title_basics = title_basics.copy()


In [None]:
# SAVING SECTION: runs only when no cached Feather file exists yet.
if not exists("data/title_basics.feather"):
    # Clean up indexing for .feather file
    cleaned_title_basics.reset_index(inplace=True, drop=True)
    # Replace IMDb's '\N' placeholder with a proper missing value.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    cleaned_title_basics.replace('\\N', np.nan, inplace=True)
    # Save to Feather
    cleaned_title_basics.to_feather("data/title_basics.feather")

### title.principals.tsv
- tconst (string) - alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- nconst (string) - alphanumeric unique identifier of the name/person
- category (string) - the category of job that person was in
- job (string) - the specific job title if applicable, else '\N'
- characters (string) - the name of the character played if applicable, else '\N'

In [None]:
# Load title_principals: use the cached Feather file when it exists, otherwise
# parse the raw TSV. An explicit existence check (the same one the cleaning and
# saving cells use) replaces the former bare `except:`, which also swallowed
# unrelated failures such as a corrupt cache or a keyboard interrupt.
if exists("data/title_principals.feather"):
    title_principals = pd.read_feather("data/title_principals.feather")
else:
    title_principals = pd.read_csv(
        "data/title_principals.tsv",
        sep="\t",
        usecols=[
            "tconst",
            "nconst",
            "category",
            "characters",
        ],
        dtype={
            "tconst": "string",
            "nconst": "string",
            "category": "string",
            "characters": "string",
        },
    )

In [None]:
# CLEANING SECTION
if not exists("data/title_principals.feather"):
    # We only want actors, actresses & directors.
    # Lower-case the column once and test all three words with a single regex
    # instead of three separate lower()+contains() passes over the column.
    # na=False makes missing categories count as "no match", so the mask is a
    # plain boolean Series.
    principal_roles = title_principals["category"].str.lower()
    is_wanted_role = principal_roles.str.contains(
        "actor|actress|director", regex=True, na=False
    )
    # Boolean indexing already returns a new frame, so no prior copy is needed.
    cleaned_title_principals = title_principals.loc[is_wanted_role]

In [None]:

# SAVING SECTION: runs only when no cached Feather file exists yet.
if not exists("data/title_principals.feather"):
    # Clean up indexing for .feather file
    cleaned_title_principals.reset_index(inplace=True, drop=True)
    # Replace IMDb's '\N' placeholder with a proper missing value.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    cleaned_title_principals.replace('\\N', np.nan, inplace=True)
    # Save to Feather
    cleaned_title_principals.to_feather("data/title_principals.feather")

### title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles
- tconst (string) - alphanumeric unique identifier of the title
- averageRating – weighted average of all the individual user ratings
- numVotes - number of votes the title has received

In [None]:
# Load title_ratings: use the cached Feather file when it exists, otherwise
# parse the raw TSV. An explicit existence check (the same one the cleaning and
# saving cells use) replaces the former bare `except:`, which also swallowed
# unrelated failures such as a corrupt cache or a keyboard interrupt.
if exists("data/title_ratings.feather"):
    title_ratings = pd.read_feather("data/title_ratings.feather")
else:
    title_ratings = pd.read_csv(
        "data/title_ratings.tsv",
        sep="\t",
        dtype={
            "tconst": "string",
            "averageRating": "Float32",
            "numVotes": "Int32",
        },
    )

In [None]:
# CLEANING SECTION
# Nothing is filtered for title_ratings; when no Feather cache exists yet we
# only take a working copy of the freshly parsed frame.
if not exists("data/title_ratings.feather"):
    cleaned_title_ratings = title_ratings.copy()


In [None]:

# SAVING SECTION: runs only when no cached Feather file exists yet.
if not exists("data/title_ratings.feather"):
    # Clean up indexing for .feather file
    cleaned_title_ratings.reset_index(inplace=True, drop=True)
    # Replace IMDb's '\N' placeholder with a proper missing value.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    cleaned_title_ratings.replace('\\N', np.nan, inplace=True)
    # Save to Feather
    cleaned_title_ratings.to_feather("data/title_ratings.feather")

# title_ratings.info()

### name.basics.tsv.gz – Contains the following information for names:
- nconst (string) - alphanumeric unique identifier of the name/person
- primaryName (string)– name by which the person is most often credited
- birthYear – in YYYY format
- deathYear – in YYYY format if applicable, else '\N'
- primaryProfession (array of strings)– the top-3 professions of the person
- knownForTitles (array of tconsts) – titles the person is known for

In [None]:
# Load name_basics: use the cached Feather file when it exists, otherwise
# parse the raw TSV. An explicit existence check (the same one the cleaning and
# saving cells use) replaces the former bare `except:`, which also swallowed
# unrelated failures such as a corrupt cache or a keyboard interrupt.
if exists("data/name_basics.feather"):
    name_basics = pd.read_feather("data/name_basics.feather")
else:
    name_basics = pd.read_csv(
        "data/name_basics.tsv",
        sep="\t",
        usecols=[
            "nconst",
            "primaryName",
            "primaryProfession",
            "knownForTitles",
        ],
        dtype={
            "nconst": "string",
            "primaryName": "string",
            "primaryProfession": "category",
            "knownForTitles": "category",
        },
    )

#### Cleaning name_basics

In [None]:
# CLEANING SECTION
if not exists("data/name_basics.feather"):
    # If knownForTitles is \N then drop it, it's useless.
    has_known_titles = name_basics["knownForTitles"] != "\\N"

    # We only want actors, actresses & directors.
    # Lower-case the column once and test all three words with a single regex
    # instead of three separate lower()+contains() passes over the column.
    # na=False makes missing professions count as "no match", so the mask is a
    # plain boolean Series.
    professions = name_basics["primaryProfession"].str.lower()
    is_wanted_profession = professions.str.contains(
        "actor|actress|director", regex=True, na=False
    )

    # Applying both row filters at once selects the same rows as applying them
    # one after the other, and boolean indexing already returns a new frame.
    cleaned_name_basics = name_basics.loc[has_known_titles & is_wanted_profession]

#### Saving as a feather file:

In [None]:

# SAVING SECTION: runs only when no cached Feather file exists yet.
if not exists("data/name_basics.feather"):
    # Clean up indexing for .feather file
    cleaned_name_basics.reset_index(inplace=True, drop=True)
    # Replace IMDb's '\N' placeholder with a proper missing value.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    cleaned_name_basics.replace('\\N', np.nan, inplace=True)
    # Save to Feather
    cleaned_name_basics.to_feather("data/name_basics.feather")

In [None]:
# IGNORED DUE TO LACK OF USEFULNESS

# if not exists("data/title_akas.feather"):
#     title_akas = cleaned_title_akas
#     del cleaned_title_akas
# else:
#     pass
# title_akas.head(3)

In [None]:
# Promote the cleaned frame to the canonical name and free the duplicate.
# The saving cell has already written data/title_basics.feather by the time
# this cell runs, so testing for the file on disk can never detect a fresh
# clean. Check for the cleaned variable itself instead; it only exists when the
# cleaning cell actually ran in this session.
if "cleaned_title_basics" in globals():
    title_basics = cleaned_title_basics
    del cleaned_title_basics
title_basics.head(3)

In [None]:
# Promote the cleaned frame to the canonical name and free the duplicate.
# The saving cell has already written data/title_ratings.feather by the time
# this cell runs, so testing for the file on disk can never detect a fresh
# clean. Check for the cleaned variable itself instead; it only exists when the
# cleaning cell actually ran in this session.
if "cleaned_title_ratings" in globals():
    title_ratings = cleaned_title_ratings
    del cleaned_title_ratings
title_ratings.head(3)

In [None]:
# Promote the cleaned (filtered) frame to the canonical name and free the
# duplicate. The saving cell has already written data/name_basics.feather by
# the time this cell runs, so testing for the file on disk can never detect a
# fresh clean, and the unfiltered frame would stay in use on a first run.
# Check for the cleaned variable itself instead; it only exists when the
# cleaning cell actually ran in this session.
if "cleaned_name_basics" in globals():
    name_basics = cleaned_name_basics
    del cleaned_name_basics
name_basics.head(3)

In [None]:
# Promote the cleaned (filtered) frame to the canonical name and free the
# duplicate. The saving cell has already written data/title_principals.feather
# by the time this cell runs, so testing for the file on disk can never detect
# a fresh clean, and the unfiltered frame would stay in use on a first run.
# Check for the cleaned variable itself instead; it only exists when the
# cleaning cell actually ran in this session.
if "cleaned_title_principals" in globals():
    title_principals = cleaned_title_principals
    del cleaned_title_principals
title_principals.head(3)

## Reducing the above into something more manageable

In [None]:
# Start the combined frame from a copy of title_basics.
# NOTE(review): the next code cell rebinds imdb_final_df to the result of
# pd.merge(title_basics, title_ratings, ...), so this copy looks redundant —
# confirm before removing.
imdb_final_df = title_basics.copy()

### Merge ratings to imdb_final_df
Let's try to merge & clear what we don't need to free up mem:

In [None]:
# Reload the ratings from the Feather cache and attach them to every title.
# A left join keeps titles that have no rating row.
title_ratings = pd.read_feather("data/title_ratings.feather")
imdb_final_df = title_basics.merge(title_ratings, how="left", on="tconst")
# del title_ratings

### Merge title_principals & name_basics

Create DF that:
- only shows directors, actors and actresses
- is a merge of title_principals & name_basics

In [None]:
# Work on a copy so the filtering and merging in the following cells rebind
# actors_and_directors without touching title_principals.
actors_and_directors = title_principals.copy()

In [None]:
# Keep only rows whose category is exactly director, actor or actress.
wanted_categories = ["director", "actor", "actress"]
actors_and_directors = actors_and_directors[
    actors_and_directors["category"].isin(wanted_categories)
]

In [None]:
# Attach the person details from name_basics to each credit (default inner
# join on nconst, so credits without a matching name row are dropped).
actors_and_directors = actors_and_directors.merge(name_basics, on="nconst")

In [None]:
# originalTitle is not needed in the final table.
imdb_final_df = imdb_final_df.drop(columns=["originalTitle"])

In [None]:
# Keep only TV series and movies, turn IMDb's '\N' placeholder into a real
# missing value, and renumber the rows (a default index is needed for Feather).
# Written as one non-mutating chain rather than in-place edits on a filtered
# slice; np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# "short" was considered as a third title type but is probably not needed.
imdb_final_df = (
    imdb_final_df
    .loc[imdb_final_df["titleType"].isin(["tvSeries", "movie"])]
    .replace("\\N", np.nan)
    .reset_index(drop=True)
)

In [None]:
# Write imdb_final_df to a Feather file so later sessions can load it directly.
imdb_final_df.to_feather("data/imdb_final_df_v2.feather")

In [None]:
# Remove columns that are no longer needed for lookups:
#   primaryProfession - already have "category"
#   characters        - cool but not useful for lookup
#   nconst            - already served its purpose (the merge key)
actors_and_directors = actors_and_directors.drop(
    columns=["primaryProfession", "characters", "nconst"]
)

In [None]:
# Write the merged actor/director credits to a Feather file.
actors_and_directors.to_feather("data/actors_and_directors_v2.feather")

Now we have two useful DataFrames that come to 0.74 GB, instead of multiple TSV files that come to 5.68 GB.

In [None]:
# Release the large source frames; only the two reduced frames are used below.
del title_principals, name_basics, title_basics

# From now on we can run from this cell instead of the start
This should save a few minutes.

In [None]:
# From now on we can run from this line instead of the start:
import pandas as pd
import numpy as np

# Reload the two reduced frames written earlier in this notebook.
imdb_final_df = pd.read_feather("data/imdb_final_df_v2.feather")
# The notebook saves this frame as "actors_and_directors_v2.feather"; the
# un-suffixed name is never written here, so reading it fails on a clean run.
# Note that the _v2 file no longer contains primaryProfession, characters or
# nconst, because they are dropped before it is saved.
actors_and_directors = pd.read_feather("data/actors_and_directors_v2.feather")

In [None]:
# Collect every individual genre name. Each category of the "genres" column is
# a comma-separated combination; splitting a value without a comma simply
# yields that value on its own.
unique_genres = {
    genre
    for combination in imdb_final_df["genres"].cat.categories
    for genre in combination.split(",")
}
# unique_genres

In [None]:
# Example lookup, narrowed step by step: title "Friends" -> TV series -> Comedy.
search1 = imdb_final_df.loc[imdb_final_df["primaryTitle"].eq("Friends")]
search2 = search1.loc[search1["titleType"].eq("tvSeries")]
search3 = search2.loc[search2["genres"].str.contains("Comedy")]
search3


In [None]:
# All credits recorded for one person, matched on the exact primaryName.
is_aniston = actors_and_directors["primaryName"].eq("Jennifer Aniston")
search4 = actors_and_directors.loc[is_aniston]
search4

In [None]:
# Titles from the title search (search3) in which the person from search4 has
# at least one credit, kept in search3 order.
appears_in = [
    title_id
    for title_id in search3["tconst"]
    if (search4["tconst"] == str(title_id)).any()
]
appears_in

In [None]:
# Display the tconst values collected in appears_in.
appears_in

In [None]:
# Preview the first three rows of actors_and_directors.
actors_and_directors.head(3)

In [None]:
# Director-only view with the columns that are not needed for lookups removed:
#   primaryProfession - already have "category"
#   characters        - cool but not useful for lookup
#   nconst            - already served its purpose
#   category          - every remaining row is a director
# errors="ignore" is required because primaryProfession, characters and nconst
# are already dropped from actors_and_directors before it is saved as the _v2
# Feather file; dropping them unconditionally raises KeyError on that frame.
directors = (
    actors_and_directors
    .loc[actors_and_directors["category"] == "director"]
    .drop(
        columns=["primaryProfession", "characters", "nconst", "category"],
        errors="ignore",
    )
)
directors.head()

In [None]:
# Store the year and runtime as nullable 16-bit integers and lower-case the
# titles.
imdb_final_df = imdb_final_df.astype(
    {"startYear": "Int16", "runtimeMinutes": "Int16"}
)
imdb_final_df = imdb_final_df.assign(
    primaryTitle=imdb_final_df["primaryTitle"].str.lower()
)
imdb_final_df.head(3)

In [None]:
# Split the credits into directors and everyone else.
is_director = actors_and_directors["category"] == "director"
directors_df = actors_and_directors[is_director].copy()
actors_df = actors_and_directors[~is_director].copy()

In [None]:
# Renumber both frames from zero before they are written to Feather.
directors_df = directors_df.reset_index(drop=True)
actors_df = actors_df.reset_index(drop=True)

In [None]:
# Save the director rows to their own Feather file.
directors_df.to_feather("data/directors.feather")

In [None]:
# knownForTitles is not kept in the actors table.
actors_df = actors_df.drop(columns=["knownForTitles"])

In [None]:
# Save the non-director rows to their own Feather file.
actors_df.to_feather("data/actors.feather")

# REMOVED CODE

In [None]:
# title_basics = pd.read_csv(
#         "data/title_basics.tsv",
#         sep="\t",
#         usecols=[
#         "tconst",
#         "titleType",
#         "primaryTitle",
#         "originalTitle",
#         "startYear",
#         "runtimeMinutes",
#         "genres",
#         ],
#         dtype={
#                 "tconst": "string",
#                 "titleType": "string",
#                 "primaryTitle": "string",
#                 "originalTitle": "string",
#                 # "startYear": "Int16",
#                 # "runtimeMinutes": "Int16",
#                 "genres": "category",
#         },
#         # parse_dates=["startYear"],
#         # date_parser=lambda x: pd.to_datetime(x, format="%Y"), # See https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
#         )

In [None]:
# title_basics[title_basics["tconst"]=="tt12263402"]