In [35]:
import pandas as pd
import gc

In [36]:
# Load the IMDb title catalogue and reduce it to non-adult film-like titles
# that have a runtime, at least one genre and a start year.
#
# quoting=3 is csv.QUOTE_NONE: the IMDb TSV files are plain tab-separated text
# without field quoting, so a title that begins with a double quote must not
# open a quoted field (the parser would otherwise swallow the following tabs
# and newlines and merge several titles into one row).
# dtype=str keeps every column textual so the '0' and r'\N' comparisons below
# hold no matter which dtype pandas would otherwise infer for a column.
basics = pd.read_csv(
    '../../data/imdb_dataset/title.basics.tsv',
    sep='\t',
    quoting=3,
    dtype=str,
    low_memory=False,
    index_col='tconst',
)

film_types = ['movie', 'short', 'video', 'tvMovie']
# r'\N' is the literal placeholder the file uses for a missing value.
keep = (
    basics['titleType'].isin(film_types)
    & (basics['isAdult'] == '0')
    & (basics['runtimeMinutes'] != r'\N')
    & (basics['genres'] != r'\N')
    & (basics['startYear'] != r'\N')
)
movies = (
    basics.loc[keep]
    .drop(columns=['titleType', 'primaryTitle', 'isAdult', 'endYear'])
    .dropna()  # rows where pandas parsed any remaining field as NaN
)

# The full catalogue is large; release it before the next file is loaded.
del basics, keep
gc.collect()

1511

In [37]:
# Attach averageRating / numVotes to the surviving titles and keep only the
# titles with more than 99 votes.
ratings = pd.read_csv(
    '../../data/imdb_dataset/title.ratings.tsv',
    sep='\t',
    low_memory=False,
    index_col='tconst',
)
ratings = ratings[ratings.index.isin(movies.index)]
# Column-wise concat aligns on the tconst index; titles without a rating row
# receive NaN in the rating columns.
movies = pd.concat([movies, ratings], axis=1, sort=False)
# NaN > 99 evaluates to False, so unrated titles are removed by the same mask.
movies = movies[movies['numVotes'] > 99].dropna()
del ratings
gc.collect()

0

In [38]:
# Attach the directors / writers id lists and discard titles missing either.
crew = pd.read_csv(
    '../../data/imdb_dataset/title.crew.tsv',
    sep='\t',
    low_memory=False,
    index_col='tconst',
)
crew = crew[crew.index.isin(movies.index)]
# Column-wise concat aligns on the tconst index; titles absent from the crew
# file receive NaN and are removed by dropna() below.
movies = pd.concat([movies, crew], axis=1, sort=False)
# r'\N' is the literal placeholder the file uses for a missing value.
has_directors = movies['directors'] != r'\N'
has_writers = movies['writers'] != r'\N'
movies = movies[has_directors & has_writers].dropna()
del crew
gc.collect()

0

In [39]:
# Build one comma-separated string of actor/actress ids per title and attach
# it to `movies` as the 'nconst' column.
#
# Only the three columns used below are loaded. tconst is kept as a regular
# column (instead of the index) so duplicates can be dropped on
# (tconst, nconst) directly, without assigning a helper column to a filtered
# slice, which raises SettingWithCopyWarning.
cast = pd.read_csv(
    '../../data/imdb_dataset/title.principals.tsv',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
    low_memory=False,
)
is_known_title = cast['tconst'].isin(movies.index)
is_performer = cast['category'].isin(['actor', 'actress'])
cast = (
    cast.loc[is_known_title & is_performer]
    # the same person can be listed more than once for a single title
    .drop_duplicates(subset=['tconst', 'nconst'])
    .groupby('tconst')['nconst']
    .apply(', '.join)
)
del is_known_title, is_performer

# Column-wise concat aligns on the tconst index; titles without any
# actor/actress row receive NaN and are removed by dropna() below.
movies = pd.concat([movies, cast], axis='columns', sort=False)
# Safeguard: drop a title whose cast string is exactly the r'\N' placeholder.
movies = movies.loc[movies['nconst'] != r'\N']
movies = movies.dropna()
del cast
gc.collect()

0

In [40]:
# Build the nconst -> primaryName lookup used to translate person ids.
#
# usecols: only the two columns that feed the mapping are loaded.
# quoting=3 is csv.QUOTE_NONE: the IMDb TSV files carry no field quoting, so a
# name that begins with a double quote must not open a quoted field (which
# could merge several rows into one).
names = pd.read_csv(
    '../../data/imdb_dataset/name.basics.tsv',
    sep='\t',
    usecols=['nconst', 'primaryName'],
    quoting=3,
    low_memory=False,
)
# A name parsed as NaN is a float, not a str; leaving it in the mapping would
# make ', '.join(...) raise TypeError when that person is looked up. Dropping
# the row means the lookup's default (the id itself) is used instead.
names = names.dropna(subset=['primaryName'])
code_to_name = dict(zip(names['nconst'], names['primaryName']))

def replace_codes_with_names(cell, mapping=None):
    """Translate a comma-separated string of person codes into names.

    Parameters
    ----------
    cell : object
        Cell value holding codes separated by commas. Missing values
        (anything ``pd.isna`` reports as missing) are returned unchanged;
        any other value is converted with ``str`` before splitting.
    mapping : dict, optional
        Code -> name lookup. When omitted, the module-level ``code_to_name``
        dictionary is used.

    Returns
    -------
    object
        The names joined with ``', '``, in the original order. A code with no
        usable name in the lookup is kept as-is. Empty fragments between
        commas are skipped.
    """
    if pd.isna(cell):
        return cell

    lookup = code_to_name if mapping is None else mapping
    resolved = []
    for code in str(cell).split(','):
        code = code.strip()
        if not code:
            continue
        name = lookup.get(code, code)
        # A non-string entry (e.g. a NaN name) would make join() raise
        # TypeError; fall back to the code itself in that case.
        resolved.append(name if isinstance(name, str) else code)
    return ', '.join(resolved)

# Swap person ids for display names in each of the three people columns.
for people_column in ('directors', 'writers', 'nconst'):
    movies.loc[:, people_column] = movies[people_column].apply(replace_codes_with_names)

# The raw names table is no longer needed once the columns are translated.
del names
gc.collect()

0

In [41]:
# Write the assembled table to disk; the index is written as the first column.
movies.to_csv(path_or_buf='../../data/general/movies.csv', index=True)