# Movies - Data Cleaning

In [86]:
import pandas as pd

In [87]:
path = "../../data/small"

In [88]:
links_urls = pd.read_csv(f"{path}/links_urls.csv")
init_movies = pd.read_csv(f"{path}/movies.csv")
ratings = pd.read_csv(f"{path}/ratings.csv")
tags = pd.read_csv(f"{path}/tags.csv")

## Movies
Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).


In [89]:
def init_pipeline(df):
    
    return df.copy()


In [90]:
def merge_tables(df):
    return df.merge(links_urls, how="left")


In [91]:
def create_columns(df):
    df["year"] = df["title"].str.extract("\((\d{4})\)", flags=0, expand=True)
    df["title"] = df["title"].str.replace("\(\d{4}\)","", regex=True)
    df["title"] = df["title"].str.replace("(.*), (The)", "\\2 \\1", regex=True)
    df = df.assign(movie_link = lambda x: "https://www.themoviedb.org/movie/" + str(x["tmdbId"]))

    return df

In [92]:
def missing_values(df):
    df[["year","tmdbId","image_url"]] = df[["year","tmdbId","image_url"]].fillna(0)
    return df


In [93]:
def adjust_dtypes(df):
    df[["year","tmdbId"]] = df[["year","tmdbId"]].astype(int)
    df = df.assign(genres = lambda x: x["genres"].str.split("|"))
    return df



In [94]:
def save_csv(df):
    df.to_csv("../../data/clean/movies.csv", index=False)
    return df


In [95]:
movies = (
    init_movies
        .pipe(init_pipeline)
        .pipe(merge_tables)
        .pipe(create_columns)
        .pipe(missing_values)
        .pipe(adjust_dtypes)
        .pipe(save_csv)

) 
movies.loc[movies.title.str.contains("Package")]
#movies.head()

Unnamed: 0,movieId,title,genres,tmdbId,image_url,year,movie_link
3405,4632,The Package,"[Action, Thriller]",31606,https://www.themoviedb.org/t/p/w300_and_h450_b...,1989,https://www.themoviedb.org/movie/0 86...


In [96]:
movies.head(5)

Unnamed: 0,movieId,title,genres,tmdbId,image_url,year,movie_link
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",862,https://www.themoviedb.org/t/p/w300_and_h450_b...,1995,https://www.themoviedb.org/movie/0 86...
1,2,Jumanji,"[Adventure, Children, Fantasy]",8844,https://www.themoviedb.org/t/p/w300_and_h450_b...,1995,https://www.themoviedb.org/movie/0 86...
2,3,Grumpier Old Men,"[Comedy, Romance]",15602,https://www.themoviedb.org/t/p/w300_and_h450_b...,1995,https://www.themoviedb.org/movie/0 86...
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",31357,https://www.themoviedb.org/t/p/w300_and_h450_b...,1995,https://www.themoviedb.org/movie/0 86...
4,5,Father of the Bride Part II,[Comedy],11862,https://www.themoviedb.org/t/p/w300_and_h450_b...,1995,https://www.themoviedb.org/movie/0 86...
