In [12]:
# Libraries
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt

## Data Cleaning

In [13]:
# Importing Netflix DF
netflix = pd.read_csv("../data/raw-data/Netflix_Titles.csv", )
netflix = netflix.drop(columns=["show_id", "description"])
netflix["date_added"] = pd.to_datetime(netflix["date_added"])
netflix["date_added"] = netflix["date_added"].dt.strftime("%Y-%m")

# Importing IMDB DF
IMDB = pd.read_csv("../data/raw-data/IMDb_results_nov-22-2020.csv", index_col=0)

# Joinig both tables
netflix = netflix.join(IMDB)

# Cleaning the DF mantaining the desired columns
netflix = netflix.drop(columns=["IMDB_titleID", "IMDB_title_name"])
netflix = netflix.rename(columns={"IMDB_rating":"IMDb_rating"})

In [14]:
# Importing DF that indicates if a TV Show / Film appears in different streaming platforms

# Streaming Movies DF
streaming_platforms_movies = pd.read_csv("../data/raw-data/StreamingPlatforms_Movies.csv", index_col=0)
streaming_platforms_movies = streaming_platforms_movies.drop(columns=["Language", "Runtime", "Directors", "Genres", "Type", "IMDb", "Rotten Tomatoes", "ID", "Country", "Age"])

# Series Movies DF
streaming_platforms_series = pd.read_csv("../data/raw-data/StreamingPlatforms_TV_Shows.csv", index_col=0)
streaming_platforms_series = streaming_platforms_series.drop(columns=["IMDb", "Rotten Tomatoes", "type", "Age"])

# Merging both DF
streamings_complete = pd.concat([streaming_platforms_movies, streaming_platforms_series])
streamings_complete = streamings_complete.reset_index(drop=True)
streamings_complete = streamings_complete.rename(columns={"Title":"title", "Year":"year", "Age":"age"})

# Merge previous DF with the main (netflix)
netflix = netflix.merge(streamings_complete, on="title", how="left")

In [15]:
# Filling the NA values for being able to append from the other row if value is missing
netflix["year"] = netflix["year"].fillna(0).astype(int)

def correct_release_year(row):
    
    """
    Input: Row of the DF
    Output: If the first condition is met, the value will be appended; if it's missing will append the other row value
    
    """
    if row["year"] > 1:
        return row["year"]
    if row["year"] == 0:
        return row["release_year"]

In [16]:
# Creating new column with the function created above
netflix["correct_release_year"] = netflix.apply(correct_release_year, axis=1)

# Drop old year columns to avoid confusion
netflix = netflix.drop(columns=["release_year", "year"])

In [17]:
# Converting Floats from the Streaming Platforms to Int
# There are NaN values, that's why I use '.astype' from Pandas
netflix[["Netflix", "Hulu", "Prime Video", "Disney+"]] = netflix[["Netflix", "Hulu", "Prime Video", "Disney+"]].astype("Int64")

In [18]:
# First isolation of the rating (stars) of the column IMDb_rating
rating = []

for rate in netflix["IMDb_rating"]:
    try:
        rating.append(float(re.findall(r"\d{1}\.\d{1}", rate)[0]))
    except:
        rating.append(np.NAN)
        
netflix["IMDb_rate"] = rating

# Second isolation of the rating (votes) of the column IMDb_rating
votes = []

for rate in netflix["IMDb_rating"]:
    try:
        votes.append(re.findall(r"\d+", rate))
    except:
        votes.append(np.nan)
        

correct_votes = []
for v in votes:
    try:
        correct_votes.append(int(v[2]))
    except:
        correct_votes.append(np.nan)
        
netflix["IMDb_votes"] = correct_votes

# Convert the float of ["IMDb_votes"] to int with '.astype' from Pandas
netflix["IMDb_votes"] = netflix["IMDb_votes"].astype("Int64")

# Drop initial column to avoid confusion
netflix = netflix.drop(columns="IMDb_rating")

In [19]:
netflix.head(20)

Unnamed: 0,type,title,director,cast,country,date_added,rating,duration,listed_in,Netflix,Hulu,Prime Video,Disney+,correct_release_year,IMDb_rate,IMDb_votes
0,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09,TV-PG,90 min,"Children & Family Movies, Comedies",1.0,0.0,0.0,0.0,2019,3.2,294.0
1,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,2016-09,TV-MA,94 min,Stand-Up Comedy,,,,,2016,5.0,21.0
2,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,2018-09,TV-Y7-FV,1 Season,Kids' TV,,,,,2013,7.9,5.0
3,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,2018-09,TV-Y7,1 Season,Kids' TV,,,,,2016,6.0,830.0
4,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,2017-09,TV-14,99 min,Comedies,1.0,0.0,0.0,0.0,2017,5.2,5.0
5,TV Show,Apaches,,"Alberto Ammann, Eloy Azorín, Verónica Echegui,...",Spain,2017-09,TV-MA,1 Season,"Crime TV Shows, International TV Shows, Spanis...",1.0,0.0,0.0,0.0,2015,7.0,221.0
6,Movie,Automata,Gabe Ibáñez,"Antonio Banderas, Dylan McDermott, Melanie Gri...","Bulgaria, United States, Spain, Canada",2017-09,R,110 min,"International Movies, Sci-Fi & Fantasy, Thrillers",1.0,0.0,0.0,0.0,2014,6.1,53.0
7,Movie,Fabrizio Copano: Solo pienso en mi,"Rodrigo Toro, Francisco Schultz",Fabrizio Copano,Chile,2017-09,TV-MA,60 min,Stand-Up Comedy,1.0,0.0,0.0,0.0,2017,4.7,27.0
8,TV Show,Fire Chasers,,,United States,2017-09,TV-MA,1 Season,"Docuseries, Science & Nature TV",1.0,0.0,0.0,0.0,2017,6.6,317.0
9,Movie,Good People,Henrik Ruben Genz,"James Franco, Kate Hudson, Tom Wilkinson, Omar...","United States, United Kingdom, Denmark, Sweden",2017-09,R,90 min,"Action & Adventure, Thrillers",1.0,0.0,0.0,0.0,2014,5.5,15.0


### Data Storage

In [11]:
# Export the file to CSV for better manipulation
netflix.to_csv("../data/netflix_cleandata.csv")