In [12]:
# Libraries
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt

## Data Cleaning

In [13]:
# Netflix catalogue: read the raw CSV, discard the id/description columns and
# derive two text columns from the parsed "date_added" timestamp
netflix = (
    pd.read_csv("../data/raw-data/Netflix_Titles.csv")
    .drop(columns=["show_id", "description"])
    .assign(date_added=lambda frame: pd.to_datetime(frame["date_added"]))
    .assign(
        # "year_added" is computed first, while "date_added" is still a datetime
        year_added=lambda frame: frame["date_added"].dt.strftime("%Y"),
        date_added=lambda frame: frame["date_added"].dt.strftime("%Y-%m"),
    )
)

# IMDb results: the first CSV column becomes the index used by the join below
IMDB = pd.read_csv("../data/raw-data/IMDb_results_nov-22-2020.csv", index_col=0)

# Join both tables on their index, keep only the desired IMDb column and
# normalise its name to "IMDb_rating"
netflix = (
    netflix.join(IMDB)
    .drop(columns=["IMDB_titleID", "IMDB_title_name"])
    .rename(columns={"IMDB_rating": "IMDb_rating"})
)

In [14]:
# Import the DataFrames that indicate on which streaming platforms a
# TV show / film is available, and attach that information to `netflix`.

# Streaming movies DF (columns not needed downstream are dropped)
streaming_platforms_movies = pd.read_csv(
    "../data/raw-data/StreamingPlatforms_Movies.csv", index_col=0
).drop(
    columns=[
        "Language", "Runtime", "Directors", "Genres", "Type",
        "IMDb", "Rotten Tomatoes", "ID", "Country", "Age",
    ]
)

# Streaming TV shows DF (columns not needed downstream are dropped)
streaming_platforms_series = pd.read_csv(
    "../data/raw-data/StreamingPlatforms_TV_Shows.csv", index_col=0
).drop(columns=["IMDb", "Rotten Tomatoes", "type", "Age"])

# Stack movies and TV shows into one frame with a fresh 0..n-1 index.
# "Age" is already dropped from both inputs above, so only "Title" and "Year"
# need renaming to match the lower-case column names used by `netflix`.
streamings_complete = pd.concat(
    [streaming_platforms_movies, streaming_platforms_series],
    ignore_index=True,
).rename(columns={"Title": "title", "Year": "year"})

# Merge the platform information into the main DF (netflix); the left join keeps
# every Netflix row, leaving NaN where the title has no match.
# NOTE(review): the join key is the title alone, so a title that occurs more than
# once in `streamings_complete` produces one output row per match - confirm that
# titles are unique there, or de-duplicate before merging.
netflix = netflix.merge(streamings_complete, on="title", how="left")

In [15]:
# Missing "year" values are replaced by 0 so that the column can be cast to int;
# the 0 then serves as the marker for "no year available" when the release year is chosen
netflix["year"] = netflix["year"].mask(netflix["year"].isna(), 0).astype(int)

def correct_release_year(row):

    """
    Choose the release year to keep for one row of the DF.

    Input: Row of the DF (anything indexable by "year" and "release_year")
    Output: row["year"] when it is greater than 1; in every other case
            (0, 1, a negative number or NaN) row["release_year"]

    The fallback is unconditional so that the function never returns None
    implicitly for a "year" value that is neither > 1 nor exactly 0.
    """
    if row["year"] > 1:
        return row["year"]
    # No usable "year" (comparisons with NaN are False too): use the other column
    return row["release_year"]

In [16]:
# Compute the reconciled year row by row with correct_release_year(), store it in
# a new "correct_release_year" column, then remove the two old year columns
# so that only one year column remains
netflix = netflix.assign(
    correct_release_year=netflix.apply(correct_release_year, axis=1)
).drop(columns=["release_year", "year"])

In [17]:
# Cast the streaming-platform columns from float to integer.
# They contain NaN values, hence pandas' nullable "Int64" dtype
# (capital I) instead of a plain int
netflix = netflix.astype(
    {platform: "Int64" for platform in ("Netflix", "Hulu", "Prime Video", "Disney+")}
)

In [18]:
# Split the raw "IMDb_rating" text into a numeric rating (stars) and a vote count.


def _extract_rating(text):
    """Return the first "<digit>.<digit>" found in `text` as a float, else NaN.

    NaN is returned when `text` is not a string (e.g. a missing value, for which
    re.findall raises TypeError) or when the pattern does not occur (IndexError).
    """
    try:
        # NOTE(review): the pattern allows a single digit before the dot, so a
        # value such as "10.0" would be read as 0.0 - confirm this cannot occur.
        return float(re.findall(r"\d{1}\.\d{1}", text)[0])
    except (TypeError, IndexError, ValueError):
        return np.nan


def _extract_votes(text):
    """Return the third run of digits in `text` as an int, else NaN.

    NaN is returned when `text` is not a string or has fewer than three runs
    of digits.
    """
    try:
        # Index 2: presumably the first two runs are the integer and decimal part
        # of the rating and the third is the vote count - verify against the raw
        # data (a thousands separator would split the count into several runs).
        return int(re.findall(r"\d+", text)[2])
    except (TypeError, IndexError, ValueError):
        return np.nan


# First isolation: the rating (stars) of the column IMDb_rating
netflix["IMDb_rate"] = [_extract_rating(text) for text in netflix["IMDb_rating"]]

# Second isolation: the number of votes of the column IMDb_rating
netflix["IMDb_votes"] = [_extract_votes(text) for text in netflix["IMDb_rating"]]

# The NaN entries make the new column float; convert it to pandas' nullable
# integer dtype so that the counts are stored as integers
netflix["IMDb_votes"] = netflix["IMDb_votes"].astype("Int64")

# Drop initial column to avoid confusion
netflix = netflix.drop(columns="IMDb_rating")

### Data Storage

In [20]:
# Export the cleaned DataFrame to CSV for easier manipulation (currently disabled; uncomment the line below to write the file)
#netflix.to_csv("../data/netflix_cleandata.csv")