In [1]:
# Libraries
import pandas as pd
import numpy as np
import re

In [2]:
# Load the cleaned Netflix catalogue; the first CSV column is used as the index
netflix = pd.read_csv(
    filepath_or_buffer="../data/netflix_cleandata.csv",
    index_col=0,
)

In [3]:
# Distinct raw "country" strings (each may list several countries), without missing values
countries = list(netflix["country"].dropna().astype(str).unique())

# First listed country of every raw string
only_first_country = []

for country in countries:
    # Split on commas rather than matching r"^(.+?),": that pattern needs a
    # comma, so it silently skipped every single-country value such as "India".
    names = [name.strip() for name in country.split(",") if name.strip()]
    if names:
        only_first_country.append(names[0])

# Removing duplicates and sorting the countries
countries_clean = sorted(set(only_first_country))

In [9]:
def country_hot_encoded(df, country):
    """
    Count, for each country, how many titles fall under each genre.

    Input:
        df: DataFrame with a "country" column and a "listed_in" column; the
            "listed_in" values are split into genres on ", ".
        country: iterable of country names to look up.

    Output:
        DataFrame of integer counts whose rows are labelled by country and
        whose columns are the genres found. A genre not found for a country
        is 0. An empty DataFrame is returned when `country` is empty.

    NOTE: a country is matched as a case-insensitive substring of the
    "country" text, so a name contained in a longer one (e.g. "Niger" in
    "Nigeria") also picks up the titles of the longer name.
    """
    per_country = []
    for c in country:
        # Titles whose country text mentions c; missing countries never match (na=False).
        matches = df[df["country"].str.contains(c, case=False, regex=False, na=False)]
        genre_counts = (
            matches[["listed_in"]]
            .unstack()                   # Series indexed by ("listed_in", row label)
            .str.get_dummies(sep=", ")   # one 0/1 column per genre
            .groupby(level=0)            # replaces sum(level=0), removed in pandas 2.0
            .sum()
            .rename(index={"listed_in": c})
        )
        per_country.append(genre_counts)

    if not per_country:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    return pd.concat(per_country).fillna(0).astype(int)

# Display the genre counts per country as the cell output
country_hot_encoded(df=netflix, country=countries_clean)

Unnamed: 0,Action & Adventure,Children & Family Movies,Classic Movies,Comedies,Crime TV Shows,Cult Movies,Documentaries,Docuseries,Dramas,Horror Movies,...,TV Action & Adventure,TV Mysteries,TV Sci-Fi & Fantasy,TV Thrillers,Stand-Up Comedy & Talk Shows,Anime Series,Classic & Cult TV,Korean TV Shows,Anime Features,TV Shows
Argentina,3,2,1,10,4,1,6,1,27,3,...,0,0,0,0,0,0,0,0,0,0
Australia,10,16,3,14,9,1,14,9,29,3,...,1,1,2,2,0,0,0,0,0,0
Austria,0,0,0,0,0,0,3,0,5,0,...,0,0,0,0,0,0,0,0,0,0
Belgium,9,4,0,13,8,1,5,0,33,4,...,1,1,1,0,0,0,0,0,0,0
Brazil,3,3,0,11,4,0,11,2,22,0,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United Arab Emirates,3,3,0,5,1,0,4,0,10,1,...,0,0,0,0,0,0,0,0,0,0
United Kingdom,59,24,11,67,43,5,106,83,122,22,...,6,3,3,3,1,0,6,0,0,0
United States,247,258,61,447,118,37,401,135,533,145,...,77,46,52,23,27,9,16,4,3,3
Uruguay,0,0,0,1,0,0,5,0,4,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Keep the per-country genre counts in a variable
country_encoded = country_hot_encoded(df=netflix, country=countries_clean)

# Optional CSV export of the table (left disabled)
# country_encoded.to_csv("../data/country_hotencoded.csv")

In [6]:
# Distinct raw "listed_in" strings (each may list several genres), without missing values
genres = list(netflix["listed_in"].dropna().astype(str).unique())

# First listed genre of every raw string
only_first_genre = []

for genre in genres:
    # Split on commas rather than matching r"^(.+?),": that pattern needs a
    # comma, so it silently skipped every single-genre value such as "Documentaries".
    labels = [label.strip() for label in genre.split(",") if label.strip()]
    if labels:
        only_first_genre.append(labels[0])

# Removing duplicates and sorting the genres
genre_clean = sorted(set(only_first_genre))

In [12]:
def genre_hot_encoded(df, genre):
    """
    Count, for each genre, how many titles are attributed to each country.

    Input:
        df: DataFrame with a "listed_in" column and a "country" column; the
            "country" values are split into countries on ", ".
        genre: iterable of genre names to look up.

    Output:
        DataFrame of integer counts whose rows are labelled by genre and
        whose columns are the countries found. A country not found for a
        genre is 0. An empty DataFrame is returned when `genre` is empty.

    NOTE: a genre is matched as a case-insensitive substring of the
    "listed_in" text, so a name contained in a longer one (e.g. "Dramas" in
    "TV Dramas") also picks up the titles of the longer name.
    """
    per_genre = []
    for g in genre:
        # Titles whose genre text mentions g; missing genres never match (na=False).
        matches = df[df["listed_in"].str.contains(g, case=False, regex=False, na=False)]
        country_counts = (
            matches[["country"]]
            .unstack()                   # Series indexed by ("country", row label)
            .str.get_dummies(sep=", ")   # one 0/1 column per country
            .groupby(level=0)            # replaces sum(level=0), removed in pandas 2.0
            .sum()
            .rename(index={"country": g})
        )
        per_genre.append(country_counts)

    if not per_genre:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    return pd.concat(per_genre).fillna(0).astype(int)

# Display the country counts per genre as the cell output
genre_hot_encoded(df=netflix, genre=genre_clean)

Unnamed: 0,Argentina,Australia,Belgium,Brazil,Bulgaria,Cambodia,Canada,Chile,China,Colombia,...,Iran,Kenya,Kuwait,Latvia,Mauritius,Montenegro,Slovakia,Somalia,Sudan,Syria
Action & Adventure,3,11,10,3,3,1,37,1,53,2,...,0,0,0,0,0,0,0,0,0,0
Anime Features,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Anime Series,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
British TV Shows,0,3,0,0,0,0,3,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Children & Family Movies,2,16,4,3,1,1,43,1,9,0,...,0,0,0,0,0,0,0,0,0,0
Classic & Cult TV,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Classic Movies,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Comedies,11,26,13,17,0,1,96,6,32,5,...,0,0,0,0,0,0,0,0,0,0
Crime TV Shows,4,9,8,4,0,0,15,1,4,16,...,0,0,0,0,0,0,0,0,0,0
Cult Movies,1,1,1,0,0,0,6,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Keep the per-genre country counts in a variable
genre_encoded = genre_hot_encoded(df=netflix, genre=genre_clean)

# Optional CSV export of the table (left disabled)
# genre_encoded.to_csv("../data/genre_hotencoded.csv")