In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(legacy='1.25')

# Comparing Movies in different streaming services
## Is there a correlation between the genre of a movie and the country in which they are made?

In [2]:
netflix = pd.read_csv("netflix_titles.csv")
amazon = pd.read_csv("amazon_prime_titles.csv")
disney = pd.read_csv("disney_plus_titles.csv")

# Cleaning/Filtering Data
* The filter_data function removes all uneccessary data and cleans out any rows with missing values in the country or listed_in columns

In [3]:
def filter_data(platform):
    # Removing any TV show type
    platform = platform[platform["type"] == "Movie"]
    # Drop rows with missing values in country or listed_in
    platform = platform.dropna(subset=["country", "listed_in"])
    # A feature-length movie must be at least 40 minutes to be considered a movie
    platform["duration"] = platform["duration"].str.replace(" min", "")
    platform["duration"] = pd.to_numeric(platform["duration"])
    platform = platform[platform["duration"] >= 40]
    # Focusing on Necessary Columns
    platform = platform[["country", "listed_in", "title"]]
    return platform

netflix = filter_data(netflix)
amazon = filter_data(amazon)
disney = filter_data(disney)

* Split_commas function splits any rows with multiple countries and genres
* It then makes duplicates for each country and genre pair. Example:
  * Spider-Man: Into the Spider-Verse, United States, Action & Adventure
  * Spider-Man: Into the Spider-Verse, United States, Comedies

In [4]:
def split_commas(platform):
    platform["listed_in"] = platform["listed_in"].str.split(", ")
    platform = platform.explode("listed_in")
    platform["country"] = platform["country"].str.split(", ")
    platform = platform.explode("country")
    # Fixes Netflix data set-specific problem where some data has a comma but no values after the comma
    platform["listed_in"] = platform["listed_in"].str.replace(",", "")
    platform["country"] = platform["country"].str.replace(",", "")

    platform["listed_in"] = platform["listed_in"].str.strip()
    platform["country"] = platform["country"].str.strip()
    return platform

netflix = split_commas(netflix)
amazon = split_commas(amazon)
disney = split_commas(disney)

# Preparing Data for Merging
* Removing categories that do not represent a genre
* Additional filtering and cleaning for platform-specific issues

In [5]:
def remove_category(platform, categories_to_remove):
    # Keep only the rows that are NOT in the categories_to_remove list
    return platform[~platform["listed_in"].isin(categories_to_remove)]

netflix = remove_category(netflix, ["International Movies", "Movies", "Independent Movies", 
                                    "Anime Features", "Cult Movies", "LGBTQ Movies", "Classic Movies"])

amazon = remove_category(amazon, ["International", "Special Interest", "Arthouse", "Animation", 
                                  "LGBTQ", "Young Adult Audience", "Music Videos and Concerts", 
                                  "Military and War", "Entertainment", "and Culture"])

disney = remove_category(disney, ["Animation", "Anthology", "Movies", "Buddy", "Reality", "Parody", 
                                  "Romantic Comedy", "Disaster", "Variety", "Superhero", 
                                  "Survival", "Spy/Espionage", "Concert Film", "Dance", "Biographical"])

* Removing categories that do not exist in all three platforms and pooling those entries into a category that does
* Fixing names of similar categories so that they match among all three platforms
  * Example: "Romantic Movies" -> "Romance"

In [6]:
swap_netflix = {
    "Comedies": "Comedy",
    "Stand-Up Comedy": "Comedy",
    "Dramas": "Drama",
    "Action & Adventure": "Action-Adventure",
    "Horror Movies": "Horror",
    "Documentaries": "Documentary",
    "Thrillers": "Thriller",
    "Children & Family Movies": "Children & Family",
    "Sports Movies": "Sports",
    "Romantic Movies": "Romance"
}

netflix["listed_in"] = netflix["listed_in"].replace(swap_netflix)

swap_amazon = {
    "Arts": "Arts, Entertainment, and Culture",
    "Action": "Action-Adventure",
    "Adventure": "Action-Adventure",
    "Suspense": "Thriller",
    "Kids": "Children & Family",
    "Science Fiction": "Sci-Fi & Fantasy",
    "Fantasy": "Sci-Fi & Fantasy",
    "Faith and Spirituality": "Faith & Spirituality"
}

amazon["listed_in"] = amazon["listed_in"].replace(swap_amazon)

swap_disney = {
    "Music": "Music & Musicals",
    "Musical": "Music & Musicals",
    "Family": "Children & Family",
    "Kids": "Children & Family",
    "Crime": "Thriller",
    "Mystery": "Thriller",
    "Animals & Nature": "Documentary",
    "Science Fiction": "Sci-Fi & Fantasy",
    "Fantasy": "Sci-Fi & Fantasy"
}

disney["listed_in"] = disney["listed_in"].replace(swap_disney)

In [7]:
def set_indexes(platform):
    platform = platform.set_index("country").sort_values(by="country")
    platform["listed_in"] = platform["listed_in"].astype("category")
    return platform

netflix = set_indexes(netflix)
amazon = set_indexes(amazon)
disney = set_indexes(disney)

In [8]:
merged_dataframe = pd.concat([netflix, amazon, disney])
merged_dataframe

Unnamed: 0_level_0,listed_in,title
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,Documentary,The Land of the Enlightened
Albania,Drama,Forgive Us Our Debts
Algeria,Drama,Alexandria ... Why?
Algeria,Drama,Eyes of a Thief
Algeria,Drama,DNA
...,...,...
United States,Children & Family,Almost Angels
United States,Drama,Almost Angels
United States,Coming of Age,Almost Angels
United States,Drama,Alley Cats Strike!


In [9]:
merged_dataframe = merged_dataframe.drop_duplicates(subset=["listed_in", "title"]).sort_values(by=["country", "listed_in"])

In [10]:
merged_dataframe = merged_dataframe.groupby(["country", "listed_in"]).size()
merged_dataframe = merged_dataframe.reset_index()
merged_dataframe = merged_dataframe.set_axis(["Country", "Genre", "Count"], axis=1)
merged_dataframe.set_index(keys="Country")

Unnamed: 0_level_0,Genre,Count
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,Documentary,1
Afghanistan,Drama,1
Albania,Drama,2
Algeria,Drama,3
Angola,Action-Adventure,2
...,...,...
Vietnam,Romance,1
Vietnam,Thriller,1
West Germany,Documentary,1
Zimbabwe,Comedy,1


In [11]:
merged_dataframe

Unnamed: 0,Country,Genre,Count
0,Afghanistan,Documentary,1
1,Afghanistan,Drama,1
2,Albania,Drama,2
3,Algeria,Drama,3
4,Angola,Action-Adventure,2
...,...,...,...
563,Vietnam,Romance,1
564,Vietnam,Thriller,1
565,West Germany,Documentary,1
566,Zimbabwe,Comedy,1
