In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(legacy='1.25')

# Comparing Movies in different streaming services
## Is there a correlation between the genre of a movie and the country in which they are made?

In [2]:
netflix = pd.read_csv("netflix_titles.csv")
amazon = pd.read_csv("amazon_prime_titles.csv")
disney = pd.read_csv("disney_plus_titles.csv")

# Cleaning/Filtering Data

* The filter_data function removes all uneccessary data and cleans out any rows with missing values in the country or listed_in columns

In [3]:
def filter_data(platform):
    # Removing any TV show type
    platform = platform[platform["type"] == "Movie"]
    # Drop rows with missing values in country or listed_in
    platform = platform.dropna(subset=["country", "listed_in"])
    # Focusing on Necessary Columns
    platform = platform[["title", "country", "listed_in"]]
    return platform

netflix = filter_data(netflix)
amazon = filter_data(amazon)
disney = filter_data(disney)

* Split_commas function splits any rows with multiple countries and genres
* It then makes duplicates for each country and genre pair. Example:
  * Spider-Man: Into the Spider-Verse, United States, Action & Adventure
  * Spider-Man: Into the Spider-Verse, United States, Comedies

In [4]:
def split_commas(platform):
    platform["listed_in"] = platform["listed_in"].str.split(", ")
    platform = platform.explode("listed_in")
    platform["country"] = platform["country"].str.split(", ")
    platform = platform.explode("country")
    # Fixes Netflix data set-specific problem where some data has a comma but no values after the comma
    platform["listed_in"] = platform["listed_in"].str.replace(",", "")
    platform["country"] = platform["country"].str.replace(",", "")

    platform["listed_in"] = platform["listed_in"].str.strip()
    platform["country"] = platform["country"].str.strip()
    return platform

netflix = split_commas(netflix)
amazon = split_commas(amazon)
disney = split_commas(disney)

* Removing categories that do not represent a genre
* Additional filtering and cleaning for platform-specific issues

In [5]:
def remove_category(platform, categories_to_remove):
    # Keep only the rows that are NOT in the categories_to_remove list
    return platform[~platform["listed_in"].isin(categories_to_remove)]

netflix = remove_category(netflix, ["International Movies", "Movies", "Independent Movies", 
                                    "Anime Features", "Cult Movies", "LGBTQ Movies", "Classic Movies"])

amazon = remove_category(amazon, ["International", "Special Interest", "Arthouse", 
                                  "Animation", "LGBTQ", "Young Adult Audience", "Music Videos and Concerts"])
swap = {
    "Arts": "Arts, Entertainment, and Culture",
    "Entertainment": "Arts, Entertainment, and Culture",
    "and Culture": "Arts, Entertainment, and Culture"
}

amazon["listed_in"] = amazon["listed_in"].replace(swap)

arts_movies = amazon[amazon['listed_in'] == 'Arts, Entertainment, and Culture']
correct_count = arts_movies['title'].nunique()

In [7]:
def set_indexes(platform):
    platform = platform.set_index(["country", "listed_in"]).sort_values(by=["country", "listed_in"])
    return platform

netflix = set_indexes(netflix)
amazon = set_indexes(amazon)
disney = set_indexes(disney)

In [8]:
amazon

Unnamed: 0_level_0,Unnamed: 1_level_0,title
country,listed_in,Unnamed: 2_level_1
Afghanistan,Drama,Kabullywood
Albania,Drama,Zana
Australia,Action,Jungle
Australia,Documentary,Hotel Coolgardie
Australia,Documentary,"Cryptopia: Bitcoin, Blockchains, and the Futur..."
...,...,...
United States,Suspense,Duplicity
United States,Suspense,Die Hard
United States,Suspense,Prometheus
United States,Western,Stagecoach - The Original John Wayne Classic


In [9]:
pd.options.display.min_rows = 40
amazon.query('listed_in == "Arts, Entertainment, and Culture"')

Unnamed: 0_level_0,Unnamed: 1_level_0,title
country,listed_in,Unnamed: 2_level_1
Canada,"Arts, Entertainment, and Culture",Suck
Canada,"Arts, Entertainment, and Culture",Suck
Canada,"Arts, Entertainment, and Culture",Suck
Canada,"Arts, Entertainment, and Culture",Capone
Canada,"Arts, Entertainment, and Culture",Capone
Canada,"Arts, Entertainment, and Culture",Capone
China,"Arts, Entertainment, and Culture",The Farewell (4K UHD)
China,"Arts, Entertainment, and Culture",The Farewell (4K UHD)
China,"Arts, Entertainment, and Culture",The Farewell (4K UHD)
China,"Arts, Entertainment, and Culture",The Farewell


In [10]:
disney.value_counts("listed_in")

listed_in
Family              627
Comedy              465
Action-Adventure    431
Animation           431
Fantasy             199
Coming of Age       190
Documentary         150
Drama               146
Animals & Nature    122
Science Fiction      91
Biographical         46
Kids                 45
Sports               42
Historical           40
Musical              37
Music                33
Buddy                27
Romance              22
Crime                21
Superhero            20
Mystery               8
Anthology             6
Parody                5
Concert Film          5
Western               5
Dance                 4
Variety               4
Thriller              4
Survival              3
Spy/Espionage         3
Disaster              2
Romantic Comedy       2
Movies                1
Reality               1
Name: count, dtype: int64