In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Ask NumPy for its 1.25-era print formatting. NOTE(review): per the
# numpy.set_printoptions docs this mainly changes how scalars are shown on
# NumPy 2+ (plain `3.0` rather than `np.float64(3.0)`) - confirm the
# installed NumPy version accepts this legacy value.
np.set_printoptions(legacy='1.25')

# Comparing Movies Across Streaming Services
* Is there a correlation between the genre of a movie and the country in which it is made?

# Cleaning Data

In [2]:
# Read each service's title catalogue from its CSV file in the working directory.
netflix, amazon, disney = (
    pd.read_csv(csv_path)
    for csv_path in (
        "netflix_titles.csv",
        "amazon_prime_titles.csv",
        "disney_plus_titles.csv",
    )
)

In [3]:
def filter_data(platform):
    """Reduce a catalogue to movies that have both a country and a genre.

    Parameters
    ----------
    platform : pandas.DataFrame
        Catalogue with at least the columns "type", "title", "country"
        and "listed_in".

    Returns
    -------
    pandas.DataFrame
        Rows whose "type" equals "Movie" and whose "country" and
        "listed_in" are both present, restricted to the columns
        "title", "country" and "listed_in". The original row labels
        are preserved.
    """
    wanted_columns = ["title", "country", "listed_in"]
    # Keep movies only; every other type (e.g. TV shows) is discarded.
    is_movie = platform["type"].eq("Movie")
    movies = platform.loc[is_movie]
    # A row is only usable when both the country and the genre are known.
    complete = movies.dropna(subset=["country", "listed_in"])
    return complete[wanted_columns]

# Apply the same movie/column filter to every service's catalogue.
netflix, amazon, disney = map(filter_data, (netflix, amazon, disney))

In [4]:
def split_commas(platform):
    """Expand comma-separated genres and countries into one row per value.

    Each row whose "listed_in" or "country" cell holds several
    comma-separated values is repeated once per (genre, country)
    combination. The input frame is left untouched; a new frame is returned.

    Separators are handled leniently: whitespace around each value is
    trimmed, and empty values produced by a leading, trailing or doubled
    comma are dropped instead of becoming a blank genre/country.

    Parameters
    ----------
    platform : pandas.DataFrame
        Frame with string columns "listed_in" and "country".

    Returns
    -------
    pandas.DataFrame
        Exploded copy with the same columns. Row labels of the input are
        repeated for every row generated from the same original row.
    """
    result = platform
    for column in ("listed_in", "country"):
        # assign() builds a new frame, so the caller's DataFrame is never
        # written to (the original in-place column assignment mutated the
        # argument and could trigger SettingWithCopyWarning on filtered input).
        result = result.assign(**{column: result[column].str.split(",")})
        result = result.explode(column)
        # Trim padding around each value, then discard empty tokens that
        # stray commas leave behind. Missing values (NaN) are kept as-is.
        result = result.assign(**{column: result[column].str.strip()})
        result = result[result[column] != ""]
    return result

# Explode the multi-valued genre/country cells of every service's catalogue.
netflix, amazon, disney = map(split_commas, (netflix, amazon, disney))

In [5]:
# Remove the Netflix rows whose genre label is "International Movies" or "Movies".
netflix = netflix[~netflix["listed_in"].isin(["International Movies", "Movies"])]
netflix

Unnamed: 0,title,country,listed_in
0,Dick Johnson Is Dead,United States,Documentaries
7,Sankofa,United States,Dramas
7,Sankofa,Ghana,Dramas
7,Sankofa,Burkina Faso,Dramas
7,Sankofa,United Kingdom,Dramas
...,...,...,...
8804,Zombieland,United States,Horror Movies
8805,Zoom,United States,Children & Family Movies
8805,Zoom,United States,Comedies
8806,Zubaan,India,Dramas


In [6]:
def set_indexes(platform):
    """Index a catalogue by (country, genre) and order it by those keys.

    Parameters
    ----------
    platform : pandas.DataFrame
        Frame containing the columns "country" and "listed_in".

    Returns
    -------
    pandas.DataFrame
        New frame whose index levels are "country" and "listed_in",
        sorted by country and then by genre. The input is not modified.
    """
    keys = ["country", "listed_in"]
    # Move both key columns into the index first ...
    indexed = platform.set_index(keys)
    # ... then sort by those index levels, referring to them by name.
    return indexed.sort_values(by=keys)

# Re-index every service's catalogue by (country, genre).
netflix, amazon, disney = map(set_indexes, (netflix, amazon, disney))