# EXPLORATORY DATA ANALYSIS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl

In [None]:
netflix = pd.read_csv('/content/drive/MyDrive/Datasets/netflix_titles.csv')
disney = pd.read_csv('/content/drive/MyDrive/Datasets/disney_plus_titles.csv')
amazon = pd.read_csv('/content/drive/MyDrive/Datasets/amazon_prime_titles.csv')
hulu = pd.read_csv('/content/drive/MyDrive/Datasets/hulu_titles.csv')

In [None]:
netflix['Platform'] = netflix['Platform'] = "Netflix"
disney['Platform'] = disney['Platform'] = "Disney+"
amazon['Platform'] = amazon['Platform'] = "Amazon Prime"
hulu['Platform'] = hulu['Platform'] = "Hulu"

In [None]:
df = pd.concat([netflix,disney,amazon,hulu])
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,Platform
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Netflix
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Netflix
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Netflix
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Netflix
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,Netflix
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3068,s3069,TV Show,Star Trek: The Original Series,,,United States,,1966,TV-PG,3 Seasons,"Action, Adventure, Classics",The 23rd century adventures of Captain James T...,Hulu
3069,s3070,TV Show,Star Trek: Voyager,,,United States,,1997,TV-PG,7 Seasons,"Action, Adventure, Science Fiction",Catapulted into the distant sector of the gala...,Hulu
3070,s3071,TV Show,The Fades,,,United Kingdom,,2011,TV-14,1 Season,"Horror, International, Science Fiction",Seventeen-year-old Paul is haunted by apocalyp...,Hulu
3071,s3072,TV Show,The Twilight Zone,,,United States,,1959,TV-PG,5 Seasons,"Classics, Science Fiction, Thriller",Rod Serling's seminal anthology series focused...,Hulu


## Data Cleaning

In [None]:
# check numbers of null values
df.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,8260
cast,5321
country,11499
date_added,9554
release_year,0
rating,864
duration,482


In [None]:
# remove columns that are unnecessary/too many missing values to figure out with Imputing
df = df.drop(columns=['show_id','title','director','cast','country','date_added','description'])

In [None]:
df

Unnamed: 0,type,release_year,rating,duration,listed_in,Platform
0,Movie,2020,PG-13,90 min,Documentaries,Netflix
1,TV Show,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries",Netflix
2,TV Show,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",Netflix
3,TV Show,2021,TV-MA,1 Season,"Docuseries, Reality TV",Netflix
4,TV Show,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",Netflix
...,...,...,...,...,...,...
3068,TV Show,1966,TV-PG,3 Seasons,"Action, Adventure, Classics",Hulu
3069,TV Show,1997,TV-PG,7 Seasons,"Action, Adventure, Science Fiction",Hulu
3070,TV Show,2011,TV-14,1 Season,"Horror, International, Science Fiction",Hulu
3071,TV Show,1959,TV-PG,5 Seasons,"Classics, Science Fiction, Thriller",Hulu


In [None]:
df.isna().sum()

Unnamed: 0,0
type,0
release_year,0
rating,864
duration,482
listed_in,0
Platform,0


In [None]:
cleaned_df = df.dropna()

In [None]:
cleaned_df.isna().sum()

Unnamed: 0,0
type,0
release_year,0
rating,0
duration,0
listed_in,0
Platform,0


In [None]:
cleaned_df

Unnamed: 0,type,release_year,rating,duration,listed_in,Platform
0,Movie,2020,PG-13,90 min,Documentaries,Netflix
1,TV Show,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries",Netflix
2,TV Show,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",Netflix
3,TV Show,2021,TV-MA,1 Season,"Docuseries, Reality TV",Netflix
4,TV Show,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",Netflix
...,...,...,...,...,...,...
3068,TV Show,1966,TV-PG,3 Seasons,"Action, Adventure, Classics",Hulu
3069,TV Show,1997,TV-PG,7 Seasons,"Action, Adventure, Science Fiction",Hulu
3070,TV Show,2011,TV-14,1 Season,"Horror, International, Science Fiction",Hulu
3071,TV Show,1959,TV-PG,5 Seasons,"Classics, Science Fiction, Thriller",Hulu


In [None]:
# check number of duplicates and total records
num_of_dupes = cleaned_df.duplicated().sum()
total_rows = cleaned_df.shape[0]

print(f"Number of duplicate rows: {num_of_dupes}")
print(f"Total number of rows: {total_rows}")

Number of duplicate rows: 2615
Total number of rows: 21758


In [None]:
# drop duplicates
cleaned_df = cleaned_df.drop_duplicates()

In [None]:
# check the changes
num_of_dupes = cleaned_df.duplicated().sum()
total_rows = cleaned_df.shape[0]

print(f"Number of duplicate rows: {num_of_dupes}")
print(f"Total number of rows: {total_rows}")

Number of duplicate rows: 0
Total number of rows: 19143


In [None]:
cleaned_df['listed_in'].unique()

array(['Documentaries', 'International TV Shows, TV Dramas, TV Mysteries',
       'Crime TV Shows, International TV Shows, TV Action & Adventure',
       ..., 'Comedy, Family, History', 'Comedy, Horror, Mystery',
       'Classics, Science Fiction, Thriller'], dtype=object)

In [None]:
# list all of the genre available
grouped_genre_list = cleaned_df['listed_in'].dropna().str.split(', ')

cleaned_df['listed_in'] = cleaned_df['listed_in'].dropna().str.split(', ')

genres = set(genre for sublist in grouped_genre_list for genre in sublist)
genres

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['listed_in'] = cleaned_df['listed_in'].dropna().str.split(', ')


{'Action',
 'Action & Adventure',
 'Action-Adventure',
 'Adult Animation',
 'Adventure',
 'Animals & Nature',
 'Animation',
 'Anime',
 'Anime Features',
 'Anime Series',
 'Anthology',
 'Arthouse',
 'Arts',
 'Biographical',
 'Black Stories',
 'British TV Shows',
 'Buddy',
 'Cartoons',
 'Children & Family Movies',
 'Classic & Cult TV',
 'Classic Movies',
 'Classics',
 'Comedies',
 'Comedy',
 'Coming of Age',
 'Concert Film',
 'Cooking & Food',
 'Crime',
 'Crime TV Shows',
 'Cult Movies',
 'Dance',
 'Disaster',
 'Documentaries',
 'Documentary',
 'Docuseries',
 'Drama',
 'Dramas',
 'Entertainment',
 'Faith & Spirituality',
 'Faith and Spirituality',
 'Family',
 'Fantasy',
 'Fitness',
 'Game Show / Competition',
 'Game Shows',
 'Health & Wellness',
 'Historical',
 'History',
 'Horror',
 'Horror Movies',
 'Independent Movies',
 'International',
 'International Movies',
 'International TV Shows',
 'Kids',
 "Kids' TV",
 'Korean TV Shows',
 'LGBTQ',
 'LGBTQ Movies',
 'LGBTQ+',
 'Late Night',
 '

## One hot encoding

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(cleaned_df['listed_in']),
                             columns=mlb.classes_,
                             index=cleaned_df.index)

cleaned_df = pd.concat([cleaned_df, genre_dummies], axis=1)
cleaned_df

Unnamed: 0,type,release_year,rating,duration,listed_in,Platform,Action,Action & Adventure,Action-Adventure,Adult Animation,...,Teen,Teen TV Shows,Thriller,Thrillers,Travel,Unscripted,Variety,Western,Young Adult Audience,and Culture
0,Movie,2020,PG-13,90 min,[Documentaries],Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TV Show,2021,TV-MA,2 Seasons,"[International TV Shows, TV Dramas, TV Mysteries]",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TV Show,2021,TV-MA,1 Season,"[Crime TV Shows, International TV Shows, TV Ac...",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TV Show,2021,TV-MA,1 Season,"[Docuseries, Reality TV]",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TV Show,2021,TV-MA,2 Seasons,"[International TV Shows, Romantic TV Shows, TV...",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3067,TV Show,1989,TV-PG,7 Seasons,"[Action, Adventure, Classics]",Hulu,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3068,TV Show,1966,TV-PG,3 Seasons,"[Action, Adventure, Classics]",Hulu,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3069,TV Show,1997,TV-PG,7 Seasons,"[Action, Adventure, Science Fiction]",Hulu,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3070,TV Show,2011,TV-14,1 Season,"[Horror, International, Science Fiction]",Hulu,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#list all columns to check if there are columns that should be removed.
columns = cleaned_df.columns.tolist()
for col in columns:
    print(col)

type
release_year
rating
duration
listed_in
Platform
Action
Action & Adventure
Action-Adventure
Adult Animation
Adventure
Animals & Nature
Animation
Anime
Anime Features
Anime Series
Anthology
Arthouse
Arts
Biographical
Black Stories
British TV Shows
Buddy
Cartoons
Children & Family Movies
Classic & Cult TV
Classic Movies
Classics
Comedies
Comedy
Coming of Age
Concert Film
Cooking & Food
Crime
Crime TV Shows
Cult Movies
Dance
Disaster
Documentaries
Documentary
Docuseries
Drama
Dramas
Entertainment
Faith & Spirituality
Faith and Spirituality
Family
Fantasy
Fitness
Game Show / Competition
Game Shows
Health & Wellness
Historical
History
Horror
Horror Movies
Independent Movies
International
International Movies
International TV Shows
Kids
Kids' TV
Korean TV Shows
LGBTQ
LGBTQ Movies
LGBTQ+
Late Night
Latino
Lifestyle
Lifestyle & Culture
Medical
Military and War
Movies
Music
Music & Musicals
Music Videos and Concerts
Musical
Mystery
News
Parody
Police/Cop
Reality
Reality TV
Romance
Romantic 

In [None]:
# these are the genres that are duplicates
cols_to_remove = [
    'Action & Adventure',
    'Action-Adventure',
    'Anime Features',
    'Anime Series',
    'Classic Movies',
    'Comedies',
    'Documentaries',
    'Dramas',
    'Faith & Spirituality',
    'Game Show / Competition',
    'Historical',
    'Horror Movies',
    'International',
    'LGBTQ Movies',
    'LGBTQ+',
    'Lifestyle & Culture',
    'Movies',
    'Music & Musicals',
    'Musical',
    'Romantic Movies',
    'Romantic TV Shows',
    'Sci-Fi & Fantasy',
    'Series',
    'Sketch Comedy',
    'Stand Up',
    'Stand-Up Comedy & Talk Shows',
    'TV Action & Adventure',
    'TV Comedies',
    'TV Dramas',
    'TV Horror',
    'TV Mysteries',
    'TV Sci-Fi & Fantasy',
    'TV Shows',
    'TV Thrillers',
    'Talk Show and Variety',
    'Teen TV Shows',
    'Thrillers'
]

cleaned_df.drop(columns=cols_to_remove, inplace=True, errors='ignore')


In [None]:
cleaned_df

Unnamed: 0,type,release_year,rating,duration,listed_in,Platform,Action,Adult Animation,Adventure,Animals & Nature,...,Suspense,Talk Show,Teen,Thriller,Travel,Unscripted,Variety,Western,Young Adult Audience,and Culture
0,Movie,2020,PG-13,90 min,[Documentaries],Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TV Show,2021,TV-MA,2 Seasons,"[International TV Shows, TV Dramas, TV Mysteries]",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TV Show,2021,TV-MA,1 Season,"[Crime TV Shows, International TV Shows, TV Ac...",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TV Show,2021,TV-MA,1 Season,"[Docuseries, Reality TV]",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TV Show,2021,TV-MA,2 Seasons,"[International TV Shows, Romantic TV Shows, TV...",Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3067,TV Show,1989,TV-PG,7 Seasons,"[Action, Adventure, Classics]",Hulu,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3068,TV Show,1966,TV-PG,3 Seasons,"[Action, Adventure, Classics]",Hulu,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3069,TV Show,1997,TV-PG,7 Seasons,"[Action, Adventure, Science Fiction]",Hulu,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3070,TV Show,2011,TV-14,1 Season,"[Horror, International, Science Fiction]",Hulu,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
cleaned_df.drop('listed_in', axis=1, inplace=True)

In [None]:
#create unique ID since there are no titles
cleaned_df ['Title_ID'] = range(1, len(cleaned_df) + 1)

# Move Title_ID to the front
cols = ['Title_ID'] + [col for col in cleaned_df.columns if col != 'Title_ID']
cleaned_df = cleaned_df[cols]

In [None]:
cleaned_df

Unnamed: 0,Title_ID,type,release_year,rating,duration,Platform,Action,Adult Animation,Adventure,Animals & Nature,...,Suspense,Talk Show,Teen,Thriller,Travel,Unscripted,Variety,Western,Young Adult Audience,and Culture
0,1,Movie,2020,PG-13,90 min,Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,TV Show,2021,TV-MA,2 Seasons,Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,TV Show,2021,TV-MA,1 Season,Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,TV Show,2021,TV-MA,1 Season,Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,TV Show,2021,TV-MA,2 Seasons,Netflix,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3067,19139,TV Show,1989,TV-PG,7 Seasons,Hulu,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3068,19140,TV Show,1966,TV-PG,3 Seasons,Hulu,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3069,19141,TV Show,1997,TV-PG,7 Seasons,Hulu,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3070,19142,TV Show,2011,TV-14,1 Season,Hulu,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
cleaned_df = cleaned_df.drop(columns=['and Culture'])

In [None]:
cleaned_df.to_csv("Streaming Platform Analysis.csv", index=False)

from google.colab import files
files.download("Streaming Platform Analysis.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
cleaned_df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR', '13+', 'ALL', '18+',
       '16+', '7+', 'TV-NR', 'UNRATED', '16', 'AGES_16_', 'AGES_18_',
       'ALL_AGES', 'NOT_RATE', 'NOT RATED'], dtype=object)