## Downloading Data

In [191]:
import pandas as pd

In [192]:
# Just uploading and merging 4 datasets from keggle:
# https://www.kaggle.com/datasets/shivamb/netflix-shows
# These are just listings of movies and tv shows from 4 different platforms

netflix = pd.read_csv("netflix_titles.csv")
hulu = pd.read_csv("hulu_titles.csv")
disney = pd.read_csv("disney_plus_titles.csv")
amazon = pd.read_csv("amazon_prime_titles.csv")

netflix['platform'] = 'netflix'
hulu['platform'] = 'hulu'
disney['platform'] = 'disney'
amazon['platform'] = 'amazon'

platform = [netflix, hulu, disney, amazon]
streaming = pd.concat(platform)
    
streaming.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,platform
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",netflix
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",netflix
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,netflix


In [193]:
len(netflix), len(hulu), len(disney), len(amazon), len(streaming),

(8807, 3073, 1450, 9668, 22998)

In [194]:
streaming = streaming.dropna(subset='description')

In [195]:
streaming.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22994 entries, 0 to 9667
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       22994 non-null  object
 1   type          22994 non-null  object
 2   title         22994 non-null  object
 3   director      14739 non-null  object
 4   cast          17677 non-null  object
 5   country       11499 non-null  object
 6   date_added    13440 non-null  object
 7   release_year  22994 non-null  int64 
 8   rating        22133 non-null  object
 9   duration      22515 non-null  object
 10  listed_in     22994 non-null  object
 11  description   22994 non-null  object
 12  platform      22994 non-null  object
dtypes: int64(1), object(12)
memory usage: 2.5+ MB


## Data Processing - Description (Stemming, Removing Stopwords)

In [196]:
import re
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [197]:
# Punctuation and irrelevant words was excluded (aka stemming) + everything was lowercased, and then put in a separate column

tokens = []
porter = PorterStemmer()
filter = set(stopwords.words('english'))

for i in range(len(streaming)):
    token = streaming['description'].iloc[i]
    if type(token) != str:
        token = token.to_string()
        token = re.sub(f'...\n{i}','',token)
        token = re.sub('\u200b','',token)
        token = re.sub(f'{i} ','',token)
    token = re.sub(r'[^\w\s]','',token)
    token = token.split()
    token = [porter.stem(word) for word in token if not word in filter]
    token = ' '.join(token)        
    tokens.append(token)

streaming['tock_disc'] = tokens

streaming['tock_disc'].head(3)

0    as father near end life filmmak kirsten johnso...
1    after cross path parti cape town teen set prov...
2    to protect famili power drug lord skill thief ...
Name: tock_disc, dtype: object

## Data Processing - Genre (Breakdown Subgenres)

In [198]:
# I grouped everything by genre and esssentially counted all keywords within a genre
# NOTE: Drama is the largest set

streaming_sum = streaming.groupby('listed_in').agg({'tock_disc': ['sum'], 'listed_in': ['count']})
streaming_sum["listed_in"]["count"].sort_values(ascending=False).head(10)

listed_in
Drama                           1061
Comedy                           608
Documentaries                    573
Comedy, Drama                    421
Drama, Suspense                  399
Documentary                      375
Animation, Kids                  373
Kids                             367
Dramas, International Movies     362
Stand-Up Comedy                  334
Name: count, dtype: int64

In [199]:
# Punctionation and irrelevant words was excluded (aka stemming) + everything was lowercased, and then put in a separate column

def quick_func(breakdown):
    breakdown = re.split(' |, | and |-',breakdown)
    to_remove = ['&', '/', 'and', 'of', "Fi", "Up", 'Coming', 'Special', 'Show', 'Shows', 'Late', 'Age', 'Language', "Interest", "Feature"]
    breakdown = [i for i in breakdown if i not in to_remove]
    breakdown = [porter.stem(x) for x in breakdown]
    if "kids'" in breakdown:
        breakdown.remove("kids'")
        breakdown.append("kid")
    if "children" in breakdown:
        breakdown.remove("children")
        breakdown.append("kid")   
    if "lgbtq+" in breakdown:
        breakdown.remove("lgbtq+")
        breakdown.append("lgbtq")
    if "romanc" in breakdown:
        breakdown.remove("romanc")
        breakdown.append("romant")
    if "film" in breakdown:
        breakdown.remove("film")
        breakdown.append("movi")
    if "histor" in breakdown:
        breakdown.remove("histor")
        breakdown.append("histori")
    breakdown = set(breakdown)
    breakdown = list(breakdown)
    return breakdown

streaming['listed_in_breakdown'] = streaming['listed_in'].map(lambda x: quick_func(x))

In [200]:
# I counted a number of instances after breaking down the subgenres
# NOTE: Drama is the most popular subgenre

flat_list = streaming['listed_in_breakdown'].to_list()
flat_list = [item for sublist in flat_list for item in sublist]
flat_list = pd.DataFrame(flat_list)
flat_list.value_counts().head(5)

drama     7918
comedi    5912
intern    4674
movi      4512
action    3691
dtype: int64

In [201]:
# Created a column with values 1 and 0 for whether or not there is drama in the list

streaming['drama'] = streaming.apply(lambda x: 1 if 'drama' in x['listed_in_breakdown'] else 0, axis=1)
streaming.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,platform,tock_disc,listed_in_breakdown,drama
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",netflix,as father near end life filmmak kirsten johnso...,[documentari],0
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",netflix,after cross path parti cape town teen set prov...,"[intern, mysteri, drama, tv]",1
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,netflix,to protect famili power drug lord skill thief ...,"[action, intern, adventur, tv, crime]",0


## Building Model

In [202]:
# drama_result is the target, flat_list is the processed description

flat_list = streaming['tock_disc'].to_list()
drama_result = streaming.iloc[:,15].values 

# just to demonstrate
drama_result[0:3], flat_list[0:3]

(array([0, 1, 0], dtype=int64),
 ['as father near end life filmmak kirsten johnson stage death invent comic way help face inevit',
  'after cross path parti cape town teen set prove whether privateschool swim star sister abduct birth',
  'to protect famili power drug lord skill thief mehdi expert team robber pull violent deadli turf war'])

In [203]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import time

start = time.time()

cv = CountVectorizer()
X = cv.fit_transform(flat_list).toarray()
y = drama_result

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# RandomForestClassifier - Fitting classifier to the Training set
classifier=RandomForestClassifier(n_estimators = 100,criterion="entropy",random_state =0)
classifier.fit(X_train,y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

end = time.time()

In [210]:
round((end - start) / 60), "minutes"

(6, 'minutes')

In [205]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Making the Confusion Matrix
cm2 = confusion_matrix(y_test, y_pred)
ps = precision_score(y_test, y_pred)
rs = recall_score(y_test, y_pred)

tn, fp, fn, tp = cm2.ravel()
(tn, fn, tp, fp), ps, rs

((2806, 1133, 476, 184), 0.7212121212121212, 0.2958359229334991)

## Drafts and Side Notes