# Import Library

In [2]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import save_npz
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
df = pd.read_csv('data/netflix_data.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


### Cek data apakah ada missing value

terdapat missing value pada kolom director, cast, country, date_added, rating, duration


In [5]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

mengisi missing value dengan string kosong agar mudah untuk pemrosesa

In [6]:
df.fillna('', inplace=True)

# EDA

Jumlah film yang dirilis per tahun


In [7]:
movie_counts = df['release_year'].value_counts().sort_index()
fig = go.Figure(data=go.Bar(x=movie_counts.index, y=movie_counts.values))
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  
    paper_bgcolor='rgb(17, 17, 17)',  
    font_color='white', 
    title='Number of Movies Released Each Year',  
    xaxis=dict(title='Year'),  
    yaxis=dict(title='Number of Movies')
)
fig.update_traces(marker_color='white')
fig.show()

berdasarkan plot diatas, produksi film mencapai pada titik puncak nya pada tahun 2018

Distribusi film berdasarkan negara


In [8]:
top_countries = df['country'].value_counts().head(10)

fig = px.treemap(names=top_countries.index, parents=["" for _ in top_countries.index], values=top_countries.values)

fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  
    paper_bgcolor='rgb(17, 17, 17)', 
    font_color='white',  
    title='Top Countries with Highest Number of Movies',
)
fig.show()

berdasarkan pada treemap diatas, US menempati urutan 1 dengan produksi film dan series terbanyak, kemudian disusul oleh india dan UK

# Data Processing

In [9]:
new_df = df[['title', 'type', 'director', 'cast', 'rating', 'listed_in', 'description']]
new_df.set_index('title', inplace=True)

In [10]:
new_df.head()

Unnamed: 0_level_0,type,director,cast,rating,listed_in,description
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dick Johnson Is Dead,Movie,Kirsten Johnson,,PG-13,Documentaries,"As her father nears the end of his life, filmm..."
Blood & Water,TV Show,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
Ganglands,TV Show,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
Jailbirds New Orleans,TV Show,,,TV-MA,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
Kota Factory,TV Show,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


membuat class untuk memproses teks

In [11]:
class TextCleaner:
    def separate_text(self, texts):
        unique_texts = set()
        for text in texts.split(','):
            unique_texts.add(text.strip().lower())
        return ' '.join(unique_texts)

    def remove_space(self, texts):
        return texts.replace(' ', '').lower()

    def remove_punc(self, texts):
        texts = texts.lower()
        texts = texts.translate(str.maketrans('', '', string.punctuation))
        return ' '.join(texts.split())

    def clean_text(self, texts):
        texts = self.separate_text(texts)
        texts = self.remove_space(texts)
        texts = self.remove_punc(texts)
        return texts

In [12]:
cleaner = TextCleaner()


melakukan peng aplikasian pada tiap kolom

In [13]:
new_df['type']        = new_df['type'].apply(cleaner.remove_space)
new_df['director']    = new_df['director'].apply(cleaner.separate_text)
new_df['cast']        = new_df['cast'].apply(cleaner.separate_text)
new_df['rating']      = new_df['rating'].apply(cleaner.remove_space)
new_df['listed_in']   = new_df['listed_in'].apply(cleaner.separate_text)
new_df['description'] = new_df['description'].apply(cleaner.remove_punc)

# Feature Extraction (TF-IDF)

Menggabungkan fitur teks menjadi Bag of Words (BoW)



In [14]:
new_df['BoW'] = new_df.apply(lambda row: ' '.join(row.dropna().values), axis=1)
new_df.drop(new_df.columns[:-1], axis=1, inplace=True)

In [15]:
new_df.head()  

Unnamed: 0_level_0,BoW
title,Unnamed: 1_level_1
Dick Johnson Is Dead,movie kirsten johnson pg-13 documentaries as ...
Blood & Water,tvshow khosi ngema cindy mahlangu thabang mol...
Ganglands,tvshow julien leclercq noureddine farihi nabih...
Jailbirds New Orleans,tvshow tv-ma docuseries reality tv feuds fli...
Kota Factory,tvshow urvi singh jitendra kumar mayur more a...


Menerapkan TF-IDF Vectorizer

In [16]:
tfid = TfidfVectorizer()
tfid_matrix = tfid.fit_transform(new_df['BoW'])

Menghitung Cosine Similarity antar judul



In [17]:
cosine_sim = cosine_similarity(tfid_matrix, tfid_matrix)
cosine_sim

array([[1.        , 0.00504833, 0.02011193, ..., 0.01065369, 0.02109898,
        0.03048859],
       [0.00504833, 1.        , 0.01714561, ..., 0.00103121, 0.        ,
        0.00481712],
       [0.02011193, 0.01714561, 1.        , ..., 0.00560911, 0.01042642,
        0.0333502 ],
       ...,
       [0.01065369, 0.00103121, 0.00560911, ..., 1.        , 0.05649084,
        0.00600011],
       [0.02109898, 0.        , 0.01042642, ..., 0.05649084, 1.        ,
        0.01046521],
       [0.03048859, 0.00481712, 0.0333502 , ..., 0.00600011, 0.01046521,
        1.        ]])

In [18]:
final_data = df[['title', 'type']]


In [19]:
final_data.head()


Unnamed: 0,title,type
0,Dick Johnson Is Dead,Movie
1,Blood & Water,TV Show
2,Ganglands,TV Show
3,Jailbirds New Orleans,TV Show
4,Kota Factory,TV Show


# Sistem Rekomendasi

Membuat class untuk mencari film atau tv show yang mirip

In [20]:
import re
class FlixHub:
    def __init__(self, df, cosine_sim):
        self.df = df
        self.cosine_sim = cosine_sim
    
    def recommendation(self, title, total_result=5, threshold=0.5):
        idx = self.find_id(title)
        self.df['similarity'] = self.cosine_sim[idx]
        sort_df = self.df.sort_values(by='similarity', ascending=False)[1:total_result+1]
        
        movies = sort_df['title'][sort_df['type'] == 'Movie']
        tv_shows = sort_df['title'][sort_df['type'] == 'TV Show']
        
        similar_movies = []
        similar_tv_shows = []
        
        for i, movie in enumerate(movies):
            similar_movies.append('{}. {}'.format(i+1, movie))
        
        for i, tv_show in enumerate(tv_shows):
            similar_tv_shows.append('{}. {}'.format(i+1, tv_show))
        
        return similar_movies, similar_tv_shows

    def find_id(self, name):
        for index, string in enumerate(self.df['title']):
            if re.search(name, string):
                return index
        return -1

In [24]:
flix_hub = FlixHub(final_data, cosine_sim)
movies, tv_shows = flix_hub.recommendation('Back to 1989', total_result=10, threshold=0.5)

print('Similar Movie(s) list:')
for movie in movies:
    print(movie)

print('\nSimilar TV_show(s) list:')
for tv_show in tv_shows:
    print(tv_show)

Similar Movie(s) list:
1. Little Big Women

Similar TV_show(s) list:
1. Way Back into Love
2. See You in Time
3. The Devil Punisher
4. When I See You Again
5. Queen of No Marriage
6. Miss in Kiss
7. Love Now
8. My MVP Valentine
9. Who's the One


In [21]:
flix_hub = FlixHub(final_data, cosine_sim)
movies, tv_shows = flix_hub.recommendation('Stranger Things', total_result=10, threshold=0.5)

print('Similar Movie(s) list:')
for movie in movies:
    print(movie)

print('\nSimilar TV_show(s) list:')
for tv_show in tv_shows:
    print(tv_show)

Similar Movie(s) list:
1. Safe Haven
2. Homefront
3. Eli
4. Equilibrium
5. Ant-Man and the Wasp

Similar TV_show(s) list:
1. Beyond Stranger Things
2. Prank Encounters
3. The Umbrella Academy
4. Good Witch
5. Anjaan: Special Crimes Unit


# Cek Recall dan Precision

In [22]:
def evaluate_recommendation(flix_hub, title, ground_truth, total_result=10):
    recommended_movies, recommended_tv_shows = flix_hub.recommendation(title, total_result)

    recommended = set([rec.split('. ')[1] for rec in recommended_movies + recommended_tv_shows])
    relevant = set(ground_truth)  

    true_positives = len(recommended & relevant)
    precision = true_positives / len(recommended) if recommended else 0
    recall = true_positives / len(relevant) if relevant else 0

    return precision, recall





In [27]:
ground_truth_movies = ["Ant-Man and the Wasp", "Equilibrium", "Homefront", "Eli", "Safe Haven" ]  
ground_truth_tv_shows = ["Miss in Kiss", "When I See You Again", "Way Back into Love", "Beyond Stranger Things"]  
ground_truth = ground_truth_movies + ground_truth_tv_shows

precision, recall = evaluate_recommendation(flix_hub, "Stranger Things", ground_truth)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.60
Recall: 0.67
