In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

# Caminho para o arquivo CSV
caminho_arquivo = 'Data/netflix_titles.csv'

# Carregar o arquivo CSV em um dataframe
df = pd.read_csv('Data/netflix_titles.csv')

# Exibir o dataframe
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [3]:
# Dropando colunas que não serão utilizadas
df = df.drop(columns=['type','cast','date_added','rating', 'country', 'director'])
df

Unnamed: 0,show_id,title,release_year,duration,listed_in,description
0,s1,Dick Johnson Is Dead,2020,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,2021,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,Jailbirds New Orleans,2021,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,Kota Factory,2021,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...
8802,s8803,Zodiac,2007,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,Zombie Dumb,2018,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Zombieland,2009,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Zoom,2006,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [4]:
# Verificando valores nulos
df.isnull().sum()

show_id         0
title           0
release_year    0
duration        3
listed_in       0
description     0
dtype: int64

In [5]:
# Dropando valores nulos
df = df.dropna()

In [6]:
# Verificando valores nulos
df.isnull().sum()

show_id         0
title           0
release_year    0
duration        0
listed_in       0
description     0
dtype: int64

In [7]:
# Verificando valores duplicados
df.duplicated().sum()

0

# Pré-processamento dos Dados

In [8]:
# Função para limpar o texto
def limpar_texto(texto):
    # Converter o texto para minúsculo
    texto = texto.lower()
    # Remover caracteres especiais
    texto = re.sub(r'[^a-z0-9]', ' ', texto)
    return texto

# Aplicar a função de limpeza ao dataframe
df['description'] = df['description'].apply(limpar_texto)
df['description']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(limpar_texto)


0       as her father nears the end of his life  filmm...
1       after crossing paths at a party  a cape town t...
2       to protect his family from a powerful drug lor...
3       feuds  flirtations and toilet talk go down amo...
4       in a city of coaching centers known to train i...
                              ...                        
8802    a political cartoonist  a crime reporter and a...
8803    while living alone in a spooky town  a young g...
8804    looking to survive in a world taken over by zo...
8805    dragged from civilian life  a former superhero...
8806    a scrappy but poor boy worms his way into a ty...
Name: description, Length: 8804, dtype: object

In [10]:
# Verificando se sobrou algum description nulo
df.isnull().sum()

show_id         0
title           0
release_year    0
duration        0
listed_in       0
description     0
dtype: int64

# Cálculo da Similaridade

In [12]:
# Criar uma matriz de similaridade

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])

In [14]:
# Calcular a similaridade do cosseno
similaridade = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [15]:
similaridade

array([[1.        , 0.        , 0.        , ..., 0.        , 0.01538122,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.02229903],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.01538122, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.02229903, ..., 0.        , 0.        ,
        1.        ]])

# Recomendação de Filmes

In [16]:
# Criando um índice para os títulos
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [18]:
indices.index.value_counts()

title
Dick Johnson Is Dead                      1
Tom Segura: Mostly Stories                1
Turbo FAST                                1
Masha's Tales                             1
Chelsea Does                              1
                                         ..
Good Time                                 1
Captain Underpants Epic Choice-o-Rama     1
We Bare Bears                             1
To All the Boys: P.S. I Still Love You    1
Zubaan                                    1
Name: count, Length: 8804, dtype: int64

In [21]:
# Verificando somente os titulos duplicados
df[df['title'].duplicated(keep=False)].sort_values(by='title')

Unnamed: 0,show_id,title,release_year,duration,listed_in,description


In [45]:
# Função para recomendar filmes
def recomendar_filmes(titulo, similaridade=similaridade):
    # Obter o índice do título
    idx = indices[titulo]
    # Obter a pontuação de similaridade
    pontuacoes = list(enumerate(similaridade[idx]))
    # Ordenar os filmes por ordem de pontuação
    pontuacoes = sorted(pontuacoes, key=lambda x: x[1], reverse=True)
    # Obter as pontuações dos 10 filmes mais similares
    pontuacoes = pontuacoes[1:11]
    # Obter os índices dos filmes
    indices_filmes = [i[0] for i in pontuacoes]
    # Retornar os 10 filmes mais similares
    return df['title'].iloc[indices_filmes]

In [48]:
# Recomendar filmes
recomendar_filmes('Transformers Prime')

1482      Transformers: War for Cybertron: Earthrise
2192         Transformers: War For Cybertron Trilogy
4652                Transformers: Robots in Disguise
4927                                  Beyond Skyline
6445                                  Chappaquiddick
2525                             Parasyte: The Maxim
58      Naruto Shippûden the Movie: The Will of Fire
2549                                      John Henry
4500                          The Night Comes for Us
936                      Miniforce: Super Dino Power
Name: title, dtype: object