# Importando bibliotecas

In [1]:
import mlflow
import kagglehub
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


## Configurar e iniciar o mlflow

In [2]:
# Select the MLflow experiment that groups every run of this analysis.
mlflow.set_experiment("kaggle-9000-movies")

# `mlflow.start_run` raises when a run is still active in this kernel (for
# example, the notebook was interrupted before reaching `mlflow.end_run()`),
# so close any leftover run first to keep this cell re-runnable.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Open the run that tracks the exploratory phase and tag it accordingly.
mlflow.start_run(run_name="Exploratory-CH01")
mlflow.set_tag("phase", "EDA")


## Utilitários do mlflow

In [None]:
# Local folder where the intermediate parquet snapshots are written.
artifacts_parquets_path = "artifacts"
os.makedirs(artifacts_parquets_path, exist_ok=True)  # no-op when the folder already exists


def log_artifact(df: pd.DataFrame, name: str):
    """Save ``df`` as a parquet file and log that file to MLflow.

    Args:
        df: Frame to persist; it is written without its index.
        name: File stem. The snapshot is stored as ``<name>.parquet`` inside
            ``artifacts_parquets_path`` and logged under the ``data``
            artifact path.
    """
    parquet_file = os.path.join(artifacts_parquets_path, "{}.parquet".format(name))
    df.to_parquet(parquet_file, index=False)
    mlflow.log_artifact(parquet_file, artifact_path="data")

# Baixar data set

In [4]:
# Download the latest version of the dataset through kagglehub.
dataset_handle = "disham993/9000-movies-dataset"

# Log the exact handle that is downloaded so the run's provenance is accurate
# (the previous value, "disham993/9000-movies", did not match the real handle).
mlflow.log_param("dataset", dataset_handle)
dataset_path = kagglehub.dataset_download(dataset_handle)
print("Path do dataset:", dataset_path)

# List the folder once and sort it: os.listdir returns entries in arbitrary
# order, so sorting keeps both the printout and the file choice deterministic.
dataset_files = sorted(os.listdir(dataset_path))
print("Arquivos do dataset:", dataset_files)

# Pick the CSV explicitly instead of whichever entry happens to come first.
csv_files = [file_name for file_name in dataset_files if file_name.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {dataset_path}: {dataset_files}")
dataset_csv_path = os.path.join(dataset_path, csv_files[0])
print("Path do csv do dataset:", dataset_csv_path)

Path do dataset: /Users/vand/.cache/kagglehub/datasets/disham993/9000-movies-dataset/versions/2
Arquivos do dataset: ['mymoviedb.csv']
Path do csv do dataset: /Users/vand/.cache/kagglehub/datasets/disham993/9000-movies-dataset/versions/2/mymoviedb.csv


## Carrega dataset

In [5]:
# Read the raw CSV using pandas' python parser engine.
df_raw = pd.read_csv(dataset_csv_path, engine="python")

# Snapshot the untouched data in MLflow before any cleaning happens.
log_artifact(df_raw, "raw")

# The preview must be the cell's last expression: Jupyter only renders the
# final expression, so a `head()` in the middle of the cell was discarded.
df_raw.head()

## Infos do dataset?

In [6]:
# Print column dtypes, non-null counts and the memory usage of the raw frame.
df_raw.info()
# (rows, columns) tuple; being the last expression, it is rendered as the cell output.
df_raw.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9837 entries, 0 to 9836
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9837 non-null   object 
 1   Title              9828 non-null   object 
 2   Overview           9828 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   object 
 5   Vote_Average       9827 non-null   object 
 6   Original_Language  9827 non-null   object 
 7   Genre              9826 non-null   object 
 8   Poster_Url         9826 non-null   object 
dtypes: float64(1), object(8)
memory usage: 691.8+ KB


(9837, 9)

## Com isso já podemos responder às seguintes perguntas
1) Qual o tamanho do dataset? Cerca de 691.8 KB em memória (valor reportado por `df_raw.info()`)
2) Quantas linhas? 9837
3) Quantas colunas? 9
4) Qual o tipo de variável de cada coluna? A coluna "Popularity" é float64, as outras 8 colunas são object

## Corrigindo tipos numéricos

In [7]:
# Work on a copy so that df_raw keeps the data exactly as loaded.
df_clear = df_raw.copy()

# Coerce both vote columns to numbers; values that cannot be parsed become
# NaN. Each conversion is recorded as an MLflow param.
for vote_column in ("Vote_Count", "Vote_Average"):
    df_clear[vote_column] = pd.to_numeric(df_clear[vote_column], errors="coerce")
    mlflow.log_param(f"convert-{vote_column}", "numeric")

# Snapshot the frame right after the type conversion.
log_artifact(df_clear, "converted")

# Show the dtypes after the conversion.
df_clear.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9837 entries, 0 to 9836
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9837 non-null   object 
 1   Title              9828 non-null   object 
 2   Overview           9828 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9826 non-null   float64
 5   Vote_Average       9826 non-null   float64
 6   Original_Language  9827 non-null   object 
 7   Genre              9826 non-null   object 
 8   Poster_Url         9826 non-null   object 
dtypes: float64(3), object(6)
memory usage: 691.8+ KB


### Pegando os com maior numero de votação

In [8]:
# Highest vote count in the dataset.
max_vote_count = df_clear["Vote_Count"].max()
print("Maior numero de votos:", max_vote_count)

# Every movie that reaches that count (more than one row if there is a tie).
is_top_voted = df_clear["Vote_Count"].eq(max_vote_count)
top_voted_count = df_clear.loc[is_top_voted]
print("Filmes com maior quantidade de votação:", top_voted_count["Title"])

Maior numero de votos: 31077.0
Filmes com maior quantidade de votação: 380    Inception
Name: Title, dtype: object


 5) Qual o filme com maior número de votações? Inception, linha 380

## Pegando filme com maior nota

In [9]:
# Highest rating in the dataset and every movie that reaches it.
max_vote_average = df_clear["Vote_Average"].max()
print("Maior nota:", max_vote_average)
df_max_voted_average = df_clear[df_clear["Vote_Average"] == max_vote_average]
print("Filmes com maior nota: \n", df_max_voted_average["Title"])

# Tie-break: when several movies share the top rating, keep the one(s) with
# the highest vote count among them.
if len(df_max_voted_average) > 1:
    max_vote_count_with_max_average = df_max_voted_average["Vote_Count"].max()
    print("Maior numero de votos entre os com maior nota:", max_vote_count_with_max_average)
    # Filter the top-rated frame itself. The previous code indexed an undefined
    # name (`top_voted_average`), raising NameError whenever a tie occurred.
    df_max_voted_average_with_max_average = df_max_voted_average[
        df_max_voted_average["Vote_Count"] == max_vote_count_with_max_average
    ]
    print("Filme com maior numero de votos entres os com maior nota:\n", df_max_voted_average_with_max_average["Title"])

Maior nota: 10.0
Filmes com maior nota: 
 9401    Kung Fu Master Huo Yuanjia
Name: Title, dtype: object


6) Qual filme teve a maior nota (critério de desempate é o filme com mais votos)? Kung Fu Master Huo Yuanjia, com nota 10.0 (foi o único filme com a nota máxima, então o desempate não foi necessário)

## Verificando valores nulos

In [10]:
# Null count per column before any treatment, largest first.
display("Itens nulos:", df_clear.isnull().sum().sort_values(ascending=False))
# Drop movies without a title
df_clear = df_clear.dropna(subset=["Title"])
mlflow.log_param("remove-null-Title", True)

# Drop movies without a genre
df_clear = df_clear.dropna(subset=["Genre"])
mlflow.log_param("remove-null-Genre", True)

# Compute the fill value for each remaining column. This runs after the row
# drops above, so the statistics are taken over the surviving rows only.
original_language_mode = df_clear["Original_Language"].mode()[0]
mlflow.log_param("imput-Original_Language", "mode")

vote_count_median = df_clear["Vote_Count"].median()
mlflow.log_param("imput-Vote_Count", "median")

vote_average_median = df_clear["Vote_Average"].median()
mlflow.log_param("imput-Vote_Average", "median")

popularity_median = df_clear["Popularity"].median()
mlflow.log_param("imput-Popularity", "median")

# Apply the treatments: one fill value per column; a missing poster URL
# becomes an empty string.
coluns_tretament = {
    "Original_Language": original_language_mode,
    "Vote_Count": vote_count_median,
    "Vote_Average": vote_average_median,
    "Popularity": popularity_median,
    "Poster_Url": ""
}
df_clear = df_clear.fillna(coluns_tretament)

# Snapshot the frame once the null handling is done.
log_artifact(df_clear, "non-null")

# Null count per column after the treatment.
display("Itens nulos:", df_clear.isnull().sum().sort_values(ascending=False))

'Itens nulos:'

Vote_Count           11
Genre                11
Poster_Url           11
Vote_Average         11
Popularity           10
Original_Language    10
Title                 9
Overview              9
Release_Date          0
dtype: int64

'Itens nulos:'

Release_Date         0
Title                0
Overview             0
Popularity           0
Vote_Count           0
Vote_Average         0
Original_Language    0
Genre                0
Poster_Url           0
dtype: int64

7) Existem valores nulos? Se sim, qual tratamento irá realizar? (Se não temos o nome de algum filme, melhor nem considerar):
   - Removidas as linhas com título ou gênero nulos
   - Aplicada a moda para o idioma original
   - Aplicada a mediana para o número de votos, a nota e a popularidade
   - Deixada em branco a URL do pôster

8) Transforme as variáveis categóricas de linguagem e gênero em numéricas (utilize dummy)

In [11]:
# Monta os dummies para o idioma
df_lang_dummies = pd.get_dummies(df_clear["Original_Language"], prefix = "lang_dummies")

# Splita os generos, transformando em array
df_clear["Genre"] = df_clear["Genre"].str.split(",\s*")
mlflow.log_param("transform-Genre", "split_by_comma")

# # Monta os dummies para o genero
df_genres_dummies = df_clear["Genre"].str.join('|').str.get_dummies()

# # Junta as colunas dummies
df_clear = pd.concat([df_clear, df_lang_dummies, df_genres_dummies], axis=1)
mlflow.log_param("insert-Original_Language", "dummies")
mlflow.log_param("insert-Genre", "dummies")

log_artifact(df_clear, "transformed")

# Normalize as variáveis numéricas

In [12]:
# Min-max scale the numeric columns, using MinMaxScaler with its default settings.
minmax = MinMaxScaler()
columns_to_scale = ["Popularity", "Vote_Count", "Vote_Average"]
scaled_values = minmax.fit_transform(df_clear[columns_to_scale])

# Record the scaler applied to each column.
for column_name in columns_to_scale:
    mlflow.log_param(f"normalized-{column_name}", "MinMaxScaler")

# Keep the original columns and append the scaled copies as `Scaled_<column>`,
# aligned on df_clear's index.
scaled_columns = ["Scaled_" + column_name for column_name in columns_to_scale]
df_scaled_values = pd.DataFrame(
    scaled_values,
    index=df_clear.index,
    columns=scaled_columns,
)

df_clear = pd.concat([df_clear, df_scaled_values], axis=1)

# Snapshot the frame with the scaled columns included.
log_artifact(df_clear, "scaled")

## Armazene esses valores como um artefato dentro do MLFlow

In [13]:
# Export the processed frame as a CSV file (relative path, so it is written
# to the current working directory), without the index column.
processed_data_path = "9000-movies-dataset_processed.csv"
df_clear.to_csv(processed_data_path, index=False)
print("Dataset processado salvo localmente.")

# Log the CSV as an MLflow artifact, then end the active run.
mlflow.log_artifact(processed_data_path)
mlflow.end_run()

Dataset processado salvo localmente.


## Quais insights é possivel obter desses dados?
- A maioria dos filmes tem idioma original em inglês