# Importando bibliotecas

In [62]:
import mlflow
import kagglehub
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

## Configurar e iniciar o mlflow

In [63]:
mlflow.set_experiment("kaggle-9000-movies")
mlflow.start_run(run_name="Exploratory-CH01")
mlflow.set_tag("phase", "EDA")


ERROR! Session/line number was not unique in database. History logging moved to new session 7


Exception: Run with UUID 1c682df7841345ff86cb29898e1244f9 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

## Utilitários do mlflow

In [None]:
artifacts_parquets_path = "artifacts_parquets"
os.makedirs(artifacts_parquets_path, exist_ok=True)  # cria se não existir

def log_artifact(df: pd.DataFrame, name: str):
    path = os.path.join(artifacts_parquets_path, f"{name}.parquet")
    df.to_parquet(path, index=False)
    mlflow.log_artifact(path, artifact_path="data")

# Baixar data set

In [None]:
# Download latest version
mlflow.log_param("dataset", "disham993/9000-movies")
dataset_path = kagglehub.dataset_download("disham993/9000-movies-dataset")
print("Path do dataset:", dataset_path)
print("Arquivos do dataset:", os.listdir(dataset_path))
dataset_csv_path = os.path.join(dataset_path, os.listdir(dataset_path)[0])
print("Path do csv do dataset:", dataset_csv_path)

## Carrega dataset

In [None]:
df_raw = pd.read_csv(dataset_csv_path, engine="python")
df_raw.head()

log_artifact(df_raw, "raw")

## Infos do dataset?

In [None]:
df_raw.info()
df_raw.shape

## Com isso ja podemos responder as seguintes perguntas
1) Qual tamanho do DataSet? 691.8 kb
2) Quantas linhas? 9837
3) Quantas colunas? 9
4) Qual o tipo de variável de cada coluna? A coluna "Popularity" é float64, as outras 8 colunas são object

## Corrigindo tipos numericos

In [None]:
df_clear = df_raw.copy()
df_clear["Vote_Count"] = pd.to_numeric(df_clear["Vote_Count"], errors="coerce")
mlflow.log_param("convert-Vote_Count", "numeric")

df_clear["Vote_Average"] = pd.to_numeric(df_clear["Vote_Average"], errors="coerce")
mlflow.log_param("convert-Vote_Average", "numeric")

log_artifact(df_clear, "converted")

df_clear.info()

### Pegando os com maior numero de votação

In [None]:
max_vote_count = df_clear["Vote_Count"].max()
print("Maior numero de votos:", max_vote_count)
top_voted_count = df_clear[df_clear["Vote_Count"] == max_vote_count]
print("Filmes com maior quantidade de votação:", top_voted_count["Title"])

 5) Qual o filme com maior número de votaçôes? Interception, linha 380

## Pegando filme com maior nota

In [None]:
max_vote_average = df_clear["Vote_Average"].max()
print("Maior nota:", max_vote_average)
df_max_voted_average = df_clear[df_clear["Vote_Average"] == max_vote_average]
print("Filmes com maior nota: \n", df_max_voted_average["Title"])

if len(df_max_voted_average) > 1:
    max_vote_count_with_max_average = df_max_voted_average["Vote_Count"].max()
    print("Maior numero de votos entre os com maior nota:", max_vote_count_with_max_average)
    df_max_voted_average_with_max_average = top_voted_average[df_max_voted_average["Vote_Count"] == max_vote_count_with_max_average]
    print("Filme com maior numero de votos entres os com maior nota:\n", df_max_voted_average_with_max_average["Title"])

6) Qual filme teve a maior nota (critério de desempate é o filme com mais votos)? Kung Fu Master Huo Yuanjia, 

## Verificando valores nulos

In [None]:
display("Itens nulos:", df_clear.isnull().sum().sort_values(ascending=False))
# Remove filmes sem titulo
df_clear = df_clear.dropna(subset=["Title"])
mlflow.log_param("remove-null-Title", True)

#Remove filmes sem genero
df_clear = df_clear.dropna(subset=["Genre"])
mlflow.log_param("remove-null-Genre", True)

# Pega tratamentos para colunas restantes
original_language_mode = df_clear["Original_Language"].mode()[0]
mlflow.log_param("imput-Original_Language", "mode")

vote_count_median = df_clear["Vote_Count"].median()
mlflow.log_param("imput-Vote_Count", "median")

vote_average_median = df_clear["Vote_Average"].median()
mlflow.log_param("imput-Vote_Average", "median")

popularity_median = df_clear["Popularity"].median()
mlflow.log_param("imput-Popularity", "median")

# Aplica os tratamentos
coluns_tretament = {
    "Original_Language": original_language_mode,
    "Vote_Count": vote_count_median,
    "Vote_Average": vote_average_median,
    "Popularity": popularity_median,
    "Poster_Url": ""
}
df_clear = df_clear.fillna(coluns_tretament)

log_artifact(df_clear, "non-null")

display("Itens nulos:", df_clear.isnull().sum().sort_values(ascending=False))

7) Existem valores nulos? Se sim, qual tratamento irá realizar? (Se não temos nome de algum filme, melhor nem considerar):
   - Removido as linhas de titulos e generos nulos
   - Aplicado moda para o idioma original
   - Aplicado mediana para o numero de votos, nota e popularidade
   - Deixado em branco a ulr do poster

8) Transforme as variaveis categóricas de linguagem e genero em númericas (utilize dummy)

In [None]:
# Monta os dummies para o idioma
df_lang_dummies = pd.get_dummies(df_clear["Original_Language"], prefix = "lang_dummies")

# Splita os generos, transformando em array
df_clear["Genre"] = df_clear["Genre"].str.split(",\s*")
mlflow.log_param("transform-Genre", "split_by_comma")

# # Monta os dummies para o genero
df_genres_dummies = df_clear["Genre"].str.join('|').str.get_dummies()

# # Junta as colunas dummies
df_clear = pd.concat([df_clear, df_lang_dummies, df_genres_dummies], axis=1)
mlflow.log_param("insert-Original_Language", "dummies")
mlflow.log_param("insert-Genre", "dummies")

log_artifact(df_clear, "transformed")

# Normalize as variaveis numéricas

In [None]:
minmax = MinMaxScaler()
columns_to_scale = ["Popularity", "Vote_Count", "Vote_Average"]
scaled_values = minmax.fit_transform(df_clear[columns_to_scale])
mlflow.log_param("normalized-Popularity", "MinMaxScaler")
mlflow.log_param("normalized-Vote_Count", "MinMaxScaler")
mlflow.log_param("normalized-Vote_Average", "MinMaxScaler")

scaled_columns = [f"Scaled_{x}" for x in columns_to_scale]
df_scaled_values = pd.DataFrame(scaled_values, columns=scaled_columns, index=df_clear.index)

df_clear = pd.concat([df_clear, df_scaled_values], axis=1)

log_artifact(df_clear, "scaled")

## Armazene esses valores como um artefato dentro do MLFlow

In [None]:
processed_data_path = "9000-movies-dataset_processed.csv"
df_clear.to_csv(processed_data_path, index=False)
print("Dataset processado salvo localmente.")

mlflow.log_artifact(processed_data_path)
mlflow.end_run()

## Quais insights é possivel obter desses dados?
- A maioria dos files tem idioma original em ingles