In [2]:
!pip install pandas python-dotenv psycopg2-binary



In [9]:
import os
import pandas as pd
from dotenv import load_dotenv

# Load environment variables from the local .env file
load_dotenv(dotenv_path=".env")

IMDB_CSV_PATH = os.getenv("IMDB_CSV_PATH")
print("IMDB_CSV_PATH:", IMDB_CSV_PATH)

# Fail fast with an actionable message: without this check an unset variable
# reaches pd.read_csv(None), whose error does not mention the missing setting.
if not IMDB_CSV_PATH:
    raise ValueError(
        "IMDB_CSV_PATH não está definido; configure-o no .env ou no ambiente."
    )

# Read the original CSV
df_raw = pd.read_csv(IMDB_CSV_PATH)
print(f"Li {len(df_raw)} linhas do CSV.")

# Rename the source headers to snake_case. Every target name is simply the
# lower-cased original header, so the mapping is derived from the header list.
_source_columns = [
    "Poster_Link", "Series_Title", "Released_Year", "Certificate",
    "Runtime", "Genre", "IMDB_Rating", "Overview",
    "Meta_score", "Director", "Star1", "Star2",
    "Star3", "Star4", "No_of_Votes", "Gross",
]
df = df_raw.rename(columns={header: header.lower() for header in _source_columns})

# --- NUMERIC CONVERSIONS ---

# Pull the minutes out of strings such as "142 min" and make the result numeric.
# str.extract on its own returns the digits as text, so pd.to_numeric is needed
# for runtime_min to actually be a number; rows without digits become NaN.
df["runtime_min"] = pd.to_numeric(
    df["runtime"].astype(str).str.extract(r"(\d+)", expand=False),
    errors="coerce",
)

# Release year to numeric; unparseable values become NaN
df["released_year"] = pd.to_numeric(df["released_year"], errors="coerce")

# IMDB rating to numeric
df["imdb_rating"] = pd.to_numeric(df["imdb_rating"], errors="coerce")

# Metascore to numeric (missing values stay NaN, so this is not an int column)
df["meta_score"] = pd.to_numeric(df["meta_score"], errors="coerce")

# Vote count to numeric (expected to be numeric already; this just enforces it)
df["no_of_votes"] = pd.to_numeric(df["no_of_votes"], errors="coerce")

# Clean the Gross column: strip thousands separators and convert to int
def parse_gross(x):
    """Parse a gross-revenue cell into a built-in ``int``.

    Args:
        x: Raw cell value, e.g. ``"28,341,469"``, a number, ``None`` or NaN.

    Returns:
        The value as ``int``, or ``None`` when the cell is missing or blank.

    Raises:
        ValueError: If the cell is neither blank nor a whole number.
    """
    if pd.isna(x):
        return None
    cleaned = str(x).replace(",", "").strip()
    if cleaned == "":
        return None
    try:
        return int(cleaned)
    except ValueError:
        # The cell may already be a float (e.g. 4360000.0 when pandas parsed
        # the column as numeric); accept it only if it is a whole number.
        # is_integer() is also False for NaN/inf, so those still raise.
        as_float = float(cleaned)
        if not as_float.is_integer():
            raise
        return int(as_float)

# Parse every raw gross value into the new gross_usd column
df["gross_usd"] = df["gross"].apply(parse_gross)

# NOTE: certificate is deliberately left as text (A, U, UA, PG, ...);
# it is never cast to int.

# Keep only the output columns, in the same order as the target table
_final_columns = [
    "poster_link", "series_title", "released_year", "certificate",
    "runtime_min", "genre", "imdb_rating", "overview",
    "meta_score", "director", "star1", "star2",
    "star3", "star4", "no_of_votes", "gross_usd",
]
df_final = df[_final_columns]

df_final.head()

IMDB_CSV_PATH: C:/Users/Willgnner/Documents/Atividades-Aprendizado-de-Máquina/Notebooks/Entrega-slides-2/Dados/imdb_top_1000.csv
Li 1000 linhas do CSV.


Unnamed: 0,poster_link,series_title,released_year,certificate,runtime_min,genre,imdb_rating,overview,meta_score,director,star1,star2,star3,star4,no_of_votes,gross_usd
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994.0,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972.0,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008.0,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974.0,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957.0,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [10]:
import psycopg2
from psycopg2.extras import execute_values
import numpy as np

# Open the connection using the settings loaded from .env.
# PGPORT falls back to 5432 (the standard PostgreSQL port) so an unset or empty
# variable does not crash int() before a connection is even attempted.
conn = psycopg2.connect(
    host=os.getenv("PGHOST"),
    port=int(os.getenv("PGPORT") or "5432"),
    dbname=os.getenv("PGDATABASE"),
    user=os.getenv("PGUSER"),
    password=os.getenv("PGPASSWORD")
)

print("Conectei no Postgres.")

def to_python_int(x):
    """Return ``x`` as a built-in ``int``, or ``None`` if it is ``None`` or a float NaN."""
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    return None if is_missing else int(x)

def to_python_float(x):
    """Return ``x`` as a built-in ``float``, or ``None`` if it is ``None`` or a float NaN."""
    if x is not None and not (isinstance(x, float) and np.isnan(x)):
        return float(x)
    return None

def _none_if_nan(value):
    """Return ``None`` for ``None`` or a float NaN; otherwise return ``value`` unchanged."""
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return None
    return value


# Build the rows with native Python types before touching the database.
# Text columns go through _none_if_nan because pandas represents a missing
# string cell as float NaN, which psycopg2 would send as a float NaN value
# instead of SQL NULL.
rows = [
    (
        _none_if_nan(row.poster_link),
        _none_if_nan(row.series_title),
        to_python_int(row.released_year),
        _none_if_nan(row.certificate),    # stays text, never cast to int
        to_python_int(row.runtime_min),
        _none_if_nan(row.genre),
        to_python_float(row.imdb_rating),
        _none_if_nan(row.overview),
        to_python_int(row.meta_score),
        _none_if_nan(row.director),
        _none_if_nan(row.star1),
        _none_if_nan(row.star2),
        _none_if_nan(row.star3),
        _none_if_nan(row.star4),
        to_python_int(row.no_of_votes),
        to_python_int(row.gross_usd),
    )
    for row in df_final.itertuples(index=False)
]

try:
    # `with conn` commits on success and rolls back on error, but it does not
    # close the connection; the finally clause below takes care of that.
    with conn, conn.cursor() as cur:
        # Empty the table first so the load can be re-run (optional reload step)
        cur.execute("TRUNCATE TABLE imdb_top_1000 RESTART IDENTITY;")

        # Insert everything in one batched statement
        execute_values(cur, """
            INSERT INTO imdb_top_1000 (
                poster_link,
                series_title,
                released_year,
                certificate,
                runtime_min,
                genre,
                imdb_rating,
                overview,
                meta_score,
                director,
                star1,
                star2,
                star3,
                star4,
                no_of_votes,
                gross_usd
            ) VALUES %s;
        """, rows)

    print("Ingestão concluída.")
finally:
    # Always release the connection, even when the load fails
    conn.close()
    print("Conexão com Postgres encerrada.")

Conectei no Postgres.
Ingestão concluída.
Conexão com Postgres encerrada.
