# Análise exploratória

Esta é a análise exploratória do Steam Games Database, que pode ser encontrado aqui: https://www.kaggle.com/datasets/fronkongames/steam-games-dataset

Antes de fazer a análise, coloque os arquivos dentro da pasta `/datasets`.

## Preparando ambiente

In [None]:
import csv
import sys
import numpy
import pandas
import sqlalchemy
import sqlalchemy.orm as orm
from dotenv import load_dotenv
import os
import json
import re
import html
import unicodedata
import ast
from datetime import datetime

In [None]:
# Number of ORM objects added to the session before each commit in the batched inserts.
BATCH_SIZE = 2500

In [None]:
# Lift the csv field size limit as high as the platform allows.
# csv.field_size_limit() raises OverflowError when the value does not fit in a
# C long (e.g. sys.maxsize on 64-bit Windows), so halve it until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

# Show every column when displaying DataFrames.
pandas.set_option('display.max_columns', None)

In [None]:
# Raw CSV export of the dataset.
dataset_csv_path = "./datasets/games.csv"
# Destination of the copy whose header row has been corrected.
dataset_csv_fixed_path = "./datasets/games_fixed.csv"


## Preparando datasets

O arquivo `games.csv` está quebrado, então é preciso corrigi-lo antes de carregá-lo no pandas. O erro é que a quantidade de colunas no cabeçalho é diferente da quantidade de colunas nas linhas de dados: os nomes `Discount` e `DLC count` aparecem colados como `DiscountDLC count`. Para resolver esse problema, apliquei a seguinte correção:

In [None]:
# Peek at the first 5 lines of the raw file.
with open(dataset_csv_path, 'r', encoding='utf-8') as raw_file:
    remaining = 5
    while remaining:
        print(raw_file.readline())
        remaining -= 1


In [None]:
# Rewrite the raw CSV with a corrected header, streaming it instead of loading
# every line into memory. newline='' disables newline translation so the
# original line terminators are written back unchanged.
with open(dataset_csv_path, 'r', encoding='utf-8', newline='') as src, \
        open(dataset_csv_fixed_path, 'w', encoding='utf-8', newline='') as dst:
    # Only the header is broken: two column names are fused without a comma.
    # readline() returns '' for an empty file, so nothing fails in that case.
    header = src.readline()
    dst.write(header.replace('DiscountDLC count', 'Discount,DLC count'))
    # The remaining lines are copied through untouched.
    dst.writelines(src)

print("✅ Arquivo corrigido salvo como: games_fixed.csv")

Carregando o dataset corrigido para dentro do pandas:

In [None]:
# Parser settings used to load the header-corrected CSV.
_read_csv_options = {
    "sep": ",",
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "engine": "python",
    "encoding": "utf-8-sig",
}
games_dataset = pandas.read_csv(dataset_csv_fixed_path, **_read_csv_options)

## Análise inicial

In [None]:
games_dataset.info() # Show column types and null values

Podemos ver que a tabela é formada por 40 colunas.

In [None]:
# Show the first rows
games_dataset.head()

In [None]:
games_dataset.describe() # Basic statistics

In [None]:
games_dataset.shape # Dimensions

In [None]:
games_dataset.columns.tolist() # List of columns

## Migrar dataset

O dataset está em um arquivo `.csv`. Para facilitar a análise vamos migrar ele para dentro de um banco Postgres.

In [None]:
# Load environment variables through python-dotenv.
load_dotenv()

# Connection settings are read from the environment.
POSTGRES_HOST = os.getenv("POSTGRES_HOST")
POSTGRES_PORT = os.getenv("POSTGRES_PORT")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")

# Never echo the password itself: notebook outputs are saved with the file.
print(POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, "***" if POSTGRES_PASSWORD else None, POSTGRES_DB)

# Build the URL from its parts so that special characters in the user name or
# password (e.g. "@", ":" or "/") are escaped instead of corrupting the URL.
_database_url = sqlalchemy.engine.URL.create(
    drivername="postgresql+psycopg",
    username=POSTGRES_USER,
    password=POSTGRES_PASSWORD,
    host=POSTGRES_HOST,
    port=int(POSTGRES_PORT) if POSTGRES_PORT else None,
    database=POSTGRES_DB,
)
# Kept as a plain string for any code that reads DATABASE_URL directly.
DATABASE_URL = _database_url.render_as_string(hide_password=False)

# Create the engine
engine = sqlalchemy.create_engine(_database_url)

# Test the connection; a failure is reported but does not stop the notebook.
try:
    with engine.connect() as connection:
        print("✅ Conexão com o banco de dados estabelecida com sucesso!")
except Exception as e:
    print(f"❌ Erro ao conectar ao banco de dados: {e}")


Agora precisamos preparar as tabelas, declarando os models do SQLAlchemy e criando-as no banco com `create_all`:

In [None]:
Base = orm.declarative_base()


def _association_table(table_name, other_column, other_target):
    """Build a two-column M:N link table between ``games`` and another table.

    ``other_column`` is the name of the second foreign-key column and
    ``other_target`` the ``"table.column"`` it points to. Both columns are
    flagged as primary key.
    """
    return sqlalchemy.Table(
        table_name,
        Base.metadata,
        sqlalchemy.Column("game_id", sqlalchemy.Integer, sqlalchemy.ForeignKey("games.id"), primary_key=True),
        sqlalchemy.Column(other_column, sqlalchemy.Integer, sqlalchemy.ForeignKey(other_target), primary_key=True),
    )


# =========================================
# ASSOCIATION (PIVOT) TABLES, M:N
# =========================================

game_developer = _association_table("game_developer", "developer_id", "developers.id")
game_publisher = _association_table("game_publisher", "publisher_id", "publishers.id")
game_category = _association_table("game_category", "category_id", "categories.id")
game_genre = _association_table("game_genre", "genre_id", "genres.id")
game_tag = _association_table("game_tag", "tag_id", "tags.id")
game_language = _association_table("game_language", "language_id", "languages.id")

# =========================================
# MODELS
# =========================================

class Developer(Base):
    """ORM model for the ``developers`` table."""

    __tablename__ = "developers"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    name = sqlalchemy.Column(sqlalchemy.Text, nullable=False)

    # Many-to-many link to Game through the game_developer association table.
    games = orm.relationship("Game", secondary=game_developer, back_populates="developers")

    @classmethod
    def get_or_create(cls, name, session):
        """Return the row stored under the trimmed, lower-cased ``name``.

        When no such row exists, one is added to ``session`` and committed
        before being returned.
        """
        normalized = name.strip().lower()
        found = session.query(cls).filter_by(name=normalized).first()
        if not found:
            found = cls(name=normalized)
            session.add(found)
            session.commit()
        return found


class Publisher(Base):
    """ORM model for the ``publishers`` table."""

    __tablename__ = "publishers"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    name = sqlalchemy.Column(sqlalchemy.Text, nullable=False)

    # Many-to-many link to Game through the game_publisher association table.
    games = orm.relationship("Game", secondary=game_publisher, back_populates="publishers")

    @classmethod
    def get_or_create(cls, name, session):
        """Return the row stored under the trimmed, lower-cased ``name``.

        When no such row exists, one is added to ``session`` and committed
        before being returned.
        """
        normalized = name.strip().lower()
        found = session.query(cls).filter_by(name=normalized).first()
        if not found:
            found = cls(name=normalized)
            session.add(found)
            session.commit()
        return found


class Category(Base):
    """ORM model for the ``categories`` table."""

    __tablename__ = "categories"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    name = sqlalchemy.Column(sqlalchemy.String(255), nullable=False)

    # Many-to-many link to Game through the game_category association table.
    games = orm.relationship("Game", secondary=game_category, back_populates="categories")

    @classmethod
    def get_or_create(cls, name, session):
        """Return the row stored under the trimmed, lower-cased ``name``.

        When no such row exists, one is added to ``session`` and committed
        before being returned.
        """
        normalized = name.strip().lower()
        found = session.query(cls).filter_by(name=normalized).first()
        if not found:
            found = cls(name=normalized)
            session.add(found)
            session.commit()
        return found


class Genre(Base):
    """ORM model for the ``genres`` table."""

    __tablename__ = "genres"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    name = sqlalchemy.Column(sqlalchemy.String(255), nullable=False)

    # Many-to-many link to Game through the game_genre association table.
    games = orm.relationship("Game", secondary=game_genre, back_populates="genres")

    @classmethod
    def get_or_create(cls, name, session):
        """Return the row stored under the trimmed, lower-cased ``name``.

        When no such row exists, one is added to ``session`` and committed
        before being returned.
        """
        normalized = name.strip().lower()
        found = session.query(cls).filter_by(name=normalized).first()
        if not found:
            found = cls(name=normalized)
            session.add(found)
            session.commit()
        return found


class Tag(Base):
    """ORM model for the ``tags`` table."""

    __tablename__ = "tags"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    name = sqlalchemy.Column(sqlalchemy.String(255), nullable=False)

    # Many-to-many link to Game through the game_tag association table.
    games = orm.relationship("Game", secondary=game_tag, back_populates="tags")

    @classmethod
    def get_or_create(cls, name, session):
        """Return the row stored under the trimmed, lower-cased ``name``.

        When no such row exists, one is added to ``session`` and committed
        before being returned.
        """
        normalized = name.strip().lower()
        found = session.query(cls).filter_by(name=normalized).first()
        if not found:
            found = cls(name=normalized)
            session.add(found)
            session.commit()
        return found


class Language(Base):
    """ORM model for the ``languages`` table."""

    __tablename__ = "languages"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    name = sqlalchemy.Column(sqlalchemy.String(255), nullable=False)

    # Many-to-many link to Game through the game_language association table.
    games = orm.relationship("Game", secondary=game_language, back_populates="languages")

    @classmethod
    def get_or_create(cls, name, session):
        """Return the row stored under the trimmed, lower-cased ``name``.

        When no such row exists, one is added to ``session`` and committed
        before being returned.
        """
        normalized = name.strip().lower()
        found = session.query(cls).filter_by(name=normalized).first()
        if not found:
            found = cls(name=normalized)
            session.add(found)
            session.commit()
        return found


class Screenshot(Base):
    """ORM model for the ``screenshots`` table: one URL row linked to a game."""

    __tablename__ = "screenshots"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    # Required foreign key to games.id.
    game_id = sqlalchemy.Column(sqlalchemy.Integer, sqlalchemy.ForeignKey("games.id"), nullable=False)
    # NOTE(review): limited to 255 characters — confirm no URL in the dataset is longer.
    screenshot_url = sqlalchemy.Column(sqlalchemy.String(255), nullable=False)


class Movie(Base):
    """ORM model for the ``movies`` table: one URL row linked to a game."""

    __tablename__ = "movies"

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    # Required foreign key to games.id.
    game_id = sqlalchemy.Column(sqlalchemy.Integer, sqlalchemy.ForeignKey("games.id"), nullable=False)
    # NOTE(review): limited to 255 characters — confirm no URL in the dataset is longer.
    movie_url = sqlalchemy.Column(sqlalchemy.String(255), nullable=False)


class Game(Base):
    """ORM model for the ``games`` table, the hub of every M:N relationship."""

    __tablename__ = "games"

    # Surrogate primary key; app_id is stored separately and carries no unique constraint.
    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    app_id = sqlalchemy.Column(sqlalchemy.Integer, nullable=False)
    name = sqlalchemy.Column(sqlalchemy.Text, nullable=False)
    # NOTE(review): NOT NULL, yet parse_steam_date() can return None — confirm
    # how rows without a parseable date are handled before insertion.
    release_date = sqlalchemy.Column(sqlalchemy.Date, nullable=False)
    # Lower and upper bounds of the estimated-owners range.
    estimated_owners_lower = sqlalchemy.Column(sqlalchemy.Integer, nullable=False)
    estimated_owners_upper = sqlalchemy.Column(sqlalchemy.Integer, nullable=False)
    peak_ccu = sqlalchemy.Column(sqlalchemy.Integer, nullable=False, default=0)
    required_age = sqlalchemy.Column(sqlalchemy.Integer, nullable=False, default=0)
    price = sqlalchemy.Column(sqlalchemy.Float, nullable=False, default=0.0)
    discount = sqlalchemy.Column(sqlalchemy.Float, nullable=False, default=0.0)
    dlc_count = sqlalchemy.Column(sqlalchemy.Integer, nullable=False, default=0)
    # Optional free-text and URL fields.
    about_the_game = sqlalchemy.Column(sqlalchemy.Text, nullable=True)
    header_image = sqlalchemy.Column(sqlalchemy.Text, nullable=True)
    website = sqlalchemy.Column(sqlalchemy.Text, nullable=True)
    support_url = sqlalchemy.Column(sqlalchemy.Text, nullable=True)
    support_email = sqlalchemy.Column(sqlalchemy.Text, nullable=True)
    # Platform flags.
    windows = sqlalchemy.Column(sqlalchemy.Boolean, nullable=False, default=False)
    mac = sqlalchemy.Column(sqlalchemy.Boolean, nullable=False, default=False)
    linux = sqlalchemy.Column(sqlalchemy.Boolean, nullable=False, default=False)
    # Optional review, score and playtime figures.
    metacritic_score = sqlalchemy.Column(sqlalchemy.Float, nullable=True)
    metacritic_url = sqlalchemy.Column(sqlalchemy.Text, nullable=True)
    user_score = sqlalchemy.Column(sqlalchemy.Float, nullable=True)
    positive = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    negative = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    score_rank = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    achievements = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    recommendations = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    average_playtime_forever = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    average_playtime_2weeks = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    median_playtime_forever = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)
    median_playtime_2weeks = sqlalchemy.Column(sqlalchemy.Integer, nullable=True)

    # M:N RELATIONSHIPS (each goes through its association table)
    developers = orm.relationship("Developer", secondary=game_developer, back_populates="games")
    publishers = orm.relationship("Publisher", secondary=game_publisher, back_populates="games")
    categories = orm.relationship("Category", secondary=game_category, back_populates="games")
    genres = orm.relationship("Genre", secondary=game_genre, back_populates="games")
    tags = orm.relationship("Tag", secondary=game_tag, back_populates="games")
    languages = orm.relationship("Language", secondary=game_language, back_populates="games")


In [None]:
# Emit CREATE TABLE for the models registered on Base, then open the session
# shared by the following cells.
Base.metadata.create_all(bind=engine)
session = orm.Session(bind=engine)

In [None]:
# drop all tables
# Base.metadata.drop_all(engine)

In [None]:
# Show a single row to inspect the raw column values.
games_dataset.head(1)

### Cadastrar tags no postgres

In [None]:
# Non-null values of the Tags column, coerced to str.
tag_column = games_dataset['Tags'].dropna().astype(str)

# Each cell is a comma-separated list: gather the distinct trimmed, lower-cased names.
unique_tags = set()
for tag_string in tag_column:
    unique_tags |= {piece.strip().lower() for piece in tag_string.split(',')}

print("Total únicos encontrados:", len(unique_tags))


# Names already stored in the database.
existing_tags = {row.name.lower() for row in session.query(Tag).all()}

# Build ORM objects only for the names that are not stored yet.
new_tags = []
for tag in unique_tags:
    if tag in existing_tags:
        continue
    new_tags.append(Tag(name=tag))

# One bulk save followed by a single commit.
session.bulk_save_objects(new_tags)
session.commit()

print(f"Adicionadas: {len(new_tags)} tags")


In [None]:
# cria um dicionário {nome_lowercase: id} para lookup rápido
tag_map = {
    tag.name.lower(): tag.id
    for tag in session.query(Tag).all()
}

def get_tag_id(tag_names):
    if not isinstance(tag_names, str):
        return None
    
    ids = []
    for name in map(str.strip, tag_names.lower().split(',')):
        tag_id = tag_map.get(name)
        if tag_id is None:
            print(f"Tag {name} não encontrada")
        else:
            ids.append(tag_id)
    return ids

# aplica ao dataset
games_dataset['Tags'] = games_dataset['Tags'].apply(get_tag_id)

print("Tags trocadas com sucesso!")

### Cadastrar developers no Postgres

In [None]:
# Non-null values of the Developers column, coerced to str.
developer_column = games_dataset['Developers'].dropna().astype(str)

# Each cell is a comma-separated list: gather the distinct trimmed, lower-cased names.
unique_developers = set()
for dev_string in developer_column:
    unique_developers |= {piece.strip().lower() for piece in dev_string.split(',')}

print("Total únicos encontrados:", len(unique_developers))


# Names already stored in the database.
existing = {row.name.lower() for row in session.query(Developer).all()}

# Build ORM objects only for the names that are not stored yet.
new_developers = []
for name in unique_developers:
    if name in existing:
        continue
    new_developers.append(Developer(name=name))

# Insert in slices of BATCH_SIZE, committing after each slice.
for start in range(0, len(new_developers), BATCH_SIZE):
    session.add_all(new_developers[start:start + BATCH_SIZE])
    session.commit()

print(f"Adicionados: {len(new_developers)} developers")


In [None]:
# Lookup table {lower-cased developer name: id}.
developer_map = {row.name.lower(): row.id for row in session.query(Developer).all()}


def get_developer_id(developer_names):
    """Translate a comma-separated string of developer names into a list of ids.

    Returns None when ``developer_names`` is not a string. Names missing from
    ``developer_map`` are reported with print() and left out of the result.
    """
    if not isinstance(developer_names, str):
        return None

    ids = []
    for name in (piece.strip() for piece in developer_names.lower().split(',')):
        dev_id = developer_map.get(name)
        if dev_id is not None:
            ids.append(dev_id)
            continue
        print(f"Developer {name} não encontrado")
    return ids


# Replace the raw strings in the column with lists of ids.
games_dataset['Developers'] = games_dataset['Developers'].apply(get_developer_id)

print("Developers trocados com sucesso!")


### Cadastrar publishers no Postgres

In [None]:
# Non-null values of the Publishers column, coerced to str.
publisher_column = games_dataset['Publishers'].dropna().astype(str)

# Each cell is a comma-separated list: gather the distinct trimmed, lower-cased names.
unique_publishers = set()
for pub_string in publisher_column:
    unique_publishers |= {piece.strip().lower() for piece in pub_string.split(',')}

print("Total únicos encontrados:", len(unique_publishers))


# Names already stored in the database.
existing = {row.name.lower() for row in session.query(Publisher).all()}

# Build ORM objects only for the names that are not stored yet.
new_publishers = []
for name in unique_publishers:
    if name in existing:
        continue
    new_publishers.append(Publisher(name=name))

# Insert in slices of BATCH_SIZE, committing after each slice.
for start in range(0, len(new_publishers), BATCH_SIZE):
    session.add_all(new_publishers[start:start + BATCH_SIZE])
    session.commit()

print(f"Adicionados: {len(new_publishers)} publishers")

In [None]:
# Lookup table {lower-cased publisher name: id}.
publisher_map = {row.name.lower(): row.id for row in session.query(Publisher).all()}


def get_publisher_id(publisher_names):
    """Translate a comma-separated string of publisher names into a list of ids.

    Returns None when ``publisher_names`` is not a string. Names missing from
    ``publisher_map`` are reported with print() and left out of the result.
    """
    if not isinstance(publisher_names, str):
        return None

    ids = []
    for name in (piece.strip() for piece in publisher_names.lower().split(',')):
        pub_id = publisher_map.get(name)
        if pub_id is not None:
            ids.append(pub_id)
            continue
        print(f"Publisher {name} não encontrado")
    return ids


# Replace the raw strings in the column with lists of ids.
games_dataset['Publishers'] = games_dataset['Publishers'].apply(get_publisher_id)

print("Publishers trocados com sucesso!")


### Cadastrar categorias no Postgres

In [None]:
# Non-null values of the Categories column, coerced to str.
categories_column = games_dataset['Categories'].dropna().astype(str)

# Each cell is a comma-separated list: gather the distinct trimmed, lower-cased names.
unique_categories = set()
for cat_string in categories_column:
    unique_categories |= {piece.strip().lower() for piece in cat_string.split(',')}

print("Total únicos encontrados:", len(unique_categories))


# Names already stored in the database.
existing = {row.name.lower() for row in session.query(Category).all()}

# Build ORM objects only for the names that are not stored yet.
new_categories = []
for name in unique_categories:
    if name in existing:
        continue
    new_categories.append(Category(name=name))

# Insert in slices of BATCH_SIZE, committing after each slice.
for start in range(0, len(new_categories), BATCH_SIZE):
    session.add_all(new_categories[start:start + BATCH_SIZE])
    session.commit()

print(f"Adicionados: {len(new_categories)} categories")

In [None]:
# Lookup table {lower-cased category name: id}.
category_map = {row.name.lower(): row.id for row in session.query(Category).all()}


def get_category_id(category_names):
    """Translate a comma-separated string of category names into a list of ids.

    Returns None when ``category_names`` is not a string. Names missing from
    ``category_map`` are reported with print() and left out of the result.
    """
    if not isinstance(category_names, str):
        return None

    ids = []
    for name in (piece.strip() for piece in category_names.lower().split(',')):
        cat_id = category_map.get(name)
        if cat_id is not None:
            ids.append(cat_id)
            continue
        print(f"Category {name} não encontrada")
    return ids


# Replace the raw strings in the column with lists of ids.
games_dataset['Categories'] = games_dataset['Categories'].apply(get_category_id)

print("Categories trocadas com sucesso!")


### Cadastrar gêneros no Postgres

In [None]:
# Non-null values of the Genres column, coerced to str.
genres_column = games_dataset['Genres'].dropna().astype(str)

# Each cell is a comma-separated list: gather the distinct trimmed, lower-cased names.
unique_genres = set()
for genre_string in genres_column:
    unique_genres |= {piece.strip().lower() for piece in genre_string.split(',')}

print("Total únicos encontrados:", len(unique_genres))

# Names already stored in the database.
existing = {row.name.lower() for row in session.query(Genre).all()}

# Build ORM objects only for the names that are not stored yet.
new_genres = []
for name in unique_genres:
    if name in existing:
        continue
    new_genres.append(Genre(name=name))

# Insert in slices of BATCH_SIZE, committing after each slice.
for start in range(0, len(new_genres), BATCH_SIZE):
    session.add_all(new_genres[start:start + BATCH_SIZE])
    session.commit()

print(f"Adicionados: {len(new_genres)} genres")

In [None]:
# Lookup table {lower-cased genre name: id}.
genre_map = {row.name.lower(): row.id for row in session.query(Genre).all()}


def get_genre_id(genre_names):
    """Translate a comma-separated string of genre names into a list of ids.

    Returns None when ``genre_names`` is not a string. Names missing from
    ``genre_map`` are reported with print() and left out of the result.
    """
    if not isinstance(genre_names, str):
        return None

    ids = []
    for name in (piece.strip() for piece in genre_names.lower().split(',')):
        genre_id = genre_map.get(name)
        if genre_id is not None:
            ids.append(genre_id)
            continue
        print(f"Genre {name} não encontrado")
    return ids


# Replace the raw strings in the column with lists of ids.
games_dataset['Genres'] = games_dataset['Genres'].apply(get_genre_id)

print("Genres trocados com sucesso!")

### Cadastrar languages no Postgres

In [None]:
def fix_brackets(m):
    """Re-quote every item of a bracketed, comma-separated list.

    Meant to be used as the replacement callback of ``re.sub``:
    ``m.group(1)`` is the text found between the brackets. Each item is
    trimmed, stripped of surrounding double quotes and wrapped in exactly one
    pair of double quotes.

    Returns the rebuilt ``[...]`` text with the items joined by ",".
    """
    # The quote character is kept out of the f-string expression: a backslash
    # inside an f-string expression is a SyntaxError before Python 3.12.
    quote = '"'
    items = [
        f"{quote}{item.strip().strip(quote)}{quote}"
        for item in m.group(1).split(",")
    ]
    return "[" + ",".join(items) + "]"

def safe_parse_languages(s):
    """Parse the textual list of a "Supported languages" cell into a list.

    Strategies are tried in order and the first usable result wins:

    1. ``json.loads`` (only when the text starts with "[");
    2. ``ast.literal_eval``;
    3. ``json.loads`` again after repairing the quoting;
    4. manual fallback: drop the brackets and split on commas.

    A parsed list is returned as-is, a tuple is converted to a list and a
    single string is wrapped in a one-item list, so callers can always iterate
    over items. Any other parsed value (number, dict, None, ...) is treated as
    a failed attempt.
    """
    original = s.strip()

    # Everything json.loads / ast.literal_eval raise on malformed input
    # (json.JSONDecodeError is a ValueError).
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    def as_list(value):
        # Normalise a parsed value to a list, or None when it is unusable.
        if isinstance(value, list):
            return value
        if isinstance(value, tuple):
            return list(value)
        if isinstance(value, str):
            return [value]
        return None

    # 1. Plain JSON (only when it starts with "[")
    if original.startswith("["):
        try:
            parsed = as_list(json.loads(original))
        except parse_errors:
            parsed = None
        if parsed is not None:
            return parsed

    # 2. Python literal
    try:
        parsed = as_list(ast.literal_eval(original))
    except parse_errors:
        parsed = None
    if parsed is not None:
        return parsed

    # 3. Repair malformed text
    # 3.1 Replace single quotes with double quotes
    fixed = original.replace("'", '"')

    # 3.2 Wrap every word that is directly followed by a double quote
    fixed = re.sub(r'(\w+)"', r'"\1"', fixed)

    # 3.3 Re-quote each item of every bracketed list
    # e.g. [English, French] -> ["English","French"]
    fixed = re.sub(r"\[(.*?)\]", fix_brackets, fixed)

    # 4. JSON again, on the repaired text
    try:
        parsed = as_list(json.loads(fixed))
    except parse_errors:
        parsed = None
    if parsed is not None:
        return parsed

    # 5. Manual fallback: drop the brackets and split on commas
    return [
        piece.strip().strip('"').strip("'")
        for piece in original.strip("[]").split(",")
    ]

# Multi-word language names that must stay whole instead of being split into words.
_COMPOUND_LANGUAGES = frozenset({"simplified chinese", "traditional chinese"})


def clean_language(raw):
    """Turn one raw language entry into a list of cleaned, lower-cased names.

    A single entry can hide several languages (separated by ",", "|", ";",
    "/", line breaks, or just spaces), so a list is returned. Non-string
    input yields an empty list.

    Known compound names ("simplified chinese", "traditional chinese") are
    kept whole; any other run of purely alphabetic words is treated as
    several languages written without a separator.
    """
    if not isinstance(raw, str):
        return []

    # 1) Decode HTML entities
    text = html.unescape(raw)

    # 2) Remove HTML tags and BBCode tags
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\[/?[a-zA-Z0-9]+\]', '', text)

    # 3) Line breaks act as separators
    text = text.replace("\r", ",").replace("\n", ",")

    # 4) Split entries that arrive glued together
    parts = re.split(r'[,\|;/]+', text)

    cleaned = []

    for item in parts:
        item = item.strip().lower()

        if not item:
            continue

        # Drop hashtag placeholders (e.g. "#lang_français")
        if item.startswith("#"):
            continue

        # Remove leftovers of badly escaped HTML (lt, gt, amp, strong)
        item = re.sub(r'\b(lt|gt|amp|strong)\b', '', item)
        item = item.replace("&lt", "").replace("&gt", "").replace("&amp", "")

        # Strip anything that is not a-z / 0-9 from both ends
        item = re.sub(r'^[^a-z0-9]+|[^a-z0-9]+$', '', item)

        # Normalise Unicode to NFKC form
        item = unicodedata.normalize("NFKC", item)

        # Collapse the "text only" variants into the plain compound name
        item = item.replace("simplified chinese text only", "simplified chinese")
        item = item.replace("traditional chinese text only", "traditional chinese")

        # Collapse an immediately repeated word ("english english")
        item = re.sub(r'\b(\w+)\s+\1\b', r'\1', item)

        # A run of alphabetic words ("english dutch english") is several
        # languages without separators. Known compound names are exempt,
        # otherwise "simplified chinese" would be stored as "simplified" and
        # "chinese".
        words = item.split()
        if (
            item not in _COMPOUND_LANGUAGES
            and len(words) > 1
            and all(w.isalpha() for w in words)
        ):
            cleaned.extend(words)
            continue

        # Restore k'iche' (its trailing apostrophe was stripped above)
        if "k'iche" in item:
            item = "k'iche'"

        # Close a parenthesis that was left open
        if "(" in item and ")" not in item:
            item += ")"

        # Discard the entry if nothing is left
        item = item.strip()
        if item:
            cleaned.append(item)

    return cleaned


# ---- column processing ----

# Non-null values of the "Supported languages" column, coerced to str.
languages_column = games_dataset['Supported languages'].dropna().astype(str)

# Parse every cell into entries, clean each entry and keep the distinct names.
unique_languages = set()
for lang_string in languages_column:
    for name in safe_parse_languages(lang_string):
        unique_languages.update(cleaned for cleaned in clean_language(name) if cleaned)

sorted_unique_languages = sorted(unique_languages)
print(sorted_unique_languages)
print("Total únicos encontrados:", len(unique_languages))

# Names already stored in the database.
existing = {row.name.lower() for row in session.query(Language).all()}

# Build ORM objects only for the names that are not stored yet.
new_languages = []
for name in unique_languages:
    if name in existing:
        continue
    new_languages.append(Language(name=name))

# Insert in slices of BATCH_SIZE, committing after each slice.
for start in range(0, len(new_languages), BATCH_SIZE):
    session.add_all(new_languages[start:start + BATCH_SIZE])
    session.commit()

print(f"Adicionados: {len(new_languages)} languages")


In [None]:
# Lookup table {lower-cased language name: id}.
language_map = {row.name.lower(): row.id for row in session.query(Language).all()}


def get_language_id(language_names):
    """Translate a raw "Supported languages" cell into a list of language ids.

    Returns None when ``language_names`` is not a string. The text is parsed
    with safe_parse_languages() and each entry cleaned with clean_language()
    before the lookup, because the raw names are very dirty. Names missing
    from ``language_map`` are reported with print() and left out.
    """
    if not isinstance(language_names, str):
        return None

    ids = []
    for name in safe_parse_languages(language_names):
        for cleaned in clean_language(name):
            if not cleaned:
                continue
            lang_id = language_map.get(cleaned)
            if lang_id is not None:
                ids.append(lang_id)
            else:
                print(f"Language {cleaned} not found")
    return ids


# Replace the raw strings in the column with lists of ids.
games_dataset['Supported languages'] = games_dataset['Supported languages'].apply(get_language_id)

print("Languages trocados com sucesso!")

### Cadastrar games no Postgres

In [None]:

def get_estimated_owners(estimated_owners):
    """Split an "Estimated owners" range such as "0 - 20000" into two ints.

    Returns a ``(lower, upper)`` tuple. An empty side of the range counts as
    0. A value without a dash (e.g. "1000000") is used for both bounds
    instead of failing on unpacking.

    Raises ValueError when a bound is not an integer.
    """
    # partition() always yields three parts, so a missing dash cannot break
    # the unpacking the way split('-') did.
    lower, dash, upper = estimated_owners.partition('-')
    lower = lower.strip()
    upper = upper.strip()

    if not dash:
        upper = lower

    return int(lower or 0), int(upper or 0)


def parse_steam_date(s):
    """Convert a Steam release-date string into a ``datetime.date``.

    Two layouts are understood: ``"Oct 21, 2008"`` (``%b %d, %Y``) and the
    day-less ``"Aug 2023"`` (``%b %Y``), which resolves to the first day of
    that month.

    Returns:
        The parsed date, or None when ``s`` is not a string (e.g. a NaN
        cell), is blank, is "coming soon" in any letter case, or matches
        neither layout.
    """
    if not isinstance(s, str):
        return None

    s = s.strip()
    if not s or s.lower() == "coming soon":
        return None

    # Try the full layout first, then the month/year-only one.
    for date_format in ("%b %d, %Y", "%b %Y"):
        try:
            return datetime.strptime(s, date_format).date()
        except ValueError:
            continue
    return None

def _none_if_blank(value):
    """Return None when ``value`` is '', 'nan' or a pandas null; otherwise return it unchanged."""
    if value == '' or value == 'nan' or pandas.isna(value):
        return None
    return value


# Build one Game ORM object per dataset row.
database_games = []
for _, row in games_dataset.iterrows():
    release_date = parse_steam_date(row['Release date'])

    # The raw cell is a string such as "1000000 - 2000000" or "0 - 1000".
    estimated_owners_lower, estimated_owners_upper = get_estimated_owners(row['Estimated owners'])

    database_games.append(Game(
        app_id=row['AppID'],
        name=row['Name'],
        release_date=release_date,
        estimated_owners_lower=estimated_owners_lower,
        estimated_owners_upper=estimated_owners_upper,
        peak_ccu=row['Peak CCU'],
        required_age=row['Required age'],
        price=row['Price'],
        discount=row['Discount'],
        dlc_count=row['DLC count'],
        # Optional columns: blank or missing cells are stored as None.
        about_the_game=_none_if_blank(row['About the game']),
        header_image=_none_if_blank(row['Header image']),
        website=_none_if_blank(row['Website']),
        support_url=_none_if_blank(row['Support url']),
        support_email=_none_if_blank(row['Support email']),
        windows=row['Windows'],
        mac=row['Mac'],
        linux=row['Linux'],
        metacritic_score=_none_if_blank(row['Metacritic score']),
        metacritic_url=_none_if_blank(row['Metacritic url']),
        user_score=row['User score'],
        positive=row['Positive'],
        negative=row['Negative'],
        score_rank=_none_if_blank(row['Score rank']),
        achievements=row['Achievements'],
        recommendations=row['Recommendations'],
        average_playtime_forever=row['Average playtime forever'],
        average_playtime_2weeks=row['Average playtime two weeks'],
        median_playtime_forever=row['Median playtime forever'],
        median_playtime_2weeks=row['Median playtime two weeks'],
    ))

# Insert in batches of BATCH_SIZE, committing after each batch.
for i in range(0, len(database_games), BATCH_SIZE):
    batch = database_games[i:i+BATCH_SIZE]
    session.add_all(batch)
    session.commit()
    print(f"Processando {i} de {len(database_games)} games")

print(f"Adicionados: {len(database_games)} games")


#### Vincular Games com Desenvolvedores

In [None]:
# Association table that links games to developers.
association_table = Game.__table__.metadata.tables["game_developer"]

# Lookup tables: developers keyed by id, games keyed by the dataset's AppID.
developers = {dev.id: dev for dev in session.query(Developer).all()}
games = {stored.app_id: stored for stored in session.query(Game).all()}

# (game_id, developer_id) pairs that are already linked. Seeded from the
# database and extended as rows are queued, so every pair is inserted once.
existing_pairs = {
    (game_id, developer_id)
    for game_id, developer_id in session.execute(association_table.select())
}

batch = []
total_inserted = 0

for row in games_dataset.itertuples():
    game = games.get(row.AppID)
    # Only rows with a known game and a list in the Developers cell can be linked.
    if game and isinstance(row.Developers, list):
        for dev_id in row.Developers:
            if dev_id not in developers:
                continue

            pair = (game.id, dev_id)
            if pair not in existing_pairs:
                existing_pairs.add(pair)
                batch.append({"game_id": game.id, "developer_id": dev_id})

                # Send the queued rows once a full batch has accumulated.
                if len(batch) >= BATCH_SIZE:
                    session.execute(sqlalchemy.insert(association_table), batch)
                    total_inserted += len(batch)
                    batch.clear()

# Send whatever is left from the last, partial batch.
if batch:
    session.execute(sqlalchemy.insert(association_table), batch)
    total_inserted += len(batch)
    batch.clear()

session.commit()

print(f"Inseridos {total_inserted} novos relacionamentos Game–Developer.")


#### Vincular Games com Publicadores

In [None]:
# Association table that links games to publishers.
association_table = Game.__table__.metadata.tables["game_publisher"]

# Lookup tables: publishers keyed by id, games keyed by the dataset's AppID.
publishers = {pub.id: pub for pub in session.query(Publisher).all()}
games = {stored.app_id: stored for stored in session.query(Game).all()}

# (game_id, publisher_id) pairs that are already linked. Seeded from the
# database and extended as rows are queued, so every pair is inserted once.
existing_pairs = {
    (game_id, publisher_id)
    for game_id, publisher_id in session.execute(association_table.select())
}

batch = []
total_inserted = 0

for row in games_dataset.itertuples():
    game = games.get(row.AppID)
    # Only rows with a known game and a list in the Publishers cell can be linked.
    if game and isinstance(row.Publishers, list):
        for pub_id in row.Publishers:
            if pub_id not in publishers:
                continue

            pair = (game.id, pub_id)
            if pair not in existing_pairs:
                existing_pairs.add(pair)
                batch.append({"game_id": game.id, "publisher_id": pub_id})

                # Send the queued rows once a full batch has accumulated.
                if len(batch) >= BATCH_SIZE:
                    session.execute(sqlalchemy.insert(association_table), batch)
                    total_inserted += len(batch)
                    batch.clear()

# Send whatever is left from the last, partial batch.
if batch:
    session.execute(sqlalchemy.insert(association_table), batch)
    total_inserted += len(batch)
    batch.clear()

session.commit()

print(f"Inseridos {total_inserted} novos relacionamentos Game Publisher.")

#### Vincular Games com Categorias

In [None]:
# Association table that links games to categories.
association_table = Game.__table__.metadata.tables["game_category"]

# Lookup tables: categories keyed by id, games keyed by the dataset's AppID.
categories = {cat.id: cat for cat in session.query(Category).all()}
games = {stored.app_id: stored for stored in session.query(Game).all()}

# (game_id, category_id) pairs that are already linked. Seeded from the
# database and extended as rows are queued, so every pair is inserted once.
existing_pairs = {
    (game_id, category_id)
    for game_id, category_id in session.execute(association_table.select())
}

batch = []
total_inserted = 0

for row in games_dataset.itertuples():
    game = games.get(row.AppID)
    # Only rows with a known game and a list in the Categories cell can be linked.
    if game and isinstance(row.Categories, list):
        for cat_id in row.Categories:
            if cat_id not in categories:
                continue

            pair = (game.id, cat_id)
            if pair not in existing_pairs:
                existing_pairs.add(pair)
                batch.append({"game_id": game.id, "category_id": cat_id})

                # Send the queued rows once a full batch has accumulated.
                if len(batch) >= BATCH_SIZE:
                    session.execute(sqlalchemy.insert(association_table), batch)
                    total_inserted += len(batch)
                    batch.clear()

# Send whatever is left from the last, partial batch.
if batch:
    session.execute(sqlalchemy.insert(association_table), batch)
    total_inserted += len(batch)
    batch.clear()

session.commit()

print(f"Inseridos {total_inserted} novos relacionamentos Game Category.")

#### Vincular Games com Gêneros

In [None]:
# Association table that links games to genres.
association_table = Game.__table__.metadata.tables["game_genre"]

# Lookup tables: genres keyed by id, games keyed by the dataset's AppID.
genres = {genre.id: genre for genre in session.query(Genre).all()}
games = {stored.app_id: stored for stored in session.query(Game).all()}

# (game_id, genre_id) pairs that are already linked. Seeded from the
# database and extended as rows are queued, so every pair is inserted once.
existing_pairs = {
    (game_id, genre_id)
    for game_id, genre_id in session.execute(association_table.select())
}

batch = []
total_inserted = 0

for row in games_dataset.itertuples():
    game = games.get(row.AppID)
    # Only rows with a known game and a list in the Genres cell can be linked.
    if game and isinstance(row.Genres, list):
        for genre_id in row.Genres:
            if genre_id not in genres:
                continue

            pair = (game.id, genre_id)
            if pair not in existing_pairs:
                existing_pairs.add(pair)
                batch.append({"game_id": game.id, "genre_id": genre_id})

                # Send the queued rows once a full batch has accumulated.
                if len(batch) >= BATCH_SIZE:
                    session.execute(sqlalchemy.insert(association_table), batch)
                    total_inserted += len(batch)
                    batch.clear()

# Send whatever is left from the last, partial batch.
if batch:
    session.execute(sqlalchemy.insert(association_table), batch)
    total_inserted += len(batch)
    batch.clear()

session.commit()

print(f"Inseridos {total_inserted} novos relacionamentos Game Genre.")

#### Vincular Games com Tags

In [None]:
# Association table that links games to tags.
association_table = Game.__table__.metadata.tables["game_tag"]

# Lookup tables: tags keyed by id, games keyed by the dataset's AppID.
tags = {tag.id: tag for tag in session.query(Tag).all()}
games = {stored.app_id: stored for stored in session.query(Game).all()}

# (game_id, tag_id) pairs that are already linked. Seeded from the database
# and extended as rows are queued, so every pair is inserted once.
existing_pairs = {
    (game_id, tag_id)
    for game_id, tag_id in session.execute(association_table.select())
}

batch = []
total_inserted = 0

for row in games_dataset.itertuples():
    game = games.get(row.AppID)
    # Only rows with a known game and a list in the Tags cell can be linked.
    if game and isinstance(row.Tags, list):
        for tag_id in row.Tags:
            if tag_id not in tags:
                continue

            pair = (game.id, tag_id)
            if pair not in existing_pairs:
                existing_pairs.add(pair)
                batch.append({"game_id": game.id, "tag_id": tag_id})

                # Send the queued rows once a full batch has accumulated.
                if len(batch) >= BATCH_SIZE:
                    session.execute(sqlalchemy.insert(association_table), batch)
                    total_inserted += len(batch)
                    batch.clear()

# Send whatever is left from the last, partial batch.
if batch:
    session.execute(sqlalchemy.insert(association_table), batch)
    total_inserted += len(batch)
    batch.clear()

session.commit()

print(f"Inseridos {total_inserted} novos relacionamentos Game Tag.")

#### Vincular Games com Línguas

In [None]:
# Association table that links games to their supported languages.
association_table = Game.__table__.metadata.tables["game_language"]

# 1. Load all languages (keyed by id) and games (keyed by the dataset's AppID).
languages = {l.id: l for l in session.query(Language).all()}
games = {g.app_id: g for g in session.query(Game).all()}

# 2. Track (game_id, language_id) pairs so nothing is inserted twice,
#    starting from the pairs already stored in the database.
existing_pairs = set()

rows = session.execute(association_table.select())
for game_id, language_id in rows:
    existing_pairs.add((game_id, language_id))

batch = []
total_inserted = 0

# 3. Process the dataset.
# "Supported languages" is not a valid Python identifier, so itertuples()
# exposes it under a positional name and row.Supported_languages raises
# AttributeError. Reading the two columns directly avoids that.
for app_id, language_ids in zip(games_dataset['AppID'], games_dataset['Supported languages']):
    game = games.get(app_id)
    # Cells that were not converted to a list of ids have nothing to link.
    if not game or not isinstance(language_ids, list):
        continue

    for lang_id in language_ids:
        if lang_id not in languages:
            continue

        pair = (game.id, lang_id)
        if pair in existing_pairs:
            continue  # already linked, skip

        existing_pairs.add(pair)  # mark as queued
        batch.append({"game_id": game.id, "language_id": lang_id})

        if len(batch) >= BATCH_SIZE:
            session.execute(sqlalchemy.insert(association_table), batch)
            total_inserted += len(batch)
            batch.clear()

# 4. Insert the last, partial batch.
if batch:
    session.execute(sqlalchemy.insert(association_table), batch)
    total_inserted += len(batch)
    batch.clear()

session.commit()

print(f"Inseridos {total_inserted} novos relacionamentos Game Language.")

### Cadastrar movies no Postgres

### Cadastrar screenshots no Postgres

In [None]:
# Preview the first rows of games_dataset (DataFrame.head() returns 5 rows by default).
games_dataset.head()