In [1]:
# Imports

import pandas as pd
import numpy as np

# Tirar warnigns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Abre o CSV em um DataFrame
data = pd.read_csv('steam.csv')

# Remove identificador interno do jogo na steam
data = data.drop(columns="appid")

# Transforma todas as strings dentro do dataframe em minusculo
for coluna in ["developer", "publisher", "platforms", "categories", "genres", "steamspy_tags"]:
    data[coluna] = data[coluna].str.lower()

# Printa um exemplo
data.head(1)

Unnamed: 0,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,Counter-Strike,2000-11-01,1,valve,valve,windows;mac;linux,0,multi-player;online multi-player;local multi-p...,action,action;fps;multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19


In [3]:
# Separa a quantidade de pessoas que possuim o jogo em seu maximo e minimo

# Lista de suporte
owners_min = []
owners_max = []

# Itera sobre os valores das quatidades
for amount in data["owners"]:
    # Separa os valores de minimo e maximo
    amounts = amount.split("-")
    
    # Adiciona o valor respectivo a lista
    owners_min.append(int(amounts[0]))
    owners_max.append(int(amounts[1]))

# Gera as duas novas colunas
data["owners min"] = owners_min    
data["owners max"] = owners_max

# Remove coluna antiga
data = data.drop(columns="owners")

# Printa um exemplo
data.head(1)

Unnamed: 0,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,price,owners min,owners max
0,Counter-Strike,2000-11-01,1,valve,valve,windows;mac;linux,0,multi-player;online multi-player;local multi-p...,action,action;fps;multiplayer,0,124534,3339,17612,317,7.19,10000000,20000000


In [4]:
# Separa a data em ano dia e mês

# Listas de suporte
year = []
month = []
day = []

# Itera sobre a data de lancemento
for time in data["release_date"]:
    # Separa em seus valores
    times = time.split("-")
    
    # Separa em ano mês e dia
    year.append(times[0])
    month.append(times[1])
    day.append(times[2])

# Gera as novas colunas
data["release_year"] = year
data["release_month"] = month
data["release_day"] = day

# Remove coluna antiga
data = data.drop(columns="release_date")

# Printa um exemplo
data.head(1)

Unnamed: 0,name,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,price,owners min,owners max,release_year,release_month,release_day
0,Counter-Strike,1,valve,valve,windows;mac;linux,0,multi-player;online multi-player;local multi-p...,action,action;fps;multiplayer,0,124534,3339,17612,317,7.19,10000000,20000000,2000,11,1


In [5]:
# Transforma todas as colunas quali em dummy

for quali in ["platforms","categories","genres", "steamspy_tags"]:
    new_c = 0
    
    # Para cada elemento da coluna quali
    for element in set(data[quali].str.cat(sep=";").split(";")):
        # Verfica para eveitar duplicatas (SteamSpy)
        if element not in data: 
        # Transforma em uma coluna dummy
            data[element] = np.where(data[quali].str.contains(element), 1, 0)
            new_c += 1
    
    print("Coluna Quali \"{0}\" processada, gerando {1} colunas dummy".format(quali, new_c))
    data = data.drop(columns=quali)
    
data.head(1)

Coluna Quali "platforms" processada, gerando 3 colunas dummy
Coluna Quali "categories" processada, gerando 29 colunas dummy
Coluna Quali "genres" processada, gerando 29 colunas dummy
Coluna Quali "steamspy_tags" processada, gerando 309 colunas dummy


Unnamed: 0,name,english,developer,publisher,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,...,cartoony,sniper,choose your own adventure,economy,memes,side scroller,diplomacy,sci-fi,2.5d,third person
0,Counter-Strike,1,valve,valve,0,0,124534,3339,17612,317,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Separa os desenvolvedores e publicadores

options = []
options.extend(data["developer"].str.cat(sep=";").split(";"))
options.extend(data["publisher"].str.cat(sep=";").split(";"))

new_c = 0
error = 0

for quali in set(options):
    if quali not in data:
        try:
            data[quali] = np.where(data["publisher"].str.contains(quali) | data["developer"].str.contains(quali), 1, 0)
            new_c += 1
        except:
            print("ANTI-BUG: Ignorando Linha \"{0}\"".format(quali))
            print()
            error += 1
    
print("Desenvolvedores e publicadores processados, gerando {0} colunas dummy com {1} linhas puladas".format(new_c, error))

data = data.drop(columns="developer")
data = data.drop(columns="publisher")

data.head(1)

ANTI-BUG: Ignorando Linha "][ games inc"

ANTI-BUG: Ignorando Linha "++good games"

ANTI-BUG: Ignorando Linha "+mpact games, llc."

ANTI-BUG: Ignorando Linha "+7 software"

ANTI-BUG: Ignorando Linha "games++"

Desenvolvedores e publicadores processados, gerando 20044 colunas dummy com 5 linhas puladas


Unnamed: 0,name,english,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,price,owners min,...,feperd games,axis granted creations,polinc games,flight systems llc,newrealitygames,101xp,tgi game studio,yang,funktronic labs,gbelo games
0,Counter-Strike,1,0,0,124534,3339,17612,317,7.19,10000000,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Salva o DataFram para um CSV

data.to_csv("steam_processada.csv")

print("CSV SALVADO!")

CSV SALVADO!


.