# PASO 0: Importar bibliotecas y leer datos.

In [105]:
# Importar las librerias a utilizar.

import ast
import pandas as pd

In [106]:
# Ruta del archivo JSON.
rt_json = 'steam_games.json'

# Carga del archivo JSON en un DataFrame.

rows = []               # Lista vacía para almacenar las filas (registros) de datos que obtendremos del archivo JSON.
with open(rt_json) as f:        # Abrimos el archivo JSON en modo lectura usando un bloque 'with', lo que asegura que se cierre correctamente después de su uso.
    for line in f.readlines():          # Iteramos sobre cada línea del archivo JSON usando el método 'readlines()', que devuelve una lista de todas las líneas del archivo.
        rows.append(ast.literal_eval(line))     # Para cada línea, convertimos su contenido en un diccionario Python utilizando 'ast.literal_eval()'.

steam_games = pd.DataFrame(rows)

# PASO 1: Comprensión de datos.

* Dataframe shape
* head y tail
* dtypes
* describe

In [107]:
steam_games.shape

(32135, 16)

In [108]:
steam_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,,


In [109]:
steam_games.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'discount_price', 'reviews_url', 'specs', 'price',
       'early_access', 'id', 'developer', 'sentiment', 'metascore'],
      dtype='object')

In [110]:
steam_games.dtypes

publisher          object
genres             object
app_name           object
title              object
url                object
release_date       object
tags               object
discount_price    float64
reviews_url        object
specs              object
price              object
early_access         bool
id                 object
developer          object
sentiment          object
metascore          object
dtype: object

In [111]:
steam_games.describe()

Unnamed: 0,discount_price
count,225.0
mean,11.930533
std,17.492643
min,0.49
25%,1.39
50%,4.19
75%,22.66
max,139.99


# PASO 2: Preparación de datos.

* Quitar columnas y filas irrelevantes
* Renombrar columnas
* Identificar columnas y filas duplicadas

In [112]:
steam_games.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'discount_price', 'reviews_url', 'specs', 'price',
       'early_access', 'id', 'developer', 'sentiment', 'metascore'],
      dtype='object')

In [113]:
#Quitamos las columnas que no necesitamos, en este caso solo pegamos todas y las quitamos manualmente

steam_games = steam_games[['publisher', 'genres', 'title', 'release_date',
                           'tags', 'discount_price', 'specs', 'price',
                           'early_access', 'id', 'developer', 'sentiment', 'metascore']].copy()

In [114]:
steam_games.columns

Index(['publisher', 'genres', 'title', 'release_date', 'tags',
       'discount_price', 'specs', 'price', 'early_access', 'id', 'developer',
       'sentiment', 'metascore'],
      dtype='object')

In [115]:
# Preparamos el tipo de dato en la columna "release_date".

steam_games = steam_games.dropna(subset = ['release_date'])
steam_games['release_date'] = pd.to_datetime(steam_games['release_date'], format='%Y-%m-%d', errors='coerce')
steam_games.dtypes

publisher                 object
genres                    object
title                     object
release_date      datetime64[ns]
tags                      object
discount_price           float64
specs                     object
price                     object
early_access                bool
id                        object
developer                 object
sentiment                 object
metascore                 object
dtype: object

In [116]:
steam_games['release_date'] = steam_games['release_date'].dt.year

In [117]:
steam_games.dropna(subset=['release_date'], inplace=True)
steam_games['release_date'] = steam_games['release_date'].astype(int)
steam_games.dtypes

publisher          object
genres             object
title              object
release_date        int32
tags               object
discount_price    float64
specs              object
price              object
early_access         bool
id                 object
developer          object
sentiment          object
metascore          object
dtype: object

In [125]:
steam_games['metascore'] = pd.to_numeric(steam_games['metascore'], errors = 'coerce')
steam_games.dtypes

publisher          object
genres             object
title              object
release_date        int32
tags               object
discount_price    float64
specs              object
price              object
early_access         bool
id                 object
developer          object
sentiment          object
metascore         float64
dtype: object

In [124]:
# Checkeamos Nulos por columna

total_datos = steam_games.size
datos_no_nulos = steam_games.count()
datos_nulos = steam_games.isnull().sum()
informacion_columnas = pd.DataFrame({'Cantidad de datos': total_datos, 'Datos nulos': datos_nulos, 'Datos no nulos': datos_no_nulos})
informacion_columnas

Unnamed: 0,Cantidad de datos,Datos nulos,Datos no nulos
publisher,387179,5990,23793
genres,387179,1234,28549
title,387179,1,29782
release_date,387179,0,29783
tags,387179,161,29622
discount_price,387179,29579,204
specs,387179,669,29114
price,387179,1001,28782
early_access,387179,0,29783
id,387179,1,29782


In [119]:
steam_games.to_csv('steam_games_procesado.csv', index = False)