In [1]:
import json
import pandas as pd
import ast
import re
from textblob import TextBlob

In [2]:
# Ruta al archivo JSON descomprimido dentro de la carpeta "data"
file_path_reviews = 'data/australian_users_items.json'

In [3]:
# Abrir el archivo y procesar cada línea
def leer_archivo(archivo):
    data_list = []
    with open(archivo, 'r',encoding='utf-8') as file:
        for line in file:
            try:
                # Usar ast.literal_eval para convertir la línea en un diccionario
                json_data = ast.literal_eval(line)
                data_list.append(json_data)
            except ValueError as e:
                print(f"Error en la línea: {line}")
                continue
    return data_list

In [4]:
df_items = pd.DataFrame(leer_archivo(file_path_reviews))

In [5]:
# Especifica el nombre del archivo Parquet en el que deseas guardar el DataFrame
nombre_archivo_parquet = "data_items.parquet"

# Guarda el DataFrame como un archivo Parquet
df_items.to_parquet(nombre_archivo_parquet)

In [6]:
df = pd.read_parquet('data_items.parquet')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


In [8]:
df

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [9]:
df.iloc[0].items

<bound method Series.items of user_id                                        76561197970982479
items_count                                                  277
steam_id                                       76561197970982479
user_url       http://steamcommunity.com/profiles/76561197970...
items          [{'item_id': '10', 'item_name': 'Counter-Strik...
Name: 0, dtype: object>

In [10]:
df = df.explode('items',ignore_index=True)

In [11]:
df = pd.concat([df, pd.json_normalize(df['items'])], axis=1)

In [12]:
df

Unnamed: 0,user_id,items_count,steam_id,user_url,items,item_id,item_name,playtime_2weeks,playtime_forever
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,Counter-Strike,0.0,6.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '20', 'item_name': 'Team Fortress ...",20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '30', 'item_name': 'Day of Defeat'...",30,Day of Defeat,0.0,7.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '40', 'item_name': 'Deathmatch Cla...",40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '50', 'item_name': 'Half-Life: Opp...",50,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...,...,...,...,...
5170010,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"{'item_id': '373330', 'item_name': 'All Is Dus...",373330,All Is Dust,0.0,0.0
5170011,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"{'item_id': '388490', 'item_name': 'One Way To...",388490,One Way To Die: Steam Edition,3.0,3.0
5170012,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"{'item_id': '521570', 'item_name': 'You Have 1...",521570,You Have 10 Seconds 2,4.0,4.0
5170013,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"{'item_id': '519140', 'item_name': 'Minds Eyes...",519140,Minds Eyes,3.0,3.0


In [13]:
df.isnull().sum()

user_id                 0
items_count             0
steam_id                0
user_url                0
items               16806
item_id             16806
item_name           16806
playtime_2weeks     16806
playtime_forever    16806
dtype: int64

In [14]:
#eliminamos los NaN que quedan y las columnas que no necesitamos 
df.drop(['items_count', 'playtime_2weeks', 'items','steam_id','user_url', 'playtime_2weeks'],  inplace=True,axis=1)
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [15]:
df.columns

Index(['user_id', 'item_id', 'item_name', 'playtime_forever'], dtype='object')

In [16]:
df

Unnamed: 0,user_id,item_id,item_name,playtime_forever
0,76561197970982479,10,Counter-Strike,6.0
1,76561197970982479,20,Team Fortress Classic,0.0
2,76561197970982479,30,Day of Defeat,7.0
3,76561197970982479,40,Deathmatch Classic,0.0
4,76561197970982479,50,Half-Life: Opposing Force,0.0
...,...,...,...,...
5153204,76561198329548331,346330,BrainBread 2,0.0
5153205,76561198329548331,373330,All Is Dust,0.0
5153206,76561198329548331,388490,One Way To Die: Steam Edition,3.0
5153207,76561198329548331,521570,You Have 10 Seconds 2,4.0


In [17]:
# Guarda el DataFrame como un archivo Parquet
df.to_parquet("data_items.parquet")