# ETL archivo steam_games

In [1]:
import pandas as pd
import gzip,ast
import json
from textblob import TextBlob




In [2]:
# Extracción y descompresión del archivo steam_games.json.gz

with gzip.open('steam_games.json.gz','rt', encoding='utf-8') as f:
    for line in f:
        dfsteam_games=pd.read_json(f, lines=True)

In [3]:
# reviso el dataframe dfsteam_games
dfsteam_games.head()


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [4]:
dfsteam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120444 entries, 0 to 120443
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [5]:
# Elimino datos faltantes y se restablecen los indices

data = []

with gzip.open('steam_games.json.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        data.append(item)

dfsteam_games = pd.DataFrame(data)

dfsteam_games.dropna(how='any', inplace=True)

dfsteam_games.reset_index(drop=True, inplace=True)

print(dfsteam_games.index)








RangeIndex(start=0, stop=22530, step=1)


In [6]:
# Reviso el dataframe para verificar los campos al eliminar los datos nulos
dfsteam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22530 entries, 0 to 22529
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     22530 non-null  object
 1   genres        22530 non-null  object
 2   app_name      22530 non-null  object
 3   title         22530 non-null  object
 4   url           22530 non-null  object
 5   release_date  22530 non-null  object
 6   tags          22530 non-null  object
 7   reviews_url   22530 non-null  object
 8   specs         22530 non-null  object
 9   price         22530 non-null  object
 10  early_access  22530 non-null  object
 11  id            22530 non-null  object
 12  developer     22530 non-null  object
dtypes: object(13)
memory usage: 2.2+ MB


In [7]:
# Se Convierte la columna 'release_date' a tipo de dato datetime 

data = []

with gzip.open('steam_games.json.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        data.append(item)

dfsteam_games = pd.DataFrame(data)

dfsteam_games['release_date'] = pd.to_datetime(dfsteam_games['release_date'], errors='coerce')
dfsteam_games['price'] = pd.to_numeric(dfsteam_games['price'], errors='coerce')



dfsteam_games.dropna(inplace=True)

dfsteam_games.reset_index(drop=True, inplace=True)

print(dfsteam_games.dtypes)

publisher               object
genres                  object
app_name                object
title                   object
url                     object
release_date    datetime64[ns]
tags                    object
reviews_url             object
specs                   object
price                  float64
early_access            object
id                      object
developer               object
dtype: object


In [8]:
print(dfsteam_games['id'].nunique())


21193


In [9]:
dfsteam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21194 entries, 0 to 21193
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   publisher     21194 non-null  object        
 1   genres        21194 non-null  object        
 2   app_name      21194 non-null  object        
 3   title         21194 non-null  object        
 4   url           21194 non-null  object        
 5   release_date  21194 non-null  datetime64[ns]
 6   tags          21194 non-null  object        
 7   reviews_url   21194 non-null  object        
 8   specs         21194 non-null  object        
 9   price         21194 non-null  float64       
 10  early_access  21194 non-null  object        
 11  id            21194 non-null  object        
 12  developer     21194 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(11)
memory usage: 2.1+ MB


In [10]:
# Especifico la ruta y el nombre del archivo Parquet en la carpeta del proyecto
archivo_parquet = "games.parquet"

dfsteam_games.to_parquet(archivo_parquet, index=False)

## ETL archivo user_reviews

In [11]:
# Extracción y descomprensión del archivo user_reviews
data=[]
with gzip.open('user_reviews.json.gz','rb') as file:
    for line in file:
        lineaUnicode=line.decode('utf-8')
        data.append(lineaUnicode)

In [12]:
# creo y reviso el data frame reviwes
df_reviews = pd.DataFrame(data)
df_reviews.head()

Unnamed: 0,0
0,"{'user_id': '76561197970982479', 'user_url': '..."
1,"{'user_id': 'js41637', 'user_url': 'http://ste..."
2,"{'user_id': 'evcentric', 'user_url': 'http://s..."
3,"{'user_id': 'doctr', 'user_url': 'http://steam..."
4,"{'user_id': 'maplemage', 'user_url': 'http://s..."


In [13]:
  # Elimino caracteres no válidos en la cadena

def clean_string(text):
   
    cleaned_text = ''.join(char for char in text if char.isprintable())
    return cleaned_text

cleaned_data = [clean_string(item) for item in data]

print(cleaned_data[0])

{'user_id': '76561197970982479', 'user_url': 'http://steamcommunity.com/profiles/76561197970982479', 'reviews': [{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Simple yet with great replayability. In my opinion does "zombie" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth "zombie" splattering fun for the whole family. Amazed this sort of FPS is so rare.'}, {'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': "It's unique and worth a playthrough."}, {'funny': '', 'posted': 'Posted April 21, 2011.', 'last_edited': '', 'item_id': '43110', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so

In [14]:
#  Desanidando diccionarios

for i in range(len(data)):
    data[i]=ast.literal_eval(data[i])

In [15]:
# Creo el DatFrame desanidado y lo reviso 
df_User_Reviews=pd.DataFrame(data)
df_User_Reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [16]:
# desanido la lista de diccinario de la columna reviwes

def desanidar_lista(lista):
    if isinstance(lista,list):
        diccionario_resultante={}
        for diccionario in lista:
            diccionario_resultante.update(diccionario)
        return diccionario_resultante
    else:
        return lista

In [17]:
df_User_Reviews['reviews']=df_User_Reviews['reviews'].apply(desanidar_lista)

In [18]:
# Se crea copia del dataframe para la nueva informacion
dfCopyReviews=df_User_Reviews.copy()

In [19]:
# Se concatena el dataframe con el campo reviews
dfCopyReviews = pd.concat([dfCopyReviews.drop(['reviews'], axis=1), dfCopyReviews['reviews'].apply(pd.Series)], axis=1)

In [20]:
# Se revisa el dataframe para verificar el resultado 
dfCopyReviews.head()

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted November 29, 2013.",,239030,1 of 4 people (25%) found this review helpful,True,Very fun little game to play when your bored o...
2,evcentric,http://steamcommunity.com/id/evcentric,,"Posted October 15, 2014.",,224500,No ratings yet,True,"Fun world builder, with plenty of option of ho..."
3,doctr,http://steamcommunity.com/id/doctr,,"Posted February 23, 2012.",,108710,No ratings yet,True,"Alan wake is a really good game, the light eff..."
4,maplemage,http://steamcommunity.com/id/maplemage,,"Posted July 11, 2013.",,204300,No ratings yet,True,"OH YES, THIS GAME IS THE BEST, THEY ADD STUFF ..."


In [21]:
# Elimino los datos faltantes y se reorganizan los indices
dfCopyReviews.dropna(inplace=True)
dfCopyReviews.reset_index(drop=True, inplace=True)

In [22]:
# Reviso los datos del dataframe
dfCopyReviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25771 entries, 0 to 25770
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      25771 non-null  object
 1   user_url     25771 non-null  object
 2   funny        25771 non-null  object
 3   posted       25771 non-null  object
 4   last_edited  25771 non-null  object
 5   item_id      25771 non-null  object
 6   helpful      25771 non-null  object
 7   recommend    25771 non-null  object
 8   review       25771 non-null  object
dtypes: object(9)
memory usage: 1.8+ MB


In [23]:
# Se cambia el tipo de dato de la columna itme_id a int 64
dfCopyReviews['item_id'] = dfCopyReviews['item_id'].apply(pd.to_numeric, errors='coerce')


In [24]:
dfCopyReviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25771 entries, 0 to 25770
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      25771 non-null  object
 1   user_url     25771 non-null  object
 2   funny        25771 non-null  object
 3   posted       25771 non-null  object
 4   last_edited  25771 non-null  object
 5   item_id      25771 non-null  int64 
 6   helpful      25771 non-null  object
 7   recommend    25771 non-null  object
 8   review       25771 non-null  object
dtypes: int64(1), object(8)
memory usage: 1.8+ MB


In [25]:

# Se convierte la columna 'posted' en una cadena, Se extrae el mes y el día utilizando expresiones regulares
# Se Mapean los nombres de los meses a números
# Se calcula el año más común en la columna 'posted'
# Se rellenan los valores faltantes y se convierten a enteros
# Se combinan las columnas 'Year', 'Month', y 'Day' en 'posted_date'
# y se revisan las primeras filas del DataFrame

dfCopyReviews['posted'] = dfCopyReviews['posted'].astype(str)

month_day = dfCopyReviews['posted'].str.extract(r'Posted (\w+) (\d{1,2})\.')
dfCopyReviews['Month'] = month_day[0]
dfCopyReviews['Day'] = month_day[1]

months = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}
dfCopyReviews['Month'] = dfCopyReviews['Month'].map(months)

most_common_year = dfCopyReviews['posted'].str.extract(r'(\d{4})')[0].mode()[0]

default_month = 1
default_day = 1
dfCopyReviews['Month'] = dfCopyReviews['Month'].fillna(default_month).astype(int)
dfCopyReviews['Day'] = dfCopyReviews['Day'].fillna(default_day).astype(int)
dfCopyReviews['Year'] = dfCopyReviews['posted'].str.extract(r'(\d{4})')[0].fillna(most_common_year).astype(int)

dfCopyReviews['posted_date'] = pd.to_datetime(dfCopyReviews[['Year', 'Month', 'Day']], errors='coerce')

dfCopyReviews.head()


Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,Month,Day,Year,posted_date
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,1,1,2011,2011-01-01
1,js41637,http://steamcommunity.com/id/js41637,,"Posted November 29, 2013.",,239030,1 of 4 people (25%) found this review helpful,True,Very fun little game to play when your bored o...,1,1,2013,2013-01-01
2,evcentric,http://steamcommunity.com/id/evcentric,,"Posted October 15, 2014.",,224500,No ratings yet,True,"Fun world builder, with plenty of option of ho...",1,1,2014,2014-01-01
3,doctr,http://steamcommunity.com/id/doctr,,"Posted February 23, 2012.",,108710,No ratings yet,True,"Alan wake is a really good game, the light eff...",1,1,2012,2012-01-01
4,maplemage,http://steamcommunity.com/id/maplemage,,"Posted July 11, 2013.",,204300,No ratings yet,True,"OH YES, THIS GAME IS THE BEST, THEY ADD STUFF ...",1,1,2013,2013-01-01


In [26]:
dfCopyReviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25771 entries, 0 to 25770
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      25771 non-null  object        
 1   user_url     25771 non-null  object        
 2   funny        25771 non-null  object        
 3   posted       25771 non-null  object        
 4   last_edited  25771 non-null  object        
 5   item_id      25771 non-null  int64         
 6   helpful      25771 non-null  object        
 7   recommend    25771 non-null  object        
 8   review       25771 non-null  object        
 9   Month        25771 non-null  int32         
 10  Day          25771 non-null  int32         
 11  Year         25771 non-null  int32         
 12  posted_date  25762 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int32(3), int64(1), object(8)
memory usage: 2.3+ MB


In [27]:
#  función para analizar el sentimiento y crear la columna 'sentiment_analysis' y creo el  dataframe dfCopyReviews

def analyze_sentiment(text):
    if pd.isna(text):
        return 1 
    analysis = TextBlob(text)
    sentiment = analysis.sentiment.polarity
    if sentiment < 0:
        return 0  # Malo
    elif sentiment == 0:
        return 1  # Neutral
    else:
        return 2  # Positivo
    
dfCopyReviews['sentiment_analysis'] = dfCopyReviews['review'].apply(analyze_sentiment)

In [28]:
dfCopyReviews.head()

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,Month,Day,Year,posted_date,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,1,1,2011,2011-01-01,2
1,js41637,http://steamcommunity.com/id/js41637,,"Posted November 29, 2013.",,239030,1 of 4 people (25%) found this review helpful,True,Very fun little game to play when your bored o...,1,1,2013,2013-01-01,0
2,evcentric,http://steamcommunity.com/id/evcentric,,"Posted October 15, 2014.",,224500,No ratings yet,True,"Fun world builder, with plenty of option of ho...",1,1,2014,2014-01-01,2
3,doctr,http://steamcommunity.com/id/doctr,,"Posted February 23, 2012.",,108710,No ratings yet,True,"Alan wake is a really good game, the light eff...",1,1,2012,2012-01-01,2
4,maplemage,http://steamcommunity.com/id/maplemage,,"Posted July 11, 2013.",,204300,No ratings yet,True,"OH YES, THIS GAME IS THE BEST, THEY ADD STUFF ...",1,1,2013,2013-01-01,2


In [29]:
dfCopyReviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25771 entries, 0 to 25770
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   user_id             25771 non-null  object        
 1   user_url            25771 non-null  object        
 2   funny               25771 non-null  object        
 3   posted              25771 non-null  object        
 4   last_edited         25771 non-null  object        
 5   item_id             25771 non-null  int64         
 6   helpful             25771 non-null  object        
 7   recommend           25771 non-null  object        
 8   review              25771 non-null  object        
 9   Month               25771 non-null  int32         
 10  Day                 25771 non-null  int32         
 11  Year                25771 non-null  int32         
 12  posted_date         25762 non-null  datetime64[ns]
 13  sentiment_analysis  25771 non-null  int64     

In [31]:
# Guardo el DataFrame en un archivo Parquet 
archivo_parquet = "reviews.parquet"
dfCopyReviews.to_parquet(archivo_parquet, index=False)

## ETL archivo users_items

In [32]:
# Extracción y descomprensión del archivo user_reviews
data=[]
with gzip.open('users_items.json.gz','rb') as file:
    for line in file:
        lineaUnicode=line.decode('utf-8')
        data.append(lineaUnicode)

In [33]:
# Elimino la decodificacion no valida del archivo json
for i in range(len(data)):
    data[i]=data[i].encode('utf-8','ignore').decode('utf-8')

In [34]:
# desanido las listas
for i in range(len(data)):
    data[i]=ast.literal_eval(data[i])

In [35]:
# creo una copia del DataFrame 
dfcopy_items=pd.DataFrame(data)

In [36]:
# Reviso el dataframe
dfcopy_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [37]:
dfcopy_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


In [38]:
# creo un nuevo DataFrame con la información del diccionario
df=pd.DataFrame(data)

df= df.explode('items', ignore_index=True)

df = pd.json_normalize(df['items'])

df.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks
0,10,Counter-Strike,6.0,0.0
1,20,Team Fortress Classic,0.0,0.0
2,30,Day of Defeat,7.0,0.0
3,40,Deathmatch Classic,0.0,0.0
4,50,Half-Life: Opposing Force,0.0,0.0


In [39]:
# Concateno los DataFrames

dfcopia_selected = dfcopy_items[['user_id', 'items_count', 'steam_id', 'user_url']]

df_selected = df[['item_id', 'item_name', 'playtime_forever', 'playtime_2weeks']]

df_concatenado = pd.concat([dfcopia_selected, df_selected], axis=1)

df_concatenado.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277.0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,js41637,888.0,76561198035864385,http://steamcommunity.com/id/js41637,20,Team Fortress Classic,0.0,0.0
2,evcentric,137.0,76561198007712555,http://steamcommunity.com/id/evcentric,30,Day of Defeat,7.0,0.0
3,Riot-Punch,328.0,76561197963445855,http://steamcommunity.com/id/Riot-Punch,40,Deathmatch Classic,0.0,0.0
4,doctr,541.0,76561198002099482,http://steamcommunity.com/id/doctr,50,Half-Life: Opposing Force,0.0,0.0


In [40]:
df_concatenado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       float64
 2   steam_id          object 
 3   user_url          object 
 4   item_id           object 
 5   item_name         object 
 6   playtime_forever  float64
 7   playtime_2weeks   float64
dtypes: float64(3), object(5)
memory usage: 315.6+ MB


In [41]:
# Eliminado datos faltantes
df_concatenado.dropna(inplace=True)

In [42]:
concatenated_df = df_concatenado.reset_index(drop=True)
print(concatenated_df.index)

RangeIndex(start=0, stop=88176, step=1)


In [43]:
df_concatenado.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88176 entries, 0 to 88309
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           88176 non-null  object 
 1   items_count       88176 non-null  float64
 2   steam_id          88176 non-null  object 
 3   user_url          88176 non-null  object 
 4   item_id           88176 non-null  object 
 5   item_name         88176 non-null  object 
 6   playtime_forever  88176 non-null  float64
 7   playtime_2weeks   88176 non-null  float64
dtypes: float64(3), object(5)
memory usage: 6.1+ MB


In [44]:
# Convierto la columna "item_id" a tipo numérico
df_concatenado['item_id'] = pd.to_numeric(df_concatenado['item_id'], errors='coerce')

In [45]:
# reviso el dataframe
df_concatenado.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277.0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,js41637,888.0,76561198035864385,http://steamcommunity.com/id/js41637,20,Team Fortress Classic,0.0,0.0
2,evcentric,137.0,76561198007712555,http://steamcommunity.com/id/evcentric,30,Day of Defeat,7.0,0.0
3,Riot-Punch,328.0,76561197963445855,http://steamcommunity.com/id/Riot-Punch,40,Deathmatch Classic,0.0,0.0
4,doctr,541.0,76561198002099482,http://steamcommunity.com/id/doctr,50,Half-Life: Opposing Force,0.0,0.0


In [46]:
df_concatenado.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88176 entries, 0 to 88309
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           88176 non-null  object 
 1   items_count       88176 non-null  float64
 2   steam_id          88176 non-null  object 
 3   user_url          88176 non-null  object 
 4   item_id           88176 non-null  int64  
 5   item_name         88176 non-null  object 
 6   playtime_forever  88176 non-null  float64
 7   playtime_2weeks   88176 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 6.1+ MB


In [47]:
# Guardo el DataFrame en un archivo Parquet
archivo_parquet = "items.parquet"

df_concatenado.to_parquet(archivo_parquet, index=False)
