In [2]:
import json
import pandas as pd
import ast
import re
from textblob import TextBlob

In [8]:
# Path to the uncompressed JSON file inside the "data" folder
file_path_reviews = 'data/australian_user_reviews.json'

In [9]:
# Open the file and parse it line by line
def leer_archivo(archivo):
    """Read a text file that holds one Python-literal record per line.

    Each line is parsed with ``ast.literal_eval`` (the records are Python
    literals, not necessarily strict JSON). Lines that cannot be parsed are
    reported on stdout and skipped, so a single malformed line does not abort
    the whole load.

    Parameters
    ----------
    archivo : str or path-like
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    list
        The parsed records, in file order, excluding the lines that failed
        to parse.
    """
    data_list = []
    with open(archivo, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                # literal_eval only evaluates literals, never arbitrary code
                data_list.append(ast.literal_eval(line))
            except (ValueError, SyntaxError, TypeError):
                # literal_eval signals malformed input with any of these;
                # report the offending line and move on to the next one
                print(f"Error en la línea: {line}")
    return data_list

In [10]:
# Build a DataFrame with one row per record parsed from the reviews file
df_reviews = pd.DataFrame(leer_archivo(file_path_reviews))

In [11]:
# Inspect column names, non-null counts and dtypes of the raw frame
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [12]:
# Explode the "reviews" column so that every element of a list-like entry
# gets its own row (presumably one review dict per row — confirm against the data);
# ignore_index=True renumbers the resulting rows from 0
df_reviews = df_reviews.explode('reviews', ignore_index=True)

In [13]:
# Turn the keys of each review dict into columns, append them to the frame
# and discard the original 'reviews' column in the same expression
df_reviews = pd.concat(
    [df_reviews, df_reviews['reviews'].apply(pd.Series)],
    axis=1,
).drop(columns='reviews')



In [14]:
# Preview the first two rows after expanding the review fields
df_reviews.head(2)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,


In [15]:
# Remove the stray column labelled 0 together with the review fields
# that this notebook does not keep, then preview the result
df_reviews = df_reviews.drop(columns=[0, 'last_edited', 'helpful', 'funny'])
df_reviews.head(2)

Unnamed: 0,user_id,user_url,posted,item_id,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.


In [16]:
# Check non-null counts and dtypes after dropping the unused columns
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    59333 non-null  object
 1   user_url   59333 non-null  object
 2   posted     59305 non-null  object
 3   item_id    59305 non-null  object
 4   recommend  59305 non-null  object
 5   review     59305 non-null  object
dtypes: object(6)
memory usage: 2.7+ MB


In [17]:
# Count the missing values in each column
df_reviews.isnull().sum()

user_id       0
user_url      0
posted       28
item_id      28
recommend    28
review       28
dtype: int64

In [18]:
# Keep only the rows that have no missing value in any column
df_reviews = df_reviews.dropna()

In [19]:
# Renumber the rows from 0 and discard the previous index
df_reviews = df_reviews.reset_index(drop=True)

In [20]:
# Preview the first rows of the cleaned frame
df_reviews.head()

Unnamed: 0,user_id,user_url,posted,item_id,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...


In [21]:
# Continue on a copy so the transformations applied to df do not modify df_reviews
df = df_reviews.copy()

In [22]:
# Remove the literal text 'Posted ' and split what is left on whitespace,
# leaving a list of tokens per row. regex=False makes the pattern a plain
# substring regardless of the pandas version's default for str.replace.
df['posted'] = df['posted'].str.replace('Posted ', '', regex=False).str.split()


In [23]:
# Preview 'posted' after it has been split into tokens
df.head()

Unnamed: 0,user_id,user_url,posted,item_id,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[November, 5,, 2011.]",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[July, 15,, 2011.]",22200,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[April, 21,, 2011.]",43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,"[June, 24,, 2014.]",251610,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,"[September, 8,, 2013.]",227300,True,For a simple (it's actually not all that simpl...


In [24]:
# Use a regular expression to bring the tokens closer to a date format
def convertir_fecha(fecha_lista, default_year=2016):
    """Build a 'YYYY-Month-DD' string from the tokens of a posted date.

    The tokens are joined with single spaces and searched for the pattern
    ``<word> <digits>[, <digits>]``, read as month, day and optional year.

    Parameters
    ----------
    fecha_lista : iterable of str
        Tokens of the date text, e.g. ``['November', '5,', '2011.']``.
    default_year : int, optional
        Year used when the text carries no year. Defaults to 2016, the year
        the original notebook assumed for such entries.

    Returns
    -------
    str or None
        The date formatted as ``f'{year:04d}-{month}-{day:02d}'`` (the month
        is kept exactly as written in the input), or ``None`` when the
        pattern is not found.
    """
    fecha_str = ' '.join(fecha_lista)
    match = re.search(r'(\w+)\s+(\d+)(?:,\s+(\d+))?', fecha_str)
    if match is None:
        return None

    month, day, year = match.groups()
    # The year group is optional in the pattern; fall back when it is absent
    year = int(year) if year else default_year
    return f'{year:04d}-{month}-{int(day):02d}'


In [25]:
# Apply convertir_fecha to every entry of the 'posted' column
df['posted'] = df['posted'].apply(convertir_fecha)

In [26]:
# Convert the 'posted' column to pandas datetimes;
# errors='coerce' turns values that cannot be parsed into NaT instead of raising
df['posted'] = pd.to_datetime(df['posted'], errors='coerce')

In [27]:
# Preview 'posted' after the datetime conversion
df.head()

Unnamed: 0,user_id,user_url,posted,item_id,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2011-11-05,1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2011-07-15,22200,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,2014-06-24,251610,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,2013-09-08,227300,True,For a simple (it's actually not all that simpl...


In [28]:
# Discard rows with any missing value (including dates coerced to NaT)
df = df.dropna()

In [29]:
# Check row count and dtypes after the date conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    59305 non-null  object        
 1   user_url   59305 non-null  object        
 2   posted     59305 non-null  datetime64[ns]
 3   item_id    59305 non-null  object        
 4   recommend  59305 non-null  object        
 5   review     59305 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 2.7+ MB


In [30]:
# Lower-case the review text with the vectorized string accessor
df['review'] = df['review'].str.lower()

In [31]:
# Run the sentiment analysis on a text and map the result to a numeric label
def analizar_sentimiento(texto):
    """Classify a text by the sign of its TextBlob sentiment polarity.

    Parameters
    ----------
    texto : str
        Text passed to ``TextBlob``.

    Returns
    -------
    int
        0 when the polarity is below zero, 1 when it is exactly zero,
        2 otherwise.
    """
    polarity = TextBlob(texto).sentiment.polarity
    if polarity < 0:
        return 0  # Negative
    if polarity == 0:
        return 1  # Neutral
    return 2  # Positive

In [32]:
# Apply analizar_sentimiento to every review to create the 'sentiment_analysis' column
df['sentiment_analysis'] = df['review'].apply(analizar_sentimiento)


In [33]:
# Drop the 'review' and 'user_url' columns,
# renumber the rows from 0 and preview the result
df = df.drop(columns=['review', 'user_url']).reset_index(drop=True)
df.head()

Unnamed: 0,user_id,posted,item_id,recommend,sentiment_analysis
0,76561197970982479,2011-11-05,1250,True,2
1,76561197970982479,2011-07-15,22200,True,2
2,76561197970982479,2011-04-21,43110,True,2
3,js41637,2014-06-24,251610,True,2
4,js41637,2013-09-08,227300,True,0


In [34]:
# Check the final columns and dtypes before exporting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   user_id             59305 non-null  object        
 1   posted              59305 non-null  datetime64[ns]
 2   item_id             59305 non-null  object        
 3   recommend           59305 non-null  object        
 4   sentiment_analysis  59305 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 2.3+ MB


In [35]:
# Name of the Parquet file in which the DataFrame is saved
nombre_archivo_parquet = "data_reviews.parquet"

# Write the DataFrame to that Parquet file
df.to_parquet(nombre_archivo_parquet)
