Carga de datos

In [133]:
# Importación de librerías
import json
import pandas as pd
import gzip
import ast
import nltk
import string
from textblob import TextBlob
from pandas import json_normalize

# Path to the gzip-compressed user reviews file
file = '../Datasets/user_reviews.json.gz'

# Collects one parsed record per line of the file
data = []

# Decompress and read the file line by line as UTF-8 text.
# The handle has its own name so that `file` still holds the path afterwards.
with gzip.open(file, 'rt', encoding='utf-8') as fh:
    for line in fh:
        try:
            # Parse the line with ast.literal_eval, which evaluates literals
            # only and never executes code.
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for text that is not valid
            # Python syntax and ValueError for non-literal content; report
            # the offending line and keep going with the rest of the file.
            print(f"Error en la línea: {line}")
            continue
        data.append(json_data)

df = pd.DataFrame(data)


In [134]:
# Inspect the raw frame built from the parsed records (pandas truncates the display)
df

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


Normalización

In [135]:
# Flatten the nested 'reviews' lists: one output row per review, with the
# owning record's 'user_id' and 'user_url' repeated on every row.
# json_normalize accepts the whole list of records, so a single call replaces
# building one small DataFrame per user and concatenating them afterwards.
df = json_normalize(data, record_path='reviews', meta=['user_id', 'user_url'])

In [136]:
# Preview the first rows of the flattened frame
df.head()

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id,user_url
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,js41637,http://steamcommunity.com/id/js41637
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,js41637,http://steamcommunity.com/id/js41637


Eliminación de columnas irrelevantes

In [137]:
# Remove the columns considered irrelevant in one call instead of four
# separate in-place drops.
df = df.drop(columns=['user_url', 'funny', 'helpful', 'last_edited'])

In [138]:
# Preview the frame to confirm which columns remain
df.head()

Unnamed: 0,posted,item_id,recommend,review,user_id
0,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...,76561197970982479
1,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.,76561197970982479
2,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
3,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...,js41637
4,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...,js41637


Chequeo de nulos

In [139]:
# Count missing values in 'user_id'
df['user_id'].isna().sum()

0

In [140]:
# Count missing values in 'review' (empty strings are not counted as missing)
df['review'].isna().sum()

0

In [141]:
# Count missing values in 'recommend'
df['recommend'].isna().sum()

0

In [142]:
# Count missing values in 'item_id'
df['item_id'].isna().sum()

0

In [143]:
# Count missing values in 'posted'
df['posted'].isna().sum()

0

Preprocesamiento de texto

In [144]:
# Select the 'review' entries whose text is exactly the empty string
empty_reviews = df.loc[df['review'] == '', 'review']

In [145]:
# Number of reviews with empty text (count() tallies the non-null entries)
empty_reviews.count()

30

Análisis de sentimiento (NLP)

In [146]:
def get_sentiment(text):
    """Classify the sentiment of a review on a 0 / 1 / 2 scale.

    The label is derived from the sign of the polarity score that TextBlob
    computes for the text.

    Args:
        text: Review text. Empty or whitespace-only strings and non-string
            values (e.g. None or NaN) are treated as having no content.

    Returns:
        int: 0 when the polarity is negative, 1 when it is exactly zero or
        there is no text to analyse, 2 otherwise (positive).
    """
    # No usable text -> neutral. Checking the type first keeps missing values
    # (None / NaN) from being passed to TextBlob, which expects a string.
    if not isinstance(text, str) or not text.strip():
        return 1

    # Polarity of the text as computed by TextBlob
    polarity = TextBlob(text).sentiment.polarity

    # Map the polarity onto the 0, 1, 2 scale
    if polarity < 0:
        return 0  # Negative
    elif polarity == 0:
        return 1  # Neutral
    else:
        return 2  # Positive

# Apply the function to every entry of the 'review' column and store the
# resulting label in a new 'sentiment_analysis' column
df['sentiment_analysis'] = df['review'].apply(get_sentiment)

In [147]:
# Number of reviews per sentiment label
df['sentiment_analysis'].value_counts()

sentiment_analysis
2    33531
1    13028
0    12746
Name: count, dtype: int64

In [148]:
# Preview the frame including the 'sentiment_analysis' column
df.head()

Unnamed: 0,posted,item_id,recommend,review,user_id,sentiment_analysis
0,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...,76561197970982479,2
1,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.,76561197970982479,2
2,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479,2
3,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...,js41637,2
4,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...,js41637,0


In [149]:
# Drop the raw review text; the 'sentiment_analysis' label is what is kept
# in the exported dataset.
df = df.drop(columns=['review'])

In [150]:
# Preview the columns that will be exported
df.head()

Unnamed: 0,posted,item_id,recommend,user_id,sentiment_analysis
0,"Posted November 5, 2011.",1250,True,76561197970982479,2
1,"Posted July 15, 2011.",22200,True,76561197970982479,2
2,"Posted April 21, 2011.",43110,True,76561197970982479,2
3,"Posted June 24, 2014.",251610,True,js41637,2
4,"Posted September 8, 2013.",227300,True,js41637,0


Exportación del dataset

In [151]:
# Write the preprocessed frame to a snappy-compressed Parquet file,
# leaving the DataFrame index out of the output.
df.to_parquet(
    '../Datasets/user_reviews_preprocessed.parquet',
    compression='snappy',
    index=False,
)