In [1]:
#importamos las librerías necesarias para trabajar
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import pandas as pd
import ast
import matplotlib.pyplot as plt
from textblob import TextBlob
import pyarrow

In [2]:
# Load the reviews file one record per line. ast.literal_eval is used instead
# of eval() because it only accepts Python literals (dicts, lists, strings,
# numbers, booleans, None): a malformed or malicious line raises an error
# rather than executing code.
# NOTE(review): the path is absolute and machine-specific - consider moving it
# to a configurable data directory.
with open('C:\\Users\\Tom\\Desktop\\PIMLOps-STEAM\\dataset\\australian_user_reviews.json', 'r', encoding='utf-8') as file:
    # Blank lines (e.g. a trailing newline at end of file) carry no record
    data = [ast.literal_eval(line) for line in file if line.strip()]

# Build a DataFrame and flatten it completely: explode the "reviews" column
# into one row per element, then expand each element into its own columns
user_reviews = pd.DataFrame(data)
user_reviews_explored = user_reviews.explode('reviews').reset_index(drop=True)
user_reviews_normalized = pd.json_normalize(user_reviews_explored['reviews']).reset_index(drop=True)

# Both frames carry a fresh 0..n-1 index, so the column-wise concat pairs each
# exploded row with the fields expanded from that same row
user_reviews_final = pd.concat(
    [user_reviews_explored.drop('reviews', axis=1), user_reviews_normalized],
    axis=1,
)

In [3]:
# Sentiment-scoring helper based on TextBlob polarity
def analizar_sentimiento(review):
    """Map a review to an integer sentiment code.

    Parameters
    ----------
    review
        Value to score. It is handed to ``TextBlob`` unless ``pd.notnull``
        reports it as missing.

    Returns
    -------
    int
        0 when the TextBlob polarity is negative, 1 when it is exactly zero
        or the review is missing, 2 otherwise.
    """
    # Missing reviews fall back to the neutral code
    if not pd.notnull(review):
        return 1

    polarity = TextBlob(review).sentiment.polarity
    if polarity < 0:
        return 0  # negative
    if polarity == 0:
        return 1  # neutral
    return 2  # positive

# Score every entry of the "review" column with analizar_sentimiento and keep
# the resulting codes in a new "sentiment_analysis" column
user_reviews_final['sentiment_analysis'] = (
    user_reviews_final['review'].apply(analizar_sentimiento)
)

# Drop the "review" column now that it has been scored
user_reviews_final.drop('review', axis=1, inplace=True)

In [4]:
# Rename the columns to tidier, presentation-friendly labels in a single call
user_reviews_final.rename(
    columns={
        'sentiment_analysis': 'Sentiment Analysis',
        'recommend': 'Recommend',
        'helpful': 'Helpful',
        'item_id': 'Item ID',
        'last_edited': 'Last Edited',
        'funny': 'Funny',
        'user_url': 'User URL',
        'user_id': 'User ID',
    },
    inplace=True,
)

In [5]:
# Create "Year Posted" from the first run of four digits found in "posted";
# rows where no such run exists get a missing value
user_reviews_final['Year Posted'] = user_reviews_final['posted'].str.extract(
    r'(\d{4})', expand=False
)
# "posted" is no longer used once the year has been pulled out
user_reviews_final.drop(columns=['posted'], inplace=True)

In [6]:
# Remove the columns that will not be used
user_reviews_final.drop(columns=["Funny", "Last Edited"], inplace=True)

In [7]:
# Discard rows whose "Year Posted" value is missing
user_reviews_final.dropna(subset=["Year Posted"], inplace=True)

In [8]:
# Remove fully duplicated rows, keeping the first occurrence of each
user_reviews_final.drop_duplicates(keep="first", inplace=True)

In [9]:
# Display the first 15 rows to check the cleaned result
user_reviews_final.iloc[:15]

Unnamed: 0,User ID,User URL,Item ID,Helpful,Recommend,Sentiment Analysis,Year Posted
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,2,2011
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,No ratings yet,True,2,2011
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,No ratings yet,True,2,2011
3,js41637,http://steamcommunity.com/id/js41637,251610,15 of 20 people (75%) found this review helpful,True,2,2014
4,js41637,http://steamcommunity.com/id/js41637,227300,0 of 1 people (0%) found this review helpful,True,0,2013
5,js41637,http://steamcommunity.com/id/js41637,239030,1 of 4 people (25%) found this review helpful,True,0,2013
7,evcentric,http://steamcommunity.com/id/evcentric,370360,No ratings yet,True,2,2015
8,evcentric,http://steamcommunity.com/id/evcentric,237930,No ratings yet,True,2,2014
9,evcentric,http://steamcommunity.com/id/evcentric,263360,No ratings yet,True,0,2014
10,evcentric,http://steamcommunity.com/id/evcentric,107200,No ratings yet,True,2,2014


In [10]:
# Write the cleaned dataframe to a parquet file in the working directory
user_reviews_final.to_parquet(path='User_Reviews.parquet')