ETL USER_REVIEWS

In [1]:
#Importamos librerías a utilizar
import pandas as pd
import json
import ast
import re

In [2]:
#Convertimos el archivo json en un dataframe
def l_a_c():
    with open('australian_user_reviews.json','rt', encoding='utf-8') as file:
        return[ast.literal_eval(line.strip())
            for line in file]
        
datos = l_a_c()

df_user_reviews = pd.DataFrame(datos)

df_user_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [3]:
#Importamos la librería a usar para realizar el análisis de sentimientos
from textblob import TextBlob

In [4]:
#Creamos la función para realizar el análisis de sentimientos
def sentiment_analysis(review):
    if isinstance(review, list) and len(review) > 0:
        text = review[0].get('review', '')
        sentiment = TextBlob(text).sentiment.polarity
        
        if sentiment < -0.2:
            return 0 # Negativo
        elif sentiment >= -0.2 and sentiment <= -0.2:
            return 1 # Neutral
        else:
            return 2 # Positivo
    else:
        return 1 # Valor que se asume cuando la reseña está ausente

In [5]:
#Aplicamos la función creada a la coliumna reviwes, para realizar el análisis y generaer la columna solicitada Sentiment_analysis
df_user_reviews['sentiment_analysis'] = df_user_reviews['reviews'].apply(sentiment_analysis)

In [6]:
#Desanidamos REVIEWS

#Creamos una lista vacía donde vamos a almacenar los datos de la columna 'reviews'
data_review = []

In [7]:

for index, fila in df_user_reviews.iterrows():
    for review in fila['reviews']:
        # Verificar si 'review' es un diccionario
        if isinstance(review, dict):
            data_review.append({
                "user_id": fila["user_id"],
                "user_url": fila["user_url"],
                "sentiment_analysis": fila["sentiment_analysis"],
                "funny": review.get("funny", ""),
                "posted": review.get("posted", ""),
                "last_edited": review.get("last_edited", ""),  
                "item_id": review.get("item_id", ""),
                "helpful": review.get("helpful", ""),
                "recommend": review.get("recommend", ""),
                "review": review.get("review", "")
            })

# Convertimos la lista de datos en un nuevo DataFrame
df_user_new= pd.DataFrame(data_review)

# Mostramos el DataFrame resultante
df_user_new.head(5)

Unnamed: 0,user_id,user_url,sentiment_analysis,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,2,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,2,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [8]:
#Eliminamos las columnas innecesarias
df_user_new.drop(['helpful','funny','last_edited','review'], axis=1, inplace=True)

In [9]:
#Eliminamos los puntos de la columna 'posted'
df_user_new['posted'] = df_user_new['posted'].str.replace('.', '')
df_user_new['posted'] = df_user_new['posted'].str.replace(',', '')
df_user_new['posted'] = df_user_new['posted'].str.replace('Posted', '')

In [10]:
# Función para comprobar y transformar el formato de las fechas
def analizar_fechas(df, columna):
    for index, fecha in enumerate(df[columna]):
        # Usar una expresión regular para verificar el formato deseado (dd-mm-AAAA)
        if not re.match(r'\d{2}-\d{2}-\d{4}', fecha):
            # Si no cumple con el formato, transformarlo en el formato correcto
            try:
                fecha_obj = pd.to_datetime(fecha, errors='raise')
                fecha_transformada = fecha_obj.strftime('%d-%m-%Y')
                df.at[index, columna] = fecha_transformada
            except ValueError:
                # Si no se puede transformar en una fecha válida, dejarlo como está
                pass

# Llamamos a la función para verificar y transformar las fechas
analizar_fechas(df_user_new, 'posted')

# Imprimimos el DataFrame resultante
print(df_user_new['posted'])

0        05-11-2011
1        15-07-2011
2        21-04-2011
3        24-06-2014
4        08-09-2013
            ...    
59300       July 10
59301        July 8
59302        July 3
59303       July 20
59304        July 2
Name: posted, Length: 59305, dtype: object


In [11]:
df_user_new['posted'] = pd.to_datetime(df_user_new['posted'], format='mixed', dayfirst=True, errors='coerce')

In [12]:
# Convertimos la columna 'posted' a tipo string (en caso que no lo sea)
df_user_new['posted'] = df_user_new['posted'].astype(str)

# Verificamos si todos los valores en 'posted' tienen el mismo formato (dd-mm-AAAA)
formato_esperado = r'\d{2}-\d{2}-\d{4}'  # Expresión regular para el formato dd-mm-AAAA

df_user_new.head(5)

Unnamed: 0,user_id,user_url,sentiment_analysis,posted,item_id,recommend
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,2011-11-05,1250,True
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,2011-07-15,22200,True
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,2011-04-21,43110,True
3,js41637,http://steamcommunity.com/id/js41637,2,2014-06-24,251610,True
4,js41637,http://steamcommunity.com/id/js41637,2,2013-09-08,227300,True


In [13]:
#Eliminamos los registros de fecha con dato NaN
df_user_new=df_user_new.dropna(subset=['posted'])
df_user_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             59305 non-null  object
 1   user_url            59305 non-null  object
 2   sentiment_analysis  59305 non-null  int64 
 3   posted              59305 non-null  object
 4   item_id             59305 non-null  object
 5   recommend           59305 non-null  bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 2.3+ MB


In [14]:
#Transformamos la columna 'recommend'
df_user_new['recommend'] = df_user_new['recommend'].replace({True:1, False:0})

In [15]:
#Eliminamos duplicados
df_user_new=df_user_new.drop_duplicates(subset=['user_id'])

In [16]:
df_user_new.head(5)

Unnamed: 0,user_id,user_url,sentiment_analysis,posted,item_id,recommend
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,2011-11-05,1250,1
3,js41637,http://steamcommunity.com/id/js41637,2,2014-06-24,251610,1
6,evcentric,http://steamcommunity.com/id/evcentric,2,NaT,248820,1
12,doctr,http://steamcommunity.com/id/doctr,2,2013-10-14,250320,1
18,maplemage,http://steamcommunity.com/id/maplemage,2,2014-04-15,211420,1


In [17]:
#Guardamos el archivo
df_user_new= df_user_new.to_csv("df_user_reviews_desanidada_final.csv", index=False)