<h1>ETL</h1>

In [32]:
import ast
import json
from datetime import datetime

import pandas as pd
from textblob import TextBlob

<h3>Dataset "user_reviews"</h3>

In [33]:
# Location of the raw reviews dump. Forward slashes are used (as in the
# games.csv load further down) so the path resolves on Windows, Linux and macOS.
user_reviews = "../Datasets/user_reviews/australian_user_reviews.json"

# Read the file as a list of text lines, one record per line.
with open(user_reviews, "r", encoding="utf-8") as ur_lectura:
    lineas_json = ur_lectura.readlines()

# Parse every non-blank line into a Python object.
# ast.literal_eval only accepts literals (dicts, lists, strings, numbers,
# booleans, None), so unlike eval() it cannot execute code embedded in the file.
datos_json = [
    ast.literal_eval(linea.strip())
    for linea in lineas_json
    if linea.strip()  # a blank line is not a record; skip it instead of failing
]

# Build the DataFrame from the parsed records and preview it.
df_ur = pd.DataFrame(datos_json)
df_ur.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [34]:
# Drop the "user_url" column from the reviews frame.
df_ur = df_ur.drop(columns=["user_url"])

In [35]:
# Preview the first three rows after the "user_url" column was removed.
df_ur.head(3)

Unnamed: 0,user_id,reviews
0,76561197970982479,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."


Tratamiento de la columna "reviews"

In [36]:
# Un-nest "reviews": first one row per review, then one column per review field,
# and finally attach those columns to the remaining user columns.
filas_por_review = df_ur.explode("reviews")
campos_review = filas_por_review["reviews"].apply(pd.Series)
df_ur = pd.concat([filas_por_review.drop(columns="reviews"), campos_review], axis=1)

In [37]:
# Preview the frame after the "reviews" column was expanded into separate columns.
df_ur.head()

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,
0,76561197970982479,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,
1,js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,
1,js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,


In [38]:
# Drop the review fields that are not needed, together with the stray column
# labelled 0 that appeared when "reviews" was expanded.
df_ur = df_ur.drop(columns=["funny", "last_edited", "helpful", 0])

In [39]:
# Preview the frame once the unneeded columns have been removed.
df_ur.head()

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
0,76561197970982479,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.
0,76561197970982479,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...
1,js41637,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...
1,js41637,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...


In [40]:
# Expected layout of the "posted" strings, e.g. "Posted November 5, 2011."
formato = "Posted %B %d, %Y."

def obtener_year_desde_cadena(fecha_str):
    """Extract the year from a "posted" string.

    Parameters
    ----------
    fecha_str : str
        Text such as "Posted November 5, 2011.". Surrounding whitespace is
        ignored.

    Returns
    -------
    int or None
        The four-digit year, or None when the value is not a string or does
        not match ``formato`` (for example "Posted February 3.", which carries
        no year).
    """
    # Missing values (None / NaN) are not strings; strptime would raise
    # TypeError on them, so report them as "no year" instead.
    if not isinstance(fecha_str, str):
        return None
    try:
        return datetime.strptime(fecha_str.strip(), formato).year
    except ValueError:  # The text does not follow the expected layout.
        return None

# Cast every "posted" value to text so it can be handed to the date parser.
posted_como_texto = df_ur["posted"].astype(str)
df_ur["posted"] = posted_como_texto

# Derive the review year from the text and store it with the nullable "Int64"
# dtype, which can hold integers alongside missing values.
df_ur["year"] = posted_como_texto.apply(obtener_year_desde_cadena).astype("Int64")

In [41]:
# Drop the original "posted" column.
df_ur = df_ur.drop(columns=["posted"])

In [42]:
# Cast the "item_id" column to the nullable integer dtype "Int64".
df_ur = df_ur.astype({"item_id": "Int64"})

In [43]:
# Preview the frame with the new "year" column and the converted "item_id".
df_ur.head()

Unnamed: 0,user_id,item_id,recommend,review,year
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2011
0,76561197970982479,22200,True,It's unique and worth a playthrough.,2011
0,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2011
1,js41637,251610,True,I know what you think when you see this title ...,2014
1,js41637,227300,True,For a simple (it's actually not all that simpl...,2013


Tratamiento de la columna "item_id"

In [44]:
# Reviews whose game (item_id) is not listed in the "steam_games" dataset are
# going to be removed, so the games table is needed first.

# Load the games file already transformed by "ETL_steam_games.ipynb".
ruta_games = r"../Archivos Producidos/games.csv"
games_df = pd.read_csv(ruta_games)

In [45]:
# Flag, once, which reviews refer to an item_id present in "games_df".
juego_conocido = df_ur["item_id"].isin(games_df["item_id"])

# Reviews whose item_id is NOT in "games_df" (kept in a variable for inspection).
items_a_eliminar = df_ur[~juego_conocido]

# Keep only the reviews of known games. The explicit .copy() makes df_ur an
# independent frame rather than a slice of the previous one, so the column
# assignments performed later do not raise SettingWithCopyWarning.
df_ur = df_ur[juego_conocido].copy()

Análisis de sentimiento

In [46]:
# Sentiment labelling helper used on the "review" column.
def analyze_sentiment(text):
    """Label a review text by the polarity TextBlob assigns to it.

    Returns 0 for negative polarity (below -0.1), 2 for positive polarity
    (above 0.1) and 1 (neutral) otherwise. A missing review is also
    labelled 1.
    """
    # No review text at all: treat it as neutral.
    if pd.isnull(text):
        return 1

    score = TextBlob(text).sentiment.polarity

    if score < -0.1:
        return 0
    if score > 0.1:
        return 2
    # Polarity close to zero counts as neutral.
    return 1

# Label every review and store the result in a new "sentiment_analysis" column.
df_ur = df_ur.assign(sentiment_analysis=df_ur["review"].map(analyze_sentiment))

In [14]:
# Set the "review" text aside in its own Series and remove it from the frame.
reviews_texto = df_ur["review"]
df_ur = df_ur.drop(columns=["review"])

In [15]:
# Preview the frame now that "review" has been replaced by "sentiment_analysis".
df_ur.head()

Unnamed: 0,user_id,item_id,recommend,year,sentiment_analysis
0,76561197970982479,1250,True,2011,2
0,76561197970982479,22200,True,2011,2
0,76561197970982479,43110,True,2011,1
1,js41637,251610,True,2014,2
1,js41637,227300,True,2013,1


In [16]:
# Keep only the first occurrence of each fully identical row.
df_ur = df_ur[~df_ur.duplicated()]

In [17]:
# Cast the "recommend" column to the plain bool dtype.
# NOTE(review): astype(bool) follows Python truthiness, so a non-empty string
# such as "False" would become True — confirm the column holds real booleans
# (not text) at this point.
df_ur = df_ur.astype({"recommend": bool})

Capitalizamos y quitamos espacios en la columna "user_id"

In [24]:
# Normalise user ids: cast to text, trim surrounding whitespace, then capitalise.
# The strip has to come first: str.capitalize() upper-cases only the very first
# character, so an id with a leading space would otherwise stay un-capitalised.
# NOTE(review): capitalize() also lower-cases every other character, so ids that
# differ only by letter case collapse to the same value — confirm that is intended.
df_ur["user_id"] = df_ur["user_id"].astype(str).str.strip().str.capitalize()

Guardado como CSV

In [47]:
# Write the cleaned reviews table to CSV without the index column.
# Forward slashes (as in the games.csv load earlier in this notebook) keep the
# path valid on every OS; with backslashes, POSIX systems would treat the whole
# string as a single file name.
df_ur.to_csv("../Archivos Producidos/reviews.csv", index=False)

In [48]:
# Also save the review texts on their own, without the index column.
# Forward slashes (as in the games.csv load earlier in this notebook) keep the
# path valid on every OS.
# NOTE(review): reviews_texto was extracted before df_ur was de-duplicated, so
# this file may contain more rows than reviews.csv — confirm whether the two
# files need to stay row-aligned.
reviews_texto.to_csv("../Archivos Producidos/text_reviews.csv", index=False)