# Creación de los Datasets y funciones, para los Endpoint de la API y modelo de recomendación

<br>

### Se decide realizar datasets orientados a las funciones de los endpoits, para facilitar las consultas, y para evitar los problemas de capacidad tanto de render, como de github.

<br>
<br>

Importamos las librerías

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pyarrow as pa
import pyarrow.parquet as pq
import warnings
warnings.filterwarnings("ignore")

<br>
<br>

Importamos los Datasets limpios

In [2]:
df_g = pd.read_parquet(r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\steam_games_limpio.parquet")
df_r = pd.read_parquet(r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\australian_user_reviews_limpio.parquet")
df_i = pd.read_parquet(r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\australian_users_items_limpio_snappy.parquet")

<br>
<br>

## 01 Def_developer

<br>

#### Cantidad items y porcentaje contenido free por año, por empresa desarrolladora

<br>
<br>

Se obtienen las columnas necesarias del dataframe de steam_games

In [3]:
df_filtrado = df_g[['price', 'release_year', 'developer', 'item_id']]
# se eliminan los duplicados
df_developer = df_filtrado.drop_duplicates()
df_developer

Unnamed: 0,price,release_year,developer,item_id
0,4.99,2018,kotoshiro,761140
5,0.00,2018,secret level srl,643980
9,0.00,2017,poolians.com,670290
14,0.99,2017,彼岸领域,767400
17,3.99,2018,trickjump games ltd,772540
...,...,...,...,...
71535,1.99,2018,bidoniera games,745400
71539,1.99,2018,"nikita ""ghost_rus""",773640
71543,4.99,2018,sacada,733530
71546,1.99,2018,laush dmitriy sergeevich,610660


<br>
<br>

#### Se guarda como parquet

In [4]:
# Convierte el DataFrame a un objeto Table de PyArrow
table = pa.Table.from_pandas(df_developer)

# Define las opciones de compresión con Snappy
compression = 'None'

# Guarda el objeto Table en formato Parquet con compresión Snappy
pq.write_table(table, r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\def_developer.parquet", compression=compression)

<br>
<br>

## 02 Def user_data

<br>

#### Cantidad de dinero gastado por el usuario, el porcentaje de recomendación (reviews.recommend) y cantidad de items.

<br>
<br>

Se obtienen las columnas 'price' e 'item_id' del Dataframe de steam_games, descartando los valores duplicados

In [5]:
dinero_gastado = df_g[['price', 'item_id']]
dinero_gastado = dinero_gastado.drop_duplicates(subset='item_id', keep='first')
dinero_gastado

Unnamed: 0,price,item_id
0,4.99,761140
5,0.00,643980
9,0.00,670290
14,0.99,767400
17,3.99,772540
...,...,...
71535,1.99,745400
71539,1.99,773640
71543,4.99,733530
71546,1.99,610660


<br>
<br>

Se obtienen las columnas 'item_id', 'item_count' y 'user_id' del Dataframe users_items

In [6]:
cantidad_items = df_i[['items_count', 'user_id', 'item_id']]
cantidad_items

Unnamed: 0,items_count,user_id,item_id
0,277,76561197970982479,10.0
1,277,76561197970982479,20.0
2,277,76561197970982479,30.0
3,277,76561197970982479,40.0
4,277,76561197970982479,50.0
...,...,...,...
5094087,7,76561198329548331,346330.0
5094088,7,76561198329548331,373330.0
5094089,7,76561198329548331,388490.0
5094090,7,76561198329548331,521570.0


<br>
<br>

#### Se hace el merge de los primeros dos dataframes

In [7]:
df_user_data = cantidad_items.merge(dinero_gastado, on='item_id', how='left')
df_user_data

Unnamed: 0,items_count,user_id,item_id,price
0,277,76561197970982479,10.0,9.99
1,277,76561197970982479,20.0,4.99
2,277,76561197970982479,30.0,4.99
3,277,76561197970982479,40.0,4.99
4,277,76561197970982479,50.0,4.99
...,...,...,...,...
5094087,7,76561198329548331,346330.0,0.00
5094088,7,76561198329548331,373330.0,
5094089,7,76561198329548331,388490.0,0.00
5094090,7,76561198329548331,521570.0,0.00


<br>
<br>

Se rellenan los valores nulos

In [8]:
df_user_data['price'] = df_user_data['price'].fillna(0.0)

<br>
<br>

Se hace la suma del dinero gastado por usuario, agrupando por user_id. La copia que se realiza es para poder hacer el merge con el 3er dataframe necesario.

In [9]:
df_user_data_parcial = df_user_data.groupby('user_id').agg({'price': 'sum', 'items_count': 'first'}).reset_index()

# Cambiar el nombre de la columna 'precio' a 'dinero_gastado'
df_user_data_parcial = df_user_data_parcial.rename(columns={'price': 'total_spent'})



In [10]:
df_user_data_parcial

Unnamed: 0,user_id,total_spent,items_count
0,--000--,397.78,58
1,--ace--,166.82,44
2,--ionex--,99.93,23
3,-2SV-vuLB-Kg,427.50,68
4,-404PageNotFound-,1509.32,149
...,...,...,...
70907,zzonci,19.98,5
70908,zzoptimuszz,64.98,61
70909,zzydrax,99.94,13
70910,zzyfo,828.51,84


<br>
<br>

Se obtienen las columnas 'item_id', 'recommend' y 'user_id' del Dataframe user_reviews

In [11]:
recomienda_items = df_r[['recommend', 'user_id', 'item_id']]
recomienda_items

Unnamed: 0,recommend,user_id,item_id
0,True,76561197970982479,1250.0
1,True,76561197970982479,22200.0
2,True,76561197970982479,43110.0
3,True,js41637,251610.0
4,True,js41637,227300.0
...,...,...,...
1495,True,76561198074347537,4000.0
1496,True,SurfChicken,730.0
1497,False,76561198126142125,236390.0
1498,False,Clouwolf,730.0


<br>
<br>

Se calcula el porcentaje de recomendacion por cada usuario

In [12]:
# Calcular el porcentaje de recommend True por user_id
total_count = recomienda_items.groupby('user_id').size()
recommend_true_count = recomienda_items[recomienda_items['recommend'] == True].groupby('user_id').size() 

# Llenar con 0 los valores faltantes
recommend_true_count = recommend_true_count.reindex(total_count.index, fill_value=0)

# Calcular el porcentaje
percentage = (recommend_true_count / total_count) * 100

# Crear un nuevo dataframe con los resultados
recomendacion = pd.DataFrame({'user_id': percentage.index, 'recommend': percentage.values})
recomendacion


Unnamed: 0,user_id,recommend
0,-Azsael-,100.000000
1,-GM-Dragon,50.000000
2,1122305938,75.000000
3,1234865654,100.000000
4,1337lolroflmao,100.000000
...,...,...
600,washington_,83.333333
601,whiteofwolfing,100.000000
602,whodafuqisthisguilao,100.000000
603,xfluttersx,85.714286


<br>
<br>

#### Se hace el segundo merge de los Dataframes creados

In [13]:
df_user_data_final = df_user_data_parcial.merge(recomendacion, on='user_id', how='left')
df_user_data_final['recommend'].fillna(0, inplace=True)
df_user_data_final

Unnamed: 0,user_id,total_spent,items_count,recommend
0,--000--,397.78,58,0.0
1,--ace--,166.82,44,0.0
2,--ionex--,99.93,23,0.0
3,-2SV-vuLB-Kg,427.50,68,0.0
4,-404PageNotFound-,1509.32,149,0.0
...,...,...,...,...
70907,zzonci,19.98,5,0.0
70908,zzoptimuszz,64.98,61,0.0
70909,zzydrax,99.94,13,0.0
70910,zzyfo,828.51,84,0.0


<br>
<br>

#### Se guarda como parquet

In [14]:
# Convierte el DataFrame a un objeto Table de PyArrow
table = pa.Table.from_pandas(df_user_data_final)

# Define las opciones de compresión con Snappy
compression = 'None'

# Guarda el objeto Table en formato Parquet con compresión Snappy
pq.write_table(table, r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\def_user_data.parquet", compression=compression)


<br>
<br>

## 03 Def user_for_genre

<br>

#### Usuario que acumula mas horas jugadas por genero dado y lista de acumulación de horas jugadas por año de lanzamiento.

<br>
<br>

Se obtienen las columnas 'playtime_forever', 'user_id', e 'item_id' del dataset de users_items, recordando que los valores de 'playtime_forever' ya se encuentran expresados en horas.

In [15]:
df_user = df_i[['playtime_forever', 'user_id', 'item_id']]
df_user

Unnamed: 0,playtime_forever,user_id,item_id
0,0.100000,76561197970982479,10.0
1,0.000000,76561197970982479,20.0
2,0.116667,76561197970982479,30.0
3,0.000000,76561197970982479,40.0
4,0.000000,76561197970982479,50.0
...,...,...,...
5094087,0.000000,76561198329548331,346330.0
5094088,0.000000,76561198329548331,373330.0
5094089,0.050000,76561198329548331,388490.0
5094090,0.066667,76561198329548331,521570.0


<br>
<br>

Se obtienen las columnas 'item_id', 'release_year' y 'genre' del dataset steam_games

In [16]:
df_genero = df_g[['release_year', 'genres', 'item_id']]
df_genero

Unnamed: 0,release_year,genres,item_id
0,2018,Action,761140
1,2018,Casual,761140
2,2018,Indie,761140
3,2018,Simulation,761140
4,2018,Strategy,761140
...,...,...,...
71546,2018,Indie,610660
71547,2018,Racing,610660
71548,2018,Simulation,610660
71549,2017,Casual,658870


<br>
<br>

#### Se realiza el merge de los datasets a traves del item_id

In [17]:
df_user_genre = df_genero.merge(df_user, on='item_id')
df_user_genre

Unnamed: 0,release_year,genres,item_id,playtime_forever,user_id
0,1997,Action,282010,0.083333,UTNerd24
1,1997,Action,282010,0.000000,I_DID_911_JUST_SAYING
2,1997,Action,282010,0.000000,76561197962104795
3,1997,Action,282010,0.000000,r3ap3r78
4,1997,Action,282010,0.216667,saint556
...,...,...,...,...,...
9877276,2004,Action,80,0.000000,76561198273508956
9877277,2004,Action,80,0.000000,76561198282090798
9877278,2004,Action,80,0.000000,943525
9877279,2004,Action,80,0.150000,76561198283312749


<br>
<br>

#### Se guarda como parquet

In [18]:
# Convierte el DataFrame a un objeto Table de PyArrow
table = pa.Table.from_pandas(df_user_genre)

# Define las opciones de compresión con Snappy
compression = 'None'

# Guarda el objeto Table en formato Parquet con compresión Snappy
pq.write_table(table, r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\def_user_for_genre.parquet", compression=compression)

<br>
<br>

## 04 def best_developer_year

<br>

#### Devuelve top 3 de desarrolladora con más recomendaciones por usuario, por el año dado.

<br>
<br>

Se obtienen las columnas 'developer', 'release_year', e 'item_id' del dataset de steam_games.

In [19]:
df_dev = df_g[['release_year', 'developer', 'item_id']]
df_dev

Unnamed: 0,release_year,developer,item_id
0,2018,kotoshiro,761140
1,2018,kotoshiro,761140
2,2018,kotoshiro,761140
3,2018,kotoshiro,761140
4,2018,kotoshiro,761140
...,...,...,...
71546,2018,laush dmitriy sergeevich,610660
71547,2018,laush dmitriy sergeevich,610660
71548,2018,laush dmitriy sergeevich,610660
71549,2017,"xropi,stev3ns",658870


<br>
<br>

Se obtienen las columnas 'recommend', 'user_id' e 'item_id' del dataset de user_reviews.

In [20]:
df_recomen = df_r[['recommend', 'user_id', 'item_id']]
df_recomen

Unnamed: 0,recommend,user_id,item_id
0,True,76561197970982479,1250.0
1,True,76561197970982479,22200.0
2,True,76561197970982479,43110.0
3,True,js41637,251610.0
4,True,js41637,227300.0
...,...,...,...
1495,True,76561198074347537,4000.0
1496,True,SurfChicken,730.0
1497,False,76561198126142125,236390.0
1498,False,Clouwolf,730.0


<br>
<br>

#### Se realiza el merge de los datasets a traves del item_id.

In [21]:
df_best_developer = df_dev.merge(df_recomen, on='item_id')
df_best_developer

Unnamed: 0,release_year,developer,item_id,recommend,user_id
0,1998,valve,70,True,EizanAratoFujimaki
1,1998,valve,70,True,GamerFag
2,1998,valve,70,True,76561198020928326
3,1998,valve,70,True,Bluegills
4,2006,facepunch studios,4000,True,WeiEDKrSat
...,...,...,...,...,...
3320,2004,valve,220,True,chidvd
3321,2006,valve,380,True,GamerFag
3322,1999,valve,20,False,76561198039441595
3323,2000,valve,10,True,Bennysaputra


In [22]:
# Elimino los registros con release_year = sin_dato_fecha 
df_best_developer.drop(df_best_developer[df_best_developer['release_year'] == "Sin_dato_fecha"].index, inplace=True)

# convierto la columna a entero, para poder utilizar bien la función 
df_best_developer['release_year'] = df_best_developer['release_year'].astype(int)

<br>
<br>

#### Se carga como parquet

In [23]:
# Convierte el DataFrame a un objeto Table de PyArrow
table = pa.Table.from_pandas(df_best_developer)

# Define las opciones de compresión con Snappy
compression = 'None'

# Guarda el objeto Table en formato Parquet con compresión Snappy
pq.write_table(table, r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\def_best_developer_year.parquet", compression=compression)

<br>
<br>

## 05 def developer_reviews_analysis

<br>

#### Según la desarrolladora dada, devuelve un diccionario con clave que es la desarrolladora y valor como una lista de cantidad de recomendaciones positivas y cantidad de recomendaciones negativas.

<br>
<br>

Se obtienen las columnas 'sentiment_analysis' e 'item_id' del dataset user_reviews

In [24]:
df_analisis = df_r[['sentiment_analysis', 'item_id']]
df_analisis

Unnamed: 0,sentiment_analysis,item_id
0,2,1250.0
1,2,22200.0
2,2,43110.0
3,2,251610.0
4,2,227300.0
...,...,...
1495,1,4000.0
1496,2,730.0
1497,0,236390.0
1498,2,730.0


<br>
<br>

Se obtienen las columnas 'developer' e 'item_id' del dataset steam_games

In [25]:
df_develop = df_g[['developer', 'item_id']]
df_develop

Unnamed: 0,developer,item_id
0,kotoshiro,761140
1,kotoshiro,761140
2,kotoshiro,761140
3,kotoshiro,761140
4,kotoshiro,761140
...,...,...
71546,laush dmitriy sergeevich,610660
71547,laush dmitriy sergeevich,610660
71548,laush dmitriy sergeevich,610660
71549,"xropi,stev3ns",658870


<br>
<br>

#### Se realiza el merge de los datasets a traves del item_id.

In [26]:
df_developer_reviews = df_develop.merge(df_analisis, on='item_id')
df_developer_reviews

Unnamed: 0,developer,item_id,sentiment_analysis
0,valve,70,2
1,valve,70,0
2,valve,70,2
3,valve,70,2
4,facepunch studios,4000,1
...,...,...,...
3320,valve,220,0
3321,valve,380,2
3322,valve,20,0
3323,valve,10,2


<br>
<br>

#### Se carga como parquet

In [27]:
# Convierte el DataFrame a un objeto Table de PyArrow
table = pa.Table.from_pandas(df_developer_reviews)

# Define las opciones de compresión con Snappy
compression = 'None'

# Guarda el objeto Table en formato Parquet con compresión Snappy
pq.write_table(table, r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\def_developer_reviews_analysis.parquet", compression=compression)

<br>
<br>

# Modelo de Recomendación

<br>

#### Con nuestro modelo de recomndación se busca obtener una lista de 5 juegos a partir de otro juego, de similares características.

<br>
<br>

Se analizan los Dataframes para seleccionar las variables

In [28]:
df_r.head()

Unnamed: 0,item_id,recommend,review,user_id,sentiment_analysis
0,1250.0,True,simple yet with great replayability. in my opi...,76561197970982479,2
1,22200.0,True,it's unique and worth a playthrough.,76561197970982479,2
2,43110.0,True,great atmosphere. the gunplay can be a bit chu...,76561197970982479,2
3,251610.0,True,i know what you think when you see this title ...,js41637,2
4,227300.0,True,for a simple (it's actually not all that simpl...,js41637,2


In [29]:
df_g.head()

Unnamed: 0,genres,title,price,item_id,developer,release_year
0,Action,lost summoner kitty,4.99,761140,kotoshiro,2018
1,Casual,lost summoner kitty,4.99,761140,kotoshiro,2018
2,Indie,lost summoner kitty,4.99,761140,kotoshiro,2018
3,Simulation,lost summoner kitty,4.99,761140,kotoshiro,2018
4,Strategy,lost summoner kitty,4.99,761140,kotoshiro,2018


<br>
<br>

Se filtran las columnas necesarias

In [30]:
df_pre_modelo_g = df_g[['genres', 'title', 'item_id', 'developer']]
df_pre_modelo_g

Unnamed: 0,genres,title,item_id,developer
0,Action,lost summoner kitty,761140,kotoshiro
1,Casual,lost summoner kitty,761140,kotoshiro
2,Indie,lost summoner kitty,761140,kotoshiro
3,Simulation,lost summoner kitty,761140,kotoshiro
4,Strategy,lost summoner kitty,761140,kotoshiro
...,...,...,...,...
71546,Indie,russian roads,610660,laush dmitriy sergeevich
71547,Racing,russian roads,610660,laush dmitriy sergeevich
71548,Simulation,russian roads,610660,laush dmitriy sergeevich
71549,Casual,exit 2 - directions,658870,"xropi,stev3ns"


In [31]:
df_pre_modelo_r = df_r[['item_id', 'recommend']]
df_pre_modelo_r

Unnamed: 0,item_id,recommend
0,1250.0,True
1,22200.0,True
2,43110.0,True
3,251610.0,True
4,227300.0,True
...,...,...
1495,4000.0,True
1496,730.0,True
1497,236390.0,False
1498,730.0,False


<br>
<br>

#### Se realiza el merge

In [32]:
df_modelo = df_pre_modelo_g.merge(df_pre_modelo_r, on='item_id')
df_modelo

Unnamed: 0,genres,title,item_id,developer,recommend
0,Action,half-life,70,valve,True
1,Action,half-life,70,valve,True
2,Action,half-life,70,valve,True
3,Action,half-life,70,valve,True
4,Indie,garry's mod,4000,facepunch studios,True
...,...,...,...,...,...
3320,Action,half-life 2,220,valve,True
3321,Action,half-life 2: episode one,380,valve,True
3322,Action,team fortress classic,20,valve,False
3323,Action,counter-strike,10,valve,True


<br>
<br>

Se trabaja el Dataframe

In [33]:
# Calcular el porcentaje de recomendación agrupando por title
recommendation_percentage = (df_modelo.groupby('title')['recommend'].mean() * 100).round(1)

# Fusionar los resultados con el DataFrame original
df_filtrado = df_modelo.merge(recommendation_percentage, on='title')

# Eliminar los valores duplicados y quedarse con una fila por genres-title
df_modelo_recomendacion = df_filtrado.drop_duplicates(subset=['genres', 'title'])
df_modelo_recomendacion


Unnamed: 0,genres,title,item_id,developer,recommend_x,recommend_y
0,Action,half-life,70,valve,True,100.0
4,Indie,garry's mod,4000,facepunch studios,True,97.9
52,Simulation,garry's mod,4000,facepunch studios,True,97.9
100,Action,ultimate doom,2280,id software,True,100.0
101,Racing,midnight club 2,12160,rockstar san diego,True,100.0
...,...,...,...,...,...,...
3315,Action,half-life 2,220,valve,True,100.0
3321,Action,half-life 2: episode one,380,valve,True,100.0
3322,Action,team fortress classic,20,valve,False,0.0
3323,Action,counter-strike,10,valve,True,100.0


<br>
<br>

Se elimina la columna recommend_x

In [34]:
# Eliminar la columna 'recommend_x'
df_modelo_recomendacion = df_modelo_recomendacion.drop('recommend_x', axis=1)

<br>
<br>

#### Se carga el dataframe

In [35]:
# Convierte el DataFrame a un objeto Table de PyArrow
table = pa.Table.from_pandas(df_modelo_recomendacion)

# Define las opciones de compresión con Snappy
compression = 'None'

# Guarda el objeto Table en formato Parquet con compresión Snappy
pq.write_table(table, r"C:\Users\Cebol\OneDrive\Escritorio\PI_01_steam\Datasets\def_recomendacion_juego.parquet", compression=compression)