# RankGeoFM

In [2]:
!pip3 install fastFM

Collecting fastFM
  Downloading fastFM-0.2.10.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fastFM
  Building wheel for fastFM (setup.py) ... [?25l[?25hdone
  Created wheel for fastFM: filename=fastFM-0.2.10-cp310-cp310-linux_x86_64.whl size=591644 sha256=49cf401b9132d38748fe30f7f5024ac420ec555217c44f81ad9929dfe93fed36
  Stored in directory: /root/.cache/pip/wheels/93/92/52/2da7997fcb7a7ce9042ff3b33836ef0c2fd47aa95382d7a113
Successfully built fastFM
Installing collected packages: fastFM
Successfully installed fastFM-0.2.10


In [3]:
import numpy as np
import pandas as pd
import fastFM
from fastFM.datasets import make_user_item_regression
from sklearn.model_selection import train_test_split
from fastFM import sgd
from fastFM import als
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix
from fastFM import mcmc
import functools as fct
import itertools as itools
import random, scipy

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
metadata = pd.read_feather('/content/drive/MyDrive/proyecto_rec_sys_IIC3633/metadata.feather')
reviews = pd.read_feather('/content/drive/MyDrive/proyecto_rec_sys_IIC3633/reviews.feather')

In [6]:
def bayesian_rating(row, C, m):
    R = row['avg_rating']
    v = row['num_of_reviews']
    return (v / (v + m)) * R + (m / (v + m)) * C

In [7]:
C = metadata['avg_rating'].mean()
m = metadata['num_of_reviews'].quantile(0.5) # Valor umbral, mediana de las reviews

metadata['rating_bayesiano'] = metadata.apply(bayesian_rating, C=C, m=m, axis=1)

In [8]:
np.random.seed(112)
users = reviews['user_id'].unique()
samp_users = np.random.choice(users, size=10000, replace=False)
samp = reviews[reviews['user_id'].isin(samp_users)]

In [9]:
import geopandas as gpd

shapefile_url = "https://data.cityofnewyork.us/api/geospatial/7t3b-ywvw?method=export&format=Shapefile"

boroughs = gpd.read_file(shapefile_url)

In [10]:
ids = samp['gmap_id'].unique()

In [11]:
samp_rest = metadata[metadata['gmap_id'].isin(ids)]

In [12]:
samp_rest.loc[:, ['category']] = samp_rest['category'].apply(lambda x: ', '.join(x))

In [13]:
categories = samp_rest['category'].str.get_dummies(sep=', ')
metadata_expanded = pd.concat([samp_rest.drop('category', axis=1), categories], axis=1)

metadata_expanded = gpd.GeoDataFrame(
    metadata_expanded, geometry=gpd.points_from_xy(metadata_expanded.longitude, metadata_expanded.latitude), crs="EPSG:4326"
)

In [14]:
columns_to_use = [col for col in metadata_expanded.columns if metadata_expanded[col].dtype in [np.float64, np.int64]]
#'latitude',
#'longitude',
#'avg_rating',
#'num_of_reviews',
#'index_right',
#'boro_code',
#'shape_leng',
#'shape_area',
#'rating_bayesiano',
#y todas las categoricas

In [15]:
columns_to_use.remove('boro_code')
columns_to_use.remove('shape_leng')
columns_to_use.remove('shape_area')
columns_to_use.remove('index_right')

In [16]:
col_norm = ['latitude',
'longitude',
'avg_rating',
'num_of_reviews',
'rating_bayesiano',]

scaler = StandardScaler()

# Normalizar las coordenadas y otros datos numéricos
metadata_expanded[col_norm] = scaler.fit_transform(
    metadata_expanded[col_norm]
)

### rankGeoFM

In [17]:
samp.columns

Index(['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'], dtype='object')

In [18]:
df = samp[['user_id', 'gmap_id', 'rating']]
df.head()

Unnamed: 0,user_id,gmap_id,rating
28,1.069871e+20,0x89c258b8e75311b5:0x745721b4d1be3147,5
32,1.148414e+20,0x89c258b8e75311b5:0x745721b4d1be3147,2
37,1.000514e+20,0x89c25bb23e0eec35:0xe8a6498dea29132b,5
46,1.15675e+20,0x89c259063d62f499:0x5b5cab85882fd605,2
55,1.081353e+20,0x89c259063d62f499:0x5b5cab85882fd605,5


In [19]:
# Conseguimos la latitud y longitud de los restaurantes desde metadata
df = df.merge(metadata[['gmap_id', 'latitude', 'longitude']], on='gmap_id')
df

Unnamed: 0,user_id,gmap_id,rating,latitude,longitude
0,1.069871e+20,0x89c258b8e75311b5:0x745721b4d1be3147,5,40.772618,-73.952116
1,1.148414e+20,0x89c258b8e75311b5:0x745721b4d1be3147,2,40.772618,-73.952116
2,1.000514e+20,0x89c25bb23e0eec35:0xe8a6498dea29132b,5,40.686172,-73.978705
3,1.156750e+20,0x89c259063d62f499:0x5b5cab85882fd605,2,40.745304,-73.978768
4,1.081353e+20,0x89c259063d62f499:0x5b5cab85882fd605,5,40.745304,-73.978768
...,...,...,...,...,...
104511,1.141151e+20,0x89c258bf7e188dc7:0xdbd9387d875e2a02,5,40.771111,-73.956389
104512,1.103083e+20,0x89c258bf7e188dc7:0xdbd9387d875e2a02,4,40.771111,-73.956389
104513,1.082240e+20,0x89c258bf7e188dc7:0xdbd9387d875e2a02,4,40.771111,-73.956389
104514,1.020892e+20,0x89c258bf7e188dc7:0xdbd9387d875e2a02,4,40.771111,-73.956389


In [20]:
# Definición de funciones para la preparación de la matriz de características
def get_single_entries_in_fm_input_format(data, itemlist):
    column = len(itemlist)
    row = len(data)
    shape = (row, column)
    row_inds = np.zeros(len(data), dtype=int)
    col_inds = np.zeros(len(data), dtype=int)
    datalist = np.zeros(len(data), dtype=float)

    for i, item in enumerate(data):
        datalist[i] = 1
        if item in itemlist:
            col_inds[i] = np.where(itemlist == item)[0][0]
        else:
            raise ValueError(f"Item {item} not found in itemlist.")
        row_inds[i] = i

    return datalist, row_inds, col_inds, shape

def add_geographical_features(data, feature_names):
    num_entries = len(data)
    geodata = np.zeros((num_entries, 2), dtype=float)
    for i in range(num_entries):
        geodata[i, 0] = data.iloc[i]['latitude']
        geodata[i, 1] = data.iloc[i]['longitude']
    row_indices = np.repeat(np.arange(num_entries), 2)
    col_indices = np.array([feature_names.index('latitude'), feature_names.index('longitude')] * num_entries)
    data_values = geodata.flatten()
    return data_values, row_indices, col_indices

In [21]:
df.reset_index(inplace=True)

In [22]:
# Mapeo de identificaciones a índices
user_indices = {user: i for i, user in enumerate(df['user_id'].unique())}
restaurant_indices = {restaurant: i for i, restaurant in enumerate(df['gmap_id'].unique())}

In [23]:
# Creamos características
user_data = df['user_id'].map(user_indices).values
restaurant_data = df['gmap_id'].map(restaurant_indices).values

In [24]:
# Aplicamos funciones para obtener la matriz sparse
user_datalist, user_row_inds, user_col_inds, _ = get_single_entries_in_fm_input_format(user_data, list(user_indices.values()))
restaurant_datalist, restaurant_row_inds, restaurant_col_inds, _ = get_single_entries_in_fm_input_format(restaurant_data, list(restaurant_indices.values()))
geo_datalist, geo_row_inds, geo_col_inds = add_geographical_features(df, ['latitude', 'longitude'])

In [25]:
# Combinamos todas las características en una sola matriz sparse
datalist = np.concatenate([user_datalist, restaurant_datalist, geo_datalist])
row_inds = np.concatenate([user_row_inds, restaurant_row_inds, geo_row_inds])
col_inds = np.concatenate([user_col_inds, restaurant_col_inds, geo_col_inds])
total_features = len(user_indices) + len(restaurant_indices) + 2
shape = (df.shape[0], total_features)
X = csc_matrix((datalist, (row_inds, col_inds)), shape=shape)
y = df['rating'].values

In [26]:
# Dividimos los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, df.index, test_size=0.25, random_state=42)

In [27]:
# Entrenamiento del modelo de factorización de matrices
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=10, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)

In [28]:
# Predicción
y_pred = fm.predict(X_test)

In [29]:
# Calculamos errores
error_als = mean_squared_error(y_test, y_pred)
print('Mean squared error under ALS: {}'.format(error_als))
print('Root mean squared error under ALS: {}'.format(np.sqrt(error_als)))

mae = np.mean(np.abs(y_test - y_pred))
print('Mean absolute error under ALS: {}'.format(mae))

Mean squared error under ALS: 1.6134929205547694
Root mean squared error under ALS: 1.2702334118400325
Mean absolute error under ALS: 0.936707028654463


In [30]:
# Preparamos DataFrame para evaluación de métricas
test_users = df.loc[idx_test, 'user_id']
test_restaurants = df.loc[idx_test, 'gmap_id']
predictions_df = pd.DataFrame({
    'user_id': test_users.values,
    'restaurant_id': test_restaurants.values,
    'predicted_rating': y_pred,
    'true_rating': y_test
})
predictions_df.sort_values(by=['user_id', 'predicted_rating'], ascending=[True, False], inplace=True)

In [31]:
def precision_at_k(df, k, threshold):
    # Calcula precision para cada usuario
    precisions = []
    for _, group in df.groupby('user_id'):
        relevant = group['true_rating'] >= threshold
        precisions.append(relevant.head(k).sum() / k)
    return np.mean(precisions)

def recall_at_k(df, k, threshold):
    # Calcula recall para cada usuario
    recalls = []
    for _, group in df.groupby('user_id'):
        relevant = group['true_rating'] >= threshold
        total_relevant = relevant.sum()
        if total_relevant > 0:
            recalls.append(relevant.head(k).sum() / total_relevant)
        else:
            recalls.append(0)
    return np.mean(recalls)

def mean_average_precision_at_k(df, k, threshold):
    # Calcula MAP para cada usuario
    ap_scores = []
    for _, group in df.groupby('user_id'):
        relevant = group['true_rating'] >= threshold
        n_relevant = 0
        average_precision = 0.0
        for i, rel in enumerate(relevant.head(k), start=1):
            if rel:
                n_relevant += 1
                average_precision += n_relevant / i
        if n_relevant > 0:
            average_precision /= n_relevant
        ap_scores.append(average_precision)
    return np.mean(ap_scores)

def dcg_at_k(scores, k):
    """Calculate DCG for the top k scoring items."""
    return scores[:k].sum()

def ndcg_at_k(df, k, threshold):
    """Calculate normalized DCG for all users and return the average."""
    ndcg_scores = []
    for _, group in df.groupby('user_id'):
        relevance_scores = (group['true_rating'] >= threshold).astype(int).values
        sorted_scores = np.sort(relevance_scores)[::-1]

        # Calculate DCG for the top k recommendations
        dcg = dcg_at_k(relevance_scores[:k], k)

        # Calculate ideal DCG for the top k recommendations
        idcg = dcg_at_k(sorted_scores[:k], k)

        # Protect against division by zero in case idcg is zero
        if idcg == 0:
            ndcg_scores.append(0)
        else:
            ndcg_scores.append(dcg / idcg)

    return np.mean(ndcg_scores) if ndcg_scores else 0

In [32]:
k = 10
threshold = 5

precision = precision_at_k(predictions_df, k, threshold)
recall = recall_at_k(predictions_df, k, threshold)
map_score = mean_average_precision_at_k(predictions_df, k, threshold)

print(f"Precision@{k}: {precision:.4f}")
print(f"Recall@{k}: {recall:.4f}")
print(f"MAP@{k}: {map_score:.4f}")

try:
    ndcg = ndcg_at_k(predictions_df, k, threshold)
    print(f"NDCG@{k}: {ndcg:.4f}")
except Exception as e:
    print(f"An error occurred: {e}")

Precision@10: 0.1695
Recall@10: 0.7445
MAP@10: 0.6498
NDCG@10: 0.7477


#### Evaluación para un usuario específico

In [47]:
user_id = 110104426283076616192

In [48]:
# Obtenemos todos los restaurantes que el usuario específico ya ha calificado
user_rated_restaurants = df[df['user_id'] == user_id]['gmap_id'].unique()

# Todos los restaurantes posibles
all_restaurants = df['gmap_id'].unique()

# Filtramos para obtener solo los restaurantes no calificados por el usuario
unrated_restaurants = np.setdiff1d(all_restaurants, user_rated_restaurants)

In [49]:
# Creamos el array para el usuario específico, repitiendo su ID para cada restaurante no calificado
user_data = np.array([user_id] * len(unrated_restaurants))

# Usamos la función para transformar los datos de usuario y restaurante a formato disperso
user_datalist, user_row_inds, user_col_inds, user_shape = get_single_entries_in_fm_input_format(user_data, userlist)
restaurant_datalist, restaurant_row_inds, restaurant_col_inds, restaurant_shape = get_single_entries_in_fm_input_format(unrated_restaurants, restaurantlist)

# Combinamos las entradas para la matriz dispersa
from scipy.sparse import coo_matrix

# Nos aseguramos de ajustar los índices de columna para los restaurantes
shift_cols = len(userlist)
restaurant_col_inds = shift_cols + restaurant_col_inds

total_features = len(userlist) + len(restaurantlist)
total_entries = len(user_datalist) + len(restaurant_datalist)
datalist = np.concatenate([user_datalist, restaurant_datalist])
row_inds = np.concatenate([user_row_inds, restaurant_row_inds])
col_inds = np.concatenate([user_col_inds, restaurant_col_inds])

# Creamos la matriz dispersa final para las predicciones
X_predict = coo_matrix((datalist, (row_inds, col_inds)), shape=(user_shape[0], total_features))

In [50]:
print("Número de características en X_predict: ", X_predict.shape[1])
print("Número de características que el modelo espera: ", len(fm.w_))

Número de características en X_predict:  26937
Número de características que el modelo espera:  26939


In [51]:
from scipy.sparse import hstack

if X_predict.shape[1] != len(fm.w_):
    # Ajustamos X_predict
    missing_cols = len(fm.w_) - X_predict.shape[1]
    if missing_cols > 0:
        X_predict = hstack([X_predict, coo_matrix((X_predict.shape[0], missing_cols))], format='coo')

In [52]:
print("Número de características en X_predict: ", X_predict.shape[1])
print("Número de características que el modelo espera: ", len(fm.w_))

Número de características en X_predict:  26939
Número de características que el modelo espera:  26939


In [53]:
y_pred_user = fm.predict(X_predict)

# Mostramos las predicciones
predictions_df = pd.DataFrame({
    'restaurant_id': unrated_restaurants,
    'predicted_rating': y_pred_user
})
top_predictions = predictions_df.sort_values(by='predicted_rating', ascending=False).head(10)
print(top_predictions)

                               restaurant_id  predicted_rating
7824   0x89c259fc38103eeb:0x9d312733d2e644c6         11.266483
8931   0x89c25ae490fc5ad1:0x1674fb4dcf8c942a         11.170501
5709   0x89c2599220925a47:0x23446a07fc576647         11.141535
6029   0x89c2599847b3905d:0x5e1bf0d8a3586757         11.132243
7817    0x89c259f7699acdd3:0x8be5fd69ac00af9         11.125738
10873   0x89c25c9972502351:0x6a3b1a4849fb836         11.056967
14711  0x89c28cb9632a19a5:0xec5fa0b8aa9b78ea         11.051325
4254   0x89c25928790e3d2b:0x739b3c92a765e8c9         10.975860
8772   0x89c25aae09d307d7:0x82eb059e0bce24d1         10.956107
12015  0x89c25f2e06a8d0fb:0xae66818256a80dbf         10.946085


In [54]:
# Normalizamos las predicciones
y_pred_user_clipped = np.clip(y_pred_user, 1, 5)

# Mostramos las predicciones
predictions_df = pd.DataFrame({
    'restaurant_id': unrated_restaurants,
    'predicted_rating': y_pred_user_clipped
})
top_predictions = predictions_df.sort_values(by='predicted_rating', ascending=False).head(10)
print(top_predictions)

                               restaurant_id  predicted_rating
0      0x4065fa5574817035:0x55c809f95c507191               5.0
11290  0x89c25df5fb899e65:0x806e9d7c700fb016               5.0
11276  0x89c25df198d23ea1:0x8937806281c5bc4a               5.0
11277  0x89c25df27353c7d3:0x5a17c12a4cf6ea57               5.0
11278  0x89c25df3ab462e9d:0xa857aaab401b289b               5.0
11279  0x89c25df40dc00101:0xc4cb3317ac05d046               5.0
11280  0x89c25df4368aab87:0x99d04f24d71b2cc9               5.0
11281  0x89c25df436af0d73:0x526b67122b725eb4               5.0
11282  0x89c25df437c465e5:0x9847a3666f6eb234               5.0
11283  0x89c25df437e4960b:0x4840493ef9cacac9               5.0
