# Recommender Systems


## Imports


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Reader, AlgoBase, Dataset, accuracy
from surprise.model_selection import train_test_split, cross_validate, KFold

from math import radians, sin, cos, sqrt, atan2
import json

In [23]:
SEED = 42
np.random.seed(SEED)

# Set plot styling
plt.style.use("ggplot")
sns.set(style="whitegrid")

## Load Data


In [24]:
# Set the path to your dataset folder
data_path = "../data/"

# Load each CSV into a DataFrame
df = pd.read_csv(data_path + "merged.csv")

# Feature engineering

In [25]:
df

Unnamed: 0,user_id,restaurant_id,rating,review_text,source_user_name,weekday,review_length,name,address,location_lat,...,overall_magnitude,food_score,service_score,value_score,ambiance_score,language,emotions,year,month,day
0,1,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,"As soon as you step inside, you’re enveloped i...",Melissa Pedroso-Pearson,Thursday,64,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.416360,...,5.0,0.90,-0.1,0.0,0.9,en,"[""satisfaction""]",2024,12,19
1,2,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,"If you can afford the place, it is well worth ...",Guillaume Slama,Monday,146,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.416360,...,5.9,0.45,0.1,0.8,-0.1,en,"[""contentment""]",2024,9,30
2,3,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,The CEBO offers excellent creative and yet “Sp...,Martin MD,Friday,29,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.416360,...,1.9,0.90,0.0,0.0,0.0,en,"[""joy""]",2025,2,7
3,4,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,Amazing experience. We had the 16 course tasti...,Ju Do,Friday,38,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.416360,...,6.6,0.90,0.9,0.0,0.0,en,"[""joy""]",2024,6,14
4,5,ChIJ189rc3soQg0R7l7WUa5fgo8,3,"When in Madrid. That’s me, I’m in Madrid. Grea...",Alex Yang,Saturday,89,El Minibar,"C/ del Mesón de Paños, 1, Centro, 28013 Madrid...",40.416334,...,7.3,0.00,0.0,0.0,0.9,en,"[""contentment""]",2025,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115,1951,ChIJzX2zEfIpQg0RaUfFohjpnDE,2,I was expecting something good from the pictur...,Romin,Sunday,75,Tropicalista GastroBar | Brasileño en Madrid,"C. de la Ballesta, 2, Centro, 28004 Madrid, Spain",40.421114,...,6.2,-0.75,0.0,0.0,0.0,en,"[""frustration""]",2025,2,23
2116,1845,ChIJzX2zEfIpQg0RaUfFohjpnDE,4,The Coxinha was really nice. The frango was ok...,Kansala Wanuntho,Sunday,20,Tropicalista GastroBar | Brasileño en Madrid,"C. de la Ballesta, 2, Centro, 28004 Madrid, Spain",40.421114,...,2.1,0.00,0.8,0.0,0.8,en,"[""satisfaction""]",2025,3,9
2117,1952,ChIJzX2zEfIpQg0RaUfFohjpnDE,5,Brazilian paradise close to Gran Vía and close...,Gabriel Zaiden,Monday,29,Tropicalista GastroBar | Brasileño en Madrid,"C. de la Ballesta, 2, Centro, 28004 Madrid, Spain",40.421114,...,2.6,0.90,0.9,0.0,0.0,en,"[""joy""]",2025,3,3
2118,1953,ChIJzX2zEfIpQg0RaUfFohjpnDE,5,Found this restaurant in the back streets of G...,A S,Monday,87,Tropicalista GastroBar | Brasileño en Madrid,"C. de la Ballesta, 2, Centro, 28004 Madrid, Spain",40.421114,...,5.1,0.00,0.0,0.9,0.0,en,"[""joy""]",2023,10,30


In [26]:
CENTRE_LAT, CENTRE_LON = 40.4168, -3.7038

df["region"] = (
    df["location_lat"].apply(lambda x: "North" if x > CENTRE_LAT else "South") + "_" +
    df["location_lng"].apply(lambda x: "East" if x > CENTRE_LON else "West")
)


# Haversine distance to centre
def haversine(lat, lon):
    R = 6371  # Earth radius in km
    dlat = radians(lat - CENTRE_LAT)
    dlon = radians(lon - CENTRE_LON)
    a = sin(dlat/2)**2 + cos(radians(CENTRE_LAT)) * cos(radians(lat)) * sin(dlon/2)**2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

df["dist_to_centre_km"] = df.apply(lambda r: haversine(r.location_lat, r.location_lng), axis=1)


In [27]:
month_to_season = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall"
}
df["season"] = df["month"].map(month_to_season)

In [28]:
# Safely parse JSON, skip NaNs
def safe_parse(attr):
    if pd.isna(attr):
        return {}
    try:
        return json.loads(attr)
    except Exception:
        return {}

# Apply safe parsing and expand into columns
attributes_expanded = df['attributes'].apply(safe_parse).apply(pd.Series)

# Merge back and drop original 'attributes' column
df = pd.concat([df.drop(columns=['attributes']), attributes_expanded], axis=1)


In [29]:
df.head(5)

Unnamed: 0,user_id,restaurant_id,rating,review_text,source_user_name,weekday,review_length,name,address,location_lat,...,day,region,dist_to_centre_km,season,dineIn,takeout,delivery,outdoorSeating,reservable,userRatingCount
0,1,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,"As soon as you step inside, you’re enveloped i...",Melissa Pedroso-Pearson,Thursday,64,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.41636,...,19,South_East,0.44041,Winter,True,False,False,False,True,1006.0
1,2,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,"If you can afford the place, it is well worth ...",Guillaume Slama,Monday,146,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.41636,...,30,South_East,0.44041,Fall,True,False,False,False,True,1006.0
2,3,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,The CEBO offers excellent creative and yet “Sp...,Martin MD,Friday,29,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.41636,...,7,South_East,0.44041,Winter,True,False,False,False,True,1006.0
3,4,ChIJaZxVboEoQg0R2Xyk4VBN6QA,5,Amazing experience. We had the 16 course tasti...,Ju Do,Friday,38,CEBO Madrid Restaurant,"Cra de S. Jerónimo, 34, Centro, 28014 Madrid, ...",40.41636,...,14,South_East,0.44041,Summer,True,False,False,False,True,1006.0
4,5,ChIJ189rc3soQg0R7l7WUa5fgo8,3,"When in Madrid. That’s me, I’m in Madrid. Grea...",Alex Yang,Saturday,89,El Minibar,"C/ del Mesón de Paños, 1, Centro, 28013 Madrid...",40.416334,...,8,South_West,0.459365,Spring,True,False,False,False,True,7216.0


In [30]:
# List of attribute columns to dummy encode (already parsed from JSON)
attribute_cols = ["dineIn", "takeout", "delivery", "outdoorSeating", "reservable"]

# Convert boolean columns to strings (if needed) to get proper dummy variables
df[attribute_cols] = df[attribute_cols].astype(str)

# Include original categorical columns + attributes for one-hot encoding
categorical_cols = ["region", "season", "weekday"] + attribute_cols

# Perform one-hot encoding
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


## Test/Train Split


In [32]:
reader = Reader(rating_scale=(df["rating"].min(), df["rating"].max()))
data = Dataset.load_from_df(df[["user_id", "item_id", "rating"]], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=SEED)

KeyError: "['item_id'] not in index"

## Recommenders


### Random


In [None]:
class MyRandomRecommender(AlgoBase):
    """
    Random recommender based on the normal distribution of ratings.
    """

    def __init__(self):
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        """
        Predict a random rating from the normal distribution.
        Note: u and i are not used as this is non-personalized.
        """
        return np.random.normal(loc=self.train_mean, scale=self.train_std)

    def fit(self, trainset):
        """
        Train the algorithm on the trainset.
        """
        AlgoBase.fit(self, trainset)

        # Calculate mean and standard deviation of all ratings
        ratings = [r for (_, _, r) in self.trainset.all_ratings()]
        self.train_mean = np.mean(ratings)
        self.train_std = np.std(ratings)

        return self


random_RS = MyRandomRecommender()

### Popular


In [None]:
class MyPopularRecommender(AlgoBase):
    """
    Popularity-based recommender that predicts the average rating for each item.
    """

    def __init__(self):
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        """
        Predict the average rating for item i.
        Note: u is not used as this is non-personalized.
        """
        # Convert internal item id to raw item id
        try:
            raw_iid = self.trainset.to_raw_iid(i)
        except ValueError:
            # If item not in training set, return global mean
            return self.trainset.global_mean

        if raw_iid in self.mean_rating_per_item_df.index:
            return self.mean_rating_per_item_df.loc[raw_iid]["rating"]
        else:
            # For items not in training set, return global mean
            return self.trainset.global_mean

    def fit(self, trainset):
        """
        Train the algorithm on the trainset.
        """
        AlgoBase.fit(self, trainset)

        # Convert internal ids to raw ids for better readability
        ratings_list = [
            (self.trainset.to_raw_iid(i), r)
            for (_, i, r) in self.trainset.all_ratings()
        ]

        # Create DataFrame of all ratings
        ratings_df = pd.DataFrame(ratings_list, columns=["item", "rating"])

        # Calculate mean rating per item
        self.mean_rating_per_item_df = ratings_df.groupby("item").agg(
            {"rating": "mean"}
        )

        return self


popular_RS = MyPopularRecommender()

### Collaborative Filtering


### Content Based


### Context Aware


### Armed Bandits


### Hybrid


## Evaluation


### Test RMSE


In [None]:
random_RS.fit(trainset)
predictions_random = random_RS.test(testset)
accuracy.rmse(predictions_random, verbose=True)

In [None]:
popular_RS.fit(trainset)
predictions_popular = popular_RS.test(testset)
accuracy.rmse(predictions_popular, verbose=True)

### Cross-Validation


### Quality Metrics


## Results
