In [1]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd
import numpy as np
import json
from geopy.distance import geodesic
from pathlib import Path
import warnings
import os

# Selection des features numériques

L’objectif de ce projet est d’identifier des profils de restaurants distincts à partir de signaux observables dans les données Yelp sans imposer de catégories prédéfinies. Plutôt que d'utiliser un système de règles ou des labels arbitraires, nous avons choisi une approche non supervisée qui repose sur des indicateurs interprétables.

La sélection des features repose sur l’hypothèse que la perception d’un restaurant ne dépend pas uniquement de sa note moyenne, mais d’une combinaison de facteurs complémentaires comme sa popularité, la qualité perçue, l'engagement des clients, la dynamique temporelle et le profil des reviewers. Ces dimensions permettent de capturer à la fois la performance actuelle d’un établissement, sa trajectoire dans le temps et la nature de l’attention qu’il suscite.

Les features retenues ont été volontairement limitées à des indicateurs robustes agrégés au niveau du restaurant pour garantir la stabilité du clustering et faciliter l’interprétation. Les contenus textuels des avis n’ont pas été utilisés directement pour la construction des clusters, mais mobilisés dans un second temps via des embeddings SBERT afin d’expliquer et de caractériser qualitativement les groupes identifiés.

Ce fichier est destiné à la construction des features numériques que nous allons utiliser.

### Importation des données

In [4]:
files_to_check = [
    'data-2/philly_restaurants.json',
    'data-2/philly_restaurant_reviews.json',
    'data-2/philly_restaurant_users.json',
    'data-2/philly_restaurant_photos.json'
]


def load_json_data(filepath):
    """Charge un fichier JSON ligne par ligne (format Yelp)"""
    data = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                data.append(json.loads(line))
        print(f"{filepath}: {len(data)} enregistrements chargés")
        return pd.DataFrame(data)
    except FileNotFoundError:
        print(f"{filepath}: Fichier non trouvé")
        return None
    except Exception as e:
        print(f"{filepath}: Erreur - {e}")
        return None

# Chargement des datasets
restaurants_df = load_json_data(files_to_check[0])
reviews_df = load_json_data(files_to_check[1])
users_df = load_json_data(files_to_check[2])
photos_df = load_json_data(files_to_check[3])

data-2/philly_restaurants.json: 5852 enregistrements chargés
data-2/philly_restaurant_reviews.json: 687289 enregistrements chargés
data-2/philly_restaurant_users.json: 209513 enregistrements chargés
data-2/philly_restaurant_photos.json: 22295 enregistrements chargés


## Dataset

### Léger nettoyage des avis

In [23]:
restaurant_reviews = None

if reviews_df is not None and restaurants_df is not None:
    # Identifier colonnes clés
    business_id_col = 'business_id'
    text_col = None
    stars_col = None

    for col in reviews_df.columns:
        if 'text' in col.lower() and text_col is None:
            text_col = col
        if 'star' in col.lower() and stars_col is None:
            stars_col = col

    print(f" Colonne business_id: {business_id_col}")
    print(f" Colonne texte: {text_col}")
    print(f" Colonne notes: {stars_col}")

if text_col and stars_col:
        # Joindre avis et restaurants
        restaurant_reviews = reviews_df.merge(
            restaurants_df[[business_id_col, 'name']],
            on=business_id_col,
            how='inner'
        )

        print(f"\n Avis de restaurants: {len(restaurant_reviews):,}")
        print(f"   (sur {len(reviews_df):,} avis totaux)")

        # Distribution notes
        print(f"\n Distribution des notes:")
        stars_dist = restaurant_reviews[stars_col].value_counts().sort_index()
        for star, count in stars_dist.items():
            pct = count / len(restaurant_reviews) * 100
            bar = '█' * int(pct / 2)
            print(f"   {star}★: {count:>8,} ({pct:>5.1f}%) {bar}")

        # Longueur des avis
        restaurant_reviews['review_length'] = restaurant_reviews[text_col].str.len()

        print(f"\n Longueur des avis:")
        print(f"   Moyenne: {restaurant_reviews['review_length'].mean():.0f} caractères")
        print(f"   Médiane: {restaurant_reviews['review_length'].median():.0f} caractères")
        print(f"   Min: {restaurant_reviews['review_length'].min():.0f}")
        print(f"   Max: {restaurant_reviews['review_length'].max():.0f}")

        # Nettoyage minimal (seulement avis complètement vides)
        # Avant j'avais enlevé les trops courts mais en vrai ils sont quand meme relevant (ex: Top, Super, etc.)
        print(f"\n Nettoyage minimal (uniquement avis vides)")
        initial = len(restaurant_reviews)
        restaurant_reviews = restaurant_reviews[
            restaurant_reviews[text_col].notna() &
            (restaurant_reviews[text_col].str.strip() != '')
        ].copy()
        print(f"   Avis vides supprimés: {initial - len(restaurant_reviews):,}")

 Colonne business_id: business_id
 Colonne texte: text
 Colonne notes: stars

 Avis de restaurants: 687,289
   (sur 687,289 avis totaux)

 Distribution des notes:
   1.0★:   66,624 (  9.7%) ████
   2.0★:   57,480 (  8.4%) ████
   3.0★:   91,702 ( 13.3%) ██████
   4.0★:  194,366 ( 28.3%) ██████████████
   5.0★:  277,117 ( 40.3%) ████████████████████

 Longueur des avis:
   Moyenne: 602 caractères
   Médiane: 443 caractères
   Min: 1
   Max: 5000

 Nettoyage minimal (uniquement avis vides)
   Avis vides supprimés: 0


In [5]:
restaurant_reviews_users = restaurant_reviews.merge(
    users_df[[
        "user_id",
        "review_count",
        "yelping_since",
        "elite",
        "fans",
        "useful",
        "funny",
        "cool",
        "average_stars",
        *[c for c in users_df.columns if c.startswith("compliment_")]
    ]],
    on="user_id",
    how="left"
)

In [6]:
df = restaurant_reviews_users.copy()

## Popularité

### Volume d'avis

In [7]:
popularity_features = (
    df.groupby("business_id")
    .agg(
        review_count=("stars", "count")
    )
    .reset_index()
)

popularity_features["log_review_count"] = np.log1p(
    popularity_features["review_count"]
)

## Qualité

### Note moyenne et variance des notes

In [8]:
quality_features = (
    df.groupby("business_id")
    .agg(
        stars=("stars", "mean"),
        rating_std=("stars", "std"),
        positive_ratio=("stars", lambda x: (x >= 4).mean()),
        negative_ratio=("stars", lambda x: (x <= 2).mean())
    )
    .reset_index()
)

quality_features["rating_std"] = quality_features["rating_std"].fillna(0)

## Engagement

In [9]:
photos = photos_df.copy()

### Nombre de photos

In [10]:
engagement_features = (
    photos
    .groupby("business_id")
    .size()
    .rename("photo_count")
    .reset_index()
)

engagement_features["log_photo_count"] = np.log1p(
    engagement_features["photo_count"]
)

## Temporel

### Age du resto et croissance dans les 6 derniers mois

In [11]:
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# âge du business
temporal_features = (
    df.groupby("business_id")
    .agg(
        first_review=("date", "min"),
        last_review=("date", "max"),
        total_reviews=("stars", "count")
    )
    .reset_index()
)

ref_date = df["date"].max()

temporal_features["business_age_years"] = (
    (ref_date - temporal_features["first_review"]).dt.days / 365
).clip(lower=0.1)

# croissance récente (6 derniers mois)
recent_cutoff = ref_date - pd.Timedelta(days=180)

recent_reviews = (
    df[df["date"] >= recent_cutoff]
    .groupby("business_id")
    .size()
    .rename("recent_reviews")
    .reset_index()
)

temporal_features = temporal_features.merge(
    recent_reviews,
    on="business_id",
    how="left"
)

temporal_features["recent_reviews"] = temporal_features["recent_reviews"].fillna(0)

temporal_features["review_growth_rate"] = (
    temporal_features["recent_reviews"] /
    temporal_features["total_reviews"]
)

## Ambiance

In [14]:
import ast

# Mapping de 1 à 9 pour les 9 catégories d'ambiance
ambiance_mapping = {
    'romantic': 1,
    'intimate': 2,
    'classy': 3,
    'upscale': 4,
    'trendy': 5,
    'hipster': 6,
    'divey': 7,
    'touristy': 8,
    'casual': 9
}

def get_ambiance_cluster(ambiance_val):
    # Cas où la donnée est absente
    if pd.isna(ambiance_val) or ambiance_val == "None":
        return 0
    
    try:
        # Conversion en dictionnaire si c'est une chaîne de caractères
        if isinstance(ambiance_val, str):
            amb_dict = ast.literal_eval(ambiance_val)
        else:
            amb_dict = ambiance_val
            
        #On vérifie chaque catégorie dans l'ordre du mapping
        # Le premier qui est à True donne son numéro
        for key, cluster_value in ambiance_mapping.items():
            if amb_dict.get(key) is True:
                return cluster_value
        
        #Si aucun n'est True (tous à False ou None)
        return 0
    except:
        return 0

# Extraction de la colonne Ambience et création du cluster
# On utilise .get() pour éviter les erreurs si la clé 'Ambience' n'existe pas dans attributes
ambience_col = restaurants_df["attributes"].apply(lambda x: x.get("Ambience") if isinstance(x, dict) else None)
restaurants_df['cluster_ambiance'] = ambience_col.apply(get_ambiance_cluster)

In [16]:
restaurants_df['cluster_ambiance'].value_counts().sort_index()

cluster_ambiance
0    3072
1     112
2     106
3     703
4       9
5     169
6      98
7     111
8      24
9    1448
Name: count, dtype: int64

## Rewiewers

### Elite users

In [17]:
# Date création du compte
df['yelping_since'] = pd.to_datetime(
    df['yelping_since'],
    errors='coerce'
)

df = df.dropna(
    subset=['yelping_since', 'date']
)

# Ancienneté + nb de revues par an
ref_date = df["date"].max()

df["account_age_years"] = (
    (ref_date - df["yelping_since"]).dt.days / 365
).clip(lower=0.1)

df["reviews_per_year"] = (
    df["review_count"] /
    df["account_age_years"]
)
# nombre d'années où l'utilisateur est élite
df['elite_years'] = df['elite'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

#ratio élite
df['elite_ratio'] = df['elite_years'] / df['account_age_years']

#Si c'est le ratio est sup à 0.15 c'est un pro elite
df['is_pro_elite_user'] = (
    df['elite_ratio'] >= 0.15
).astype(int)

### Experts

In [18]:
#Pour les utilisateurs expert
df['is_expert'] = (
    (df['reviews_per_year'] >= 8) &
    (df['account_age_years'] >= 2) &
    (
        df['average_stars'].between(3.2, 4.6) |
        (df['elite_ratio'] >= 0.15)
    )
).astype(int)

def mean_expert_rating(df):
    return df[df['is_expert'] == 1]['stars'].mean()

### Influenceurs

In [19]:
# Compliments
compliment_cols = [c for c in restaurant_reviews.columns if c.startswith("compliment_")]

df["compliment_score"] = (
    df[compliment_cols].sum(axis=1)
)

# Score d'influence
df["influence_score"] = (
    df["fans"] * 2 + df["compliment_score"]
)

# Compression des extrêmes
df["log_influence_score"] = np.log1p(
    df["influence_score"]
)

threshold = df["log_influence_score"].quantile(0.80)

df["is_high_influence_user"] = (
    df["log_influence_score"] >= threshold
).astype(int)

### Ratios d'expert, elites et influenceurs

In [20]:
reviewer_features = (
    df.groupby("business_id")
    .agg(
        expert_reviewer_ratio=("is_expert", "mean"),
        elite_reviewer_ratio=("is_pro_elite_user", "mean"),
        high_influence_ratio=("is_high_influence_user", "mean"),
        mean_influence_score=("log_influence_score", "mean")
    )
    .reset_index()
)


# Merge Final

In [21]:
features_df = (
    popularity_features
    .merge(quality_features, on="business_id", how="left")
    .merge(engagement_features, on="business_id", how="left")
    .merge(temporal_features[
        ["business_id", "business_age_years", "review_growth_rate"]
    ], on="business_id", how="left")
    .merge(reviewer_features, on="business_id", how="left")
   
)
features_df = pd.merge(
    features_df, 
    restaurants_df[['cluster_ambiance', 'business_id']], 
    on="business_id", 
    how="left"
)
features_df = pd.merge(
    features_df, 
    restaurants_df[['is_open', 'business_id']], 
    on="business_id", 
    how="left"
)

# Features finaux

In [22]:
features_df

Unnamed: 0,business_id,review_count,log_review_count,stars,rating_std,positive_ratio,negative_ratio,photo_count,log_photo_count,business_age_years,review_growth_rate,expert_reviewer_ratio,elite_reviewer_ratio,high_influence_ratio,mean_influence_score,cluster_ambiance,is_open
0,-0M0b-XhtFagyLmsBtOe8w,18,2.944439,3.722222,0.894792,0.722222,0.055556,1.0,0.693147,9.947945,0.000000,0.555556,0.0,0.500000,2.795045,2,0
1,-0PN_KFPtbnLQZEeb23XiA,11,2.484907,3.363636,1.361817,0.545455,0.181818,,,12.268493,0.000000,0.272727,0.0,0.181818,1.469558,0,0
2,-0TffRSXXIlBYVbb5AwfTg,1132,7.032624,4.355124,0.983165,0.851590,0.066254,52.0,3.970292,8.641096,0.041519,0.380742,0.0,0.183746,1.486067,3,1
3,-0eUa8TsXFFy0FCxHYmrjg,28,3.367296,3.964286,1.070899,0.714286,0.142857,1.0,0.693147,6.254795,0.000000,0.392857,0.0,0.107143,1.333880,0,0
4,-1B9pP_CrRBJYPICE5WbRA,851,6.747587,3.808461,1.205760,0.696827,0.150411,7.0,2.079442,9.512329,0.014101,0.327850,0.0,0.166863,1.406769,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5847,zxY4DgtXsVHihSUpsmwamg,6,1.945910,4.333333,0.516398,1.000000,0.000000,,,9.709589,0.000000,0.666667,0.0,0.500000,3.501224,0,1
5848,zy7uNOvpykrq-XlmDY_wHA,6,1.945910,3.166667,0.983192,0.500000,0.333333,,,14.008219,0.000000,0.500000,0.0,0.500000,3.254321,0,0
5849,zyMkbavgHASQtqVwaock9A,21,3.091042,3.809524,0.928388,0.761905,0.142857,,,11.709589,0.000000,0.238095,0.0,0.238095,1.351094,0,0
5850,zz-fcqurtm77bZ_rVvo2Lw,25,3.258097,4.480000,0.714143,0.880000,0.000000,3.0,1.386294,10.180822,0.000000,0.600000,0.0,0.440000,2.349051,6,0


In [126]:
features_df.to_csv('features.csv', index=False)