# Phase A: Exploration et Analyse des Données

## Objectifs
- Charger les datasets (Business, User, Review)
- Nettoyer et fusionner les données
- Analyser les distributions (Catégories, Notes, etc.)
- Analyser les corrélations (nombre d'avis vs note moyenne, longueur des avis vs note)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Notebook configuration ---
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Root folder of the raw datasets (relative path). Each file sits inside a
# folder named after itself: <DATA_DIR>/<filename>/<filename>.
DATA_DIR = "../Data"

In [None]:
# Load the three raw datasets (JSON Lines files: one JSON object per line).
def _read_nested_jsonl(filename):
    """Read the JSON Lines file stored at DATA_DIR/<filename>/<filename>."""
    nested_path = os.path.join(DATA_DIR, filename, filename)
    return pd.read_json(nested_path, lines=True)


print("Loading Business data...")
business_df = _read_nested_jsonl("yelp_academic_dataset_business.json")
print(f"Business shape: {business_df.shape}")

print("Loading User data...")
user_df = _read_nested_jsonl("yelp_academic_dataset_user4students.jsonl")
print(f"User shape: {user_df.shape}")

print("Loading Review data (this might take a while)...")
review_df = _read_nested_jsonl("yelp_academic_reviews4students.jsonl")
print(f"Review shape: {review_df.shape}")

In [None]:
# Preview the first three rows of the business table.
business_df.head(n=3)

In [None]:
# Preview the first three rows of the review table.
review_df.head(n=3)

## 1. Fusion des données

In [None]:
# Attach the business columns (categories, etc.) to every review, then the
# reviewer's profile. Both are left joins, so every review row is kept even
# when no matching business or user exists.
# Overlapping column names get "_review"/"_biz" in the first join; in the
# second join only the user-side duplicates are renamed (suffix "_user").
df = (
    review_df
    .merge(business_df, on="business_id", how="left", suffixes=("_review", "_biz"))
    .merge(user_df, on="user_id", how="left", suffixes=("", "_user"))
)

print(f"Fused dataset shape: {df.shape}")
df.head(n=2)

## 2. Analyse des catégories de Business

In [None]:
# Categories are stored as a single "Cat1, Cat2, ..." string: split it into
# a list, then explode to one row per (business, category) pair.
category_lists = business_df['categories'].str.split(', ')
categories_expanded = business_df.assign(categories=category_lists).explode('categories')
category_counts = categories_expanded['categories'].value_counts()
top_categories = category_counts.head(20)

# Horizontal bar chart of the 20 most frequent categories.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_categories.values,
    y=top_categories.index,
    palette="viridis",
)
plt.title("Top 20 Business Categories")
plt.xlabel("Number of Businesses")
plt.tight_layout()
plt.savefig("../references/fig_categories.png", dpi=150)
plt.show()

## 3. Corrélation : Nombre d'avis vs Note Moyenne

In [None]:
# One point per business: review_count (log-scaled x axis) against stars.
# The partial transparency keeps overlapping points readable.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=business_df,
    x="review_count",
    y="stars",
    alpha=0.5,
)
plt.xscale('log')
plt.title("Business Review Count vs Stars")
plt.tight_layout()
plt.savefig("../references/fig_reviews_vs_stars.png", dpi=150)
plt.show()

## 4. Analyse de la sévérité des "Big Reviewers"

In [None]:
# Per-user statistics computed from the loaded reviews: mean of the stars
# given and number of reviews written.
user_avg_rating = review_df.groupby('user_id')['stars'].agg(['mean', 'count']).reset_index()
user_avg_rating.columns = ['user_id', 'avg_stars_given', 'review_count']

# Big reviewers = users at or above the 90th percentile of review counts.
# NOTE: because of ">=", many ties at the threshold can flag more than 10% of
# the users (in the extreme case everyone, when the 90th percentile equals
# the minimum count).
threshold = user_avg_rating['review_count'].quantile(0.90)
user_avg_rating['is_big_reviewer'] = user_avg_rating['review_count'] >= threshold

# Mean of the per-user averages for each group (False = regular, True = big).
comparison = user_avg_rating.groupby('is_big_reviewer')['avg_stars_given'].mean()
print("Average rating given by reviewers:")
print(comparison)

plt.figure(figsize=(8, 5))
# Pin the category order so that position 0 is always False (regular) and
# position 1 is always True (big). The hard-coded tick labels below rely on
# this mapping, which would otherwise shift if one of the groups were empty.
sns.boxplot(
    data=user_avg_rating,
    x='is_big_reviewer',
    y='avg_stars_given',
    order=[False, True],
)
plt.xticks([0, 1], ['Regular Reviewers', 'Big Reviewers (Top 10%)'])
plt.title("Are Big Reviewers More Severe?")
plt.ylabel("Average Stars Given")
plt.tight_layout()
plt.savefig("../references/fig_big_reviewers.png", dpi=150)
plt.show()

## 5. Longueur des reviews vs Note

In [None]:
# Number of characters in each review text.
review_df['text_length'] = review_df['text'].str.len()

# Mean text length for each star rating.
length_by_stars = review_df['text_length'].groupby(review_df['stars']).mean()
print("Average review length by stars:")
print(length_by_stars)

plt.figure(figsize=(8, 5))
sns.barplot(
    x=length_by_stars.index,
    y=length_by_stars.values,
    palette="coolwarm",
)
plt.title("Average Review Length by Star Rating")
plt.xlabel("Stars")
plt.ylabel("Average Text Length (chars)")
plt.tight_layout()
plt.savefig("../references/fig_length_vs_stars.png", dpi=150)
plt.show()

## 6. Vocabulaire TF-IDF : Positifs vs Négatifs

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap each group at `sample_size` texts to keep the TF-IDF step fast; the
# fixed random_state makes the draw reproducible.
sample_size = 10000
positive_texts = review_df.loc[review_df['stars'] >= 4, 'text']
negative_texts = review_df.loc[review_df['stars'] <= 2, 'text']
positive_reviews = positive_texts.sample(min(sample_size, len(positive_texts)), random_state=42)
negative_reviews = negative_texts.sample(min(sample_size, len(negative_texts)), random_state=42)

def get_top_tfidf_words(texts, n=10):
    """Return the ``n`` terms with the highest mean TF-IDF weight in ``texts``.

    A TF-IDF vocabulary capped at 1000 features (English stop words removed)
    is fitted on ``texts``; each term's weight is then averaged over all
    documents.

    Args:
        texts: Iterable of raw text documents.
        n: Maximum number of terms to return. Values <= 0 yield an empty list.

    Returns:
        A list of ``(term, mean_tfidf)`` tuples sorted by decreasing mean
        weight, containing at most ``n`` entries.
    """
    if n <= 0:
        # Guard explicitly: a tail slice such as ``[-n:]`` with n == 0 is
        # ``[0:]``, which would select the whole vocabulary instead of nothing.
        return []

    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    # Column-wise mean over all documents; .A1 flattens the resulting
    # 1 x n_features matrix into a 1-D array.
    mean_tfidf = tfidf_matrix.mean(axis=0).A1
    # Indices ordered from highest to lowest mean weight, truncated to n.
    top_indices = mean_tfidf.argsort()[::-1][:n]
    return [(feature_names[i], mean_tfidf[i]) for i in top_indices]

def _print_word_scores(word_scores):
    """Print one indented ``word: score`` line (4 decimals) per pair."""
    for word, score in word_scores:
        print(f"  {word}: {score:.4f}")


# Highest mean TF-IDF terms for each polarity group.
print("Top 10 TF-IDF words in POSITIVE reviews:")
top_positive = get_top_tfidf_words(positive_reviews)
_print_word_scores(top_positive)

print("\nTop 10 TF-IDF words in NEGATIVE reviews:")
top_negative = get_top_tfidf_words(negative_reviews)
_print_word_scores(top_negative)

## 7. Préparation des labels pour la classification

In [None]:
# Créer les labels de polarité
def get_polarity(stars):
    """Map a star rating to a polarity label.

    Ratings strictly below 3 are 'negative' and ratings strictly above 3 are
    'positive'. Anything else (exactly 3, or a value such as NaN that
    compares neither above nor below 3) is 'neutral'.
    """
    if stars < 3:
        return 'negative'
    if stars > 3:
        return 'positive'
    return 'neutral'

# Label every review with its polarity and show the class balance.
review_df['polarity'] = review_df['stars'].apply(get_polarity)

print("Polarity distribution:")
print(review_df['polarity'].value_counts())

# Persist only the columns kept for Phase B.
# 'text_length' must already exist on review_df (it is added in the
# review-length section of this notebook).
export_columns = ['text', 'stars', 'polarity', 'text_length']
prepared_df = review_df[export_columns].copy()
prepared_df.to_parquet("../Data/prepared_reviews.parquet", index=False)
print("\nSaved prepared dataset to ../Data/prepared_reviews.parquet")