# Notebook that first calls the SNCF open-data API, then scrapes TripAdvisor review pages, then aggregates the scraped reviews and merges them with the API data to generate one final CSV file.

# In [None]:
import requests
import pandas as pd
import csv
import time
import random
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Endpoint of the records-search API queried for the emissions data.
url = "https://ressources.data.sncf.com/api/records/1.0/search/"

# Destinations together with the review pages to scrape for each of them.
# Every entry is a (destination, first page link, second page link) tuple;
# the second link is None when only one page is listed for that destination.
destinations_and_links = [
    (
        'Orléans',
        'https://www.tripadvisor.fr/Attraction_Review-g187129-d9788284-Reviews-or10-Centre_ville-Orleans_Loiret_Centre_Val_de_Loire.html',
        None,
    ),
    (
        'Metz',
        'https://www.tripadvisor.fr/ShowUserReviews-g187164-d2060561-r425068727-Gare_de_Metz_Ville-Metz_Moselle_Grand_Est.html',
        None,
    ),
    (
        'Strasbourg',
        'https://www.tripadvisor.fr/ShowUserReviews-g187075-r287480889-Strasbourg_Bas_Rhin_Grand_Est.html#REVIEWS',
        'https://www.tripadvisor.fr/ShowUserReviews-g187075-r86136484-Strasbourg_Bas_Rhin_Grand_Est.html#REVIEWS',
    ),
    (
        'Annecy',
        'https://www.tripadvisor.fr/ShowUserReviews-g187260-r111704080-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html#REVIEWS',
        'https://www.tripadvisor.fr/ShowUserReviews-g187260-r86606692-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html',
    ),
]

# Extract API data
# Query the "emission-co2-perimetre-complet" dataset and keep only the records
# whose origin mentions Paris and whose destination is one of the cities listed
# in destinations_and_links. Each kept record becomes one dict in `results`.
params = {
    "dataset": "emission-co2-perimetre-complet",
    "q": "",
    "rows": 1000,
    "select": "origine, destination, transporteur, distance_entre_les_gares, "
              "train_empreinte_carbone_kgco2e, autocar_longue_distance_empreinte_carbone_kgco2e, avion_empreinte_carbone_kgco2e, "
              "voiture_electrique_2_2_pers_empreinte_carbone_kgco2e, voiture_thermique_2_2_pers_empreinte_carbone_kgco2e",
}

results = []
# Destination city -> (page 1 link, page 2 link); built unconditionally so the
# name exists even when the API call fails.
city_to_link = {city: (link1, link2) for city, link1, link2 in destinations_and_links}

try:
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, params=params, timeout=30)
except requests.RequestException as exc:
    # Report the failure and carry on with an empty result list instead of
    # aborting the whole cell with a traceback.
    response = None
    print(f"Error: API request failed - {exc}")

if response is not None and response.status_code == 200:
    try:
        data = response.json()
    except ValueError as exc:
        # The body was not valid JSON (e.g. an HTML error page served with 200).
        data = {}
        print(f"Error: could not decode API response - {exc}")

    records = data.get("records") if isinstance(data, dict) else None
    for record in records or []:
        fields = record.get("fields", {})
        # `or ""` also covers fields that are present but null in the payload.
        origine = fields.get("origine") or ""
        destination = fields.get("destination") or ""
        if 'Paris' not in origine or destination not in city_to_link:
            continue
        page1_link, page2_link = city_to_link[destination]
        results.append({
            "origine": origine,
            "destination": destination,
            "distance": fields.get("distance_entre_les_gares", 0),
            "train_emissions": fields.get("train_empreinte_carbone_kgco2e", None),
            "bus_emissions": fields.get("autocar_longue_distance_empreinte_carbone_kgco2e", None),
            "plane_emissions": fields.get("avion_empreinte_carbone_kgco2e", None),
            "electric_car_emissions": fields.get("voiture_electrique_2_2_pers_empreinte_carbone_kgco2e", None),
            "thermal_car_emissions": fields.get("voiture_thermique_2_2_pers_empreinte_carbone_kgco2e", None),
            "page1_link": page1_link,
            "page2_link": page2_link
        })
elif response is not None:
    print(f"Error: {response.status_code} - {response.text}")

# Collect Reviews
# Start a Chrome WebDriver session using a chromedriver executable at a
# hard-coded local Windows path; adjust driver_path for other machines.
# NOTE(review): in the code visible here the review pages are downloaded with
# requests, and `driver` is neither used nor closed with driver.quit().
# Confirm whether another cell depends on it; if not, the browser session is
# left open for the lifetime of the kernel.
driver_path = 'C:/ChromeDriver/chromedriver-win64/chromedriver.exe'
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

# Function to convert rating bubbles
def convert_bubble_to_rating(bubble_class):
    """Convert a rating-bubble CSS class name into a whole-number rating.

    The numeric suffix of the class is the rating multiplied by ten, so
    ``'bubble_50'`` maps to ``5`` and ``'bubble_45'`` maps to ``4`` (the
    result is floored to an integer).

    Args:
        bubble_class: CSS class name such as ``'bubble_40'``.

    Returns:
        The integer rating, or ``None`` when ``bubble_class`` is not a string
        or contains no ``bubble_<digits>`` token (for example the generic
        ``'ui_bubble_rating'`` class).
    """
    if not isinstance(bubble_class, str):
        return None
    # Require digits right after "bubble_": a plain substring test would also
    # accept "ui_bubble_rating" and then fail while converting "bubble"/"rating"
    # to an integer.
    match = re.search(r'bubble_(\d+)', bubble_class)
    if match is None:
        return None
    return int(match.group(1)) // 10

# Download every linked review page and collect one entry per review block
# (origin, destination, review text, rating) into all_reviews.
all_reviews = []
request_headers = {
    'User-Agent': 'Mozilla/5.0',
    'Accept-Language': 'fr-FR,fr;q=0.9'
}
# Compiled once instead of on every page.
review_id_pattern = re.compile(r'review_\d+')
# Parsed (content, rating) pairs per URL, so a page shared by several emission
# records is downloaded (and waited for) only once.
scraped_pages = {}

for record in results:
    origine = record['origine']
    destination = record['destination']
    for page_link in ['page1_link', 'page2_link']:
        url = record[page_link]
        if not url:
            continue

        if url not in scraped_pages:
            # Random pause between requests to avoid hammering the site.
            time.sleep(random.uniform(3, 7))
            try:
                # The timeout keeps one stalled page from blocking the whole run.
                response = requests.get(url, headers=request_headers, timeout=30)
            except requests.RequestException as exc:
                print(f"Failed to fetch page: {url} ({exc})")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch page: {url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            page_reviews = []
            for block in soup.find_all('div', id=review_id_pattern):
                entry = block.find('div', class_='entry')
                content = entry.get_text(strip=True) if entry is not None else None

                rating = None
                rating_element = block.find('span', class_='ui_bubble_rating')
                if rating_element is not None:
                    # Pick the score class by its shape rather than by position,
                    # so a different class order or a missing score class
                    # cannot raise IndexError.
                    bubble_class = next(
                        (css_class for css_class in rating_element.get('class', [])
                         if re.fullmatch(r'bubble_\d+', css_class)),
                        None,
                    )
                    if bubble_class is not None:
                        rating = convert_bubble_to_rating(bubble_class)

                page_reviews.append((content, rating))
            scraped_pages[url] = page_reviews

        for content, rating in scraped_pages[url]:
            all_reviews.append({
                'Origine': origine,
                'Destination': destination,
                'Content': content,
                'Rating': rating
            })

# Aggregate Reviews
# One row per destination: all review texts joined with ' || ' and the mean
# rating. Explicit column lists keep the frames well-formed when nothing was
# scraped or the API returned no matching record; a column-less DataFrame would
# make groupby('Destination') and the merge below raise KeyError.
review_columns = ['Origine', 'Destination', 'Content', 'Rating']
reviews_df = pd.DataFrame(all_reviews, columns=review_columns)
if reviews_df.empty:
    print("Warning: no reviews were collected; review columns will be empty.")
    aggregated_reviews = pd.DataFrame(columns=['Destination', 'reviews', 'average_rating'])
else:
    # Coerce ratings to numbers so the mean is taken over a numeric column even
    # when every rating is missing (missing values become NaN).
    reviews_df['Rating'] = pd.to_numeric(reviews_df['Rating'], errors='coerce')
    aggregated_reviews = reviews_df.groupby('Destination').agg(
        reviews=('Content', lambda x: ' || '.join(x.dropna().astype(str))),
        average_rating=('Rating', 'mean')
    ).reset_index()

# Merge with Emissions Data
emission_columns = [
    'origine', 'destination', 'distance',
    'train_emissions', 'bus_emissions', 'plane_emissions',
    'electric_car_emissions', 'thermal_car_emissions',
    'page1_link', 'page2_link',
]
emissions_df = pd.DataFrame(results, columns=emission_columns)
if emissions_df.empty:
    print("Warning: no emissions records found; the CSV will contain only a header row.")
# Left join: every emissions record is kept, with empty review columns when no
# review was collected for its destination.
merged_df = pd.merge(emissions_df, aggregated_reviews, how='left', left_on='destination', right_on='Destination')
# 'Destination' duplicates 'destination' after the join.
merged_df.drop(columns=['Destination'], inplace=True)

# Final CSV
output_csv = 'final_emissions_reviews.csv'
merged_df.to_csv(output_csv, index=False, encoding='utf-8')

print(f"Final CSV saved as '{output_csv}'")