# Notebook that first makes an API call, then scrapes the web, then aggregates the scraped data and merges it with the API data to generate one final CSV file.

## Imports

In [59]:
import requests
import pandas as pd
import csv
import time
import random
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

## API data retrieval

In [60]:
# Define API URL
# Records-search endpoint on ressources.data.sncf.com; it is queried with `params` below.
url = "https://ressources.data.sncf.com/api/records/1.0/search/"

# List of destinations and links
# Each tuple contains (Destination, Page1 Link, Page2 Link)
# Page2 Link is None for destinations that have no second review page to scrape.
destinations_and_links = [
    ('Orléans', 'https://www.tripadvisor.fr/Attraction_Review-g187129-d9788284-Reviews-or10-Centre_ville-Orleans_Loiret_Centre_Val_de_Loire.html', None),
    ('Metz', 'https://www.tripadvisor.fr/ShowUserReviews-g187164-d2060561-r425068727-Gare_de_Metz_Ville-Metz_Moselle_Grand_Est.html', None),
    ('Strasbourg', 'https://www.tripadvisor.fr/ShowUserReviews-g187075-r287480889-Strasbourg_Bas_Rhin_Grand_Est.html#REVIEWS', 'https://www.tripadvisor.fr/ShowUserReviews-g187075-r86136484-Strasbourg_Bas_Rhin_Grand_Est.html#REVIEWS'),
    ('Annecy', 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r111704080-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html#REVIEWS', 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r86606692-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html')
]

# Extract API data
# Query-string parameters sent with the request: the dataset to search, an empty
# full-text query, the maximum number of rows, and the fields of interest.
params = {
    "dataset": "emission-co2-perimetre-complet",
    "q": "",
    "rows": 1000,
    "select": "origine, destination, distance_entre_les_gares, train_empreinte_carbone_kgco2e"
}

# Query the API and keep only the records that leave from a Paris station and
# arrive at one of the destinations listed in `destinations_and_links`.
results = []
# The timeout stops the cell from hanging forever if the API never answers.
response = requests.get(url, params=params, timeout=30)
if response.status_code == 200:
    data = response.json()
    # destination name -> (page1_link, page2_link)
    city_to_link = {city: (link1, link2) for city, link1, link2 in destinations_and_links}
    if "records" in data and data["records"]:
        for record in data["records"]:
            fields = record.get("fields", {})
            origine = fields.get("origine", "")
            destination = fields.get("destination", "")
            if 'Paris' in origine and destination in city_to_link:
                page1_link, page2_link = city_to_link[destination]
                results.append({
                    "origine": origine,
                    "destination": destination,
                    "page1_link": page1_link,
                    "page2_link": page2_link,
                    "distance": fields.get("distance_entre_les_gares", 0),
                    "train_emissions": fields.get("train_empreinte_carbone_kgco2e", None)
                })
    if not results:
        # Later cells index into the columns built from `results`; make an
        # empty result visible here instead of failing there with a KeyError.
        print("Warning: the API response contained no matching Paris departures.")
else:
    print(f"Error: {response.status_code} - {response.text}")

In [61]:
# Show the filtered API rows to check the request and the Paris/destination filter.
print(results)

[{'origine': 'Paris Gare de Lyon', 'destination': 'Annecy', 'page1_link': 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r111704080-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html#REVIEWS', 'page2_link': 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r86606692-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html', 'distance': 545.0, 'train_emissions': 1.5805}, {'origine': 'Paris Est', 'destination': 'Metz', 'page1_link': 'https://www.tripadvisor.fr/ShowUserReviews-g187164-d2060561-r425068727-Gare_de_Metz_Ville-Metz_Moselle_Grand_Est.html', 'page2_link': None, 'distance': 352.0, 'train_emissions': 1.0208}, {'origine': 'Paris Austerlitz', 'destination': 'Orléans', 'page1_link': 'https://www.tripadvisor.fr/Attraction_Review-g187129-d9788284-Reviews-or10-Centre_ville-Orleans_Loiret_Centre_Val_de_Loire.html', 'page2_link': None, 'distance': 121.0, 'train_emissions': 2.9523999999999995}, {'origine': 'Paris Est', 'destination': 'Strasbourg', 'page1_link': 'https://www.tripadvisor.fr/ShowUser

## Web scraping reviews

In [62]:
# Collect Reviews
# Start a Chrome session through Selenium, using a locally installed ChromeDriver binary.
# NOTE(review): the scraping loop in this cell fetches pages with `requests`, and
# nothing visible in this notebook uses or quits `driver` — confirm whether this
# browser session is still needed.
driver_path = 'C:/ChromeDriver/chromedriver-win64/chromedriver.exe'
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service)

# Function to convert rating bubbles
def convert_bubble_to_rating(bubble_class):
    """Convert a rating-bubble CSS class into an integer rating.

    The class is expected to embed the score as ``bubble_<digits>``, where the
    digits are ten times the rating (``'bubble_50'`` -> ``5``). The result is
    floored, so ``'bubble_45'`` gives ``4``.

    Args:
        bubble_class: CSS class name to parse; may be ``None`` or empty.

    Returns:
        The integer rating, or ``None`` when no ``bubble_<digits>`` token is
        present in ``bubble_class``.
    """
    # Match the digits explicitly rather than splitting on '_': a class such as
    # 'ui_bubble_rating' contains 'bubble_' but has no numeric part to convert.
    match = re.search(r'bubble_(\d+)', bubble_class or '')
    if match is None:
        return None
    return int(match.group(1)) // 10

# Fetch every review page linked to each API record and collect one dict per
# review, carrying the record's trip fields alongside the scraped fields.
all_reviews = []

# Browser-like headers; built once because they are identical for every request.
scrape_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
    'Accept-Language': 'fr-FR,fr;q=0.9',
    'Referer': 'https://www.google.com',
    'Connection': 'keep-alive'
}

for record in results:
    origine = record['origine']
    destination = record['destination']
    for page_link in ['page1_link', 'page2_link']:
        # Distinct names so the notebook-level `url` (API endpoint) and
        # `response` (API response) are not overwritten by the scraper.
        page_url = record[page_link]
        if not page_url:
            continue
        # Random pause between requests to avoid hitting the site in bursts.
        time.sleep(random.uniform(3, 7))
        try:
            page_response = requests.get(page_url, headers=scrape_headers, timeout=30)
        except requests.RequestException as exc:
            # A network error on one page should not abort the whole run.
            print(f"Failed to fetch page: {page_url} ({exc})")
            continue
        if page_response.status_code != 200:
            print(f"Failed to fetch page: {page_url}")
            continue

        soup = BeautifulSoup(page_response.text, 'html.parser')
        for block in soup.find_all('div', id=re.compile(r'review_\d+')):
            title_element = block.find('div', class_='quote')
            content_element = block.find('div', class_='entry')
            rating_element = block.find('span', class_='ui_bubble_rating')

            rating = None
            if rating_element:
                # Pick the class token that carries the score instead of
                # relying on it being the second class of the element.
                bubble_class = next(
                    (name for name in rating_element.get('class', []) if name.startswith('bubble_')),
                    None,
                )
                if bubble_class:
                    rating = convert_bubble_to_rating(bubble_class)

            all_reviews.append({
                'origine': origine,
                'destination': destination,
                'page1_link': record['page1_link'],
                'page2_link': record['page2_link'],
                'distance': record['distance'],
                'train_emissions': record['train_emissions'],
                'scraped_url': page_url,
                'title': title_element.get_text(strip=True) if title_element else None,
                'review': content_element.get_text(strip=True) if content_element else None,
                'rating': rating
            })

In [63]:
# Show the scraped review dicts to check that titles, texts and ratings were extracted.
print(all_reviews)

[{'origine': 'Paris Gare de Lyon', 'destination': 'Annecy', 'page1_link': 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r111704080-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html#REVIEWS', 'page2_link': 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r86606692-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html', 'distance': 545.0, 'train_emissions': 1.5805, 'scraped_url': 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r111704080-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html#REVIEWS', 'title': '“Annecyyyy...!!! Quand tu nous tiens..!!!”', 'review': "Vtt sur le Semnoz, pédalo sur le lac, promenade au bord de l'eau (superbe), l'ambiance de la vielle ville, pti resto super sympa... vraiment vraiment bien.... I love Annecy .... :-)", 'rating': 5}, {'origine': 'Paris Gare de Lyon', 'destination': 'Annecy', 'page1_link': 'https://www.tripadvisor.fr/ShowUserReviews-g187260-r111704080-Annecy_Haute_Savoie_Auvergne_Rhone_Alpes.html#REVIEWS', 'page2_link': 'https://www.tripadvisor.fr/ShowU

## CSV generation

### Non-aggregated CSV

In [64]:
# Save non-aggregated reviews to CSV
# One row per scraped review; the trip fields are repeated on every row.
reviews_df = pd.DataFrame(all_reviews)
non_aggregated_csv = 'non_aggregated_emissions_and_reviews.csv'
reviews_df.to_csv(path_or_buf=non_aggregated_csv, encoding='utf-8', index=False)

### Aggregated CSV

In [65]:
# Aggregate Reviews
def _join_non_null(values):
    """Join the non-null entries of a Series into one ' || '-separated string."""
    return ' || '.join(values.dropna().astype(str))


# Collapse the reviews to one row per destination: first scraped URL, all
# titles and review texts concatenated, and the mean rating.
aggregated_reviews = (
    reviews_df
    .groupby('destination')
    .agg(
        scraped_url=('scraped_url', 'first'),
        titles=('title', _join_non_null),
        reviews=('review', _join_non_null),
        average_rating=('rating', 'mean'),
    )
    .reset_index()
)

# Merge with Emissions Data
# Left join so every API row is kept even when no review was scraped for it.
emissions_df = pd.DataFrame(results)
merged_df = emissions_df.merge(aggregated_reviews, on='destination', how='left')

# Final Aggregated CSV
# The merged frame is written twice, under two different file names.
# NOTE(review): confirm that both files are still needed.
for output_csv in ('final_emissions_reviews.csv', 'aggregated_emissions_and_reviews.csv'):
    merged_df.to_csv(output_csv, index=False, encoding='utf-8')