In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import re

### Top Restaurants by City:

In [29]:
# Generate urls and save them
with open('urls.txt', 'w') as file:
    for i in range(20):
        offset = i * 30
        url = f'https://www.tripadvisor.es/Restaurants-g187512-oa{offset}-Haro_La_Rioja.html'
        file.write(url + '\n')

In [30]:
def get_page_contents(url):
    headers = {
        'User-Agent': random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/87.0"
        ]),
        'Accept-Language': 'es-ES, es;q=0.5'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        print(f"Error al cargar la página {url}: {response.status_code}")
        return None

In [31]:

def get_restaurants_data(url):
    soup = get_page_contents(url)
    if not soup:
        return None
    
    names = [re.sub(r'^\d+\.\s*', '', name.text.strip()) for name in soup.find_all('h3', {'class': 'biGQs _P fiohW ngXxk'})]
    num_reviews = [review.text.strip() for review in soup.find_all('span', {'class': 'yyzcQ'})]
    ratings = [rating['aria-label'].split(' ')[0].replace(',', '.') 
               for rating in soup.find_all('div', {'class': 'jVDab W f u w JqMhy'}) 
               if rating.has_attr('aria-label')]

   
    span_elements = soup.find_all('span', {'class': 'biGQs _P pZUbB hmDzD'})
    cuisine_type = [span.text.strip() for span in span_elements if "€" not in span.text.strip() and "Carta" not in span.text.strip()]
    prices = [span.text.strip() for span in span_elements if "€" in span.text.strip()]
    
    # allign dictonary
    restaurants_list = []
    for i in range(len(names)):
        restaurant_data = {
            'Restaurant/Tapas Bar': names[i] if i < len(names) else None,
            'Rating': ratings[i] if i < len(ratings) else None,
            'Number of Reviews': num_reviews[i] if i < len(num_reviews) else None,
            'Cuisine Type': cuisine_type[i] if i < len(cuisine_type) else None,
            'Price Range': prices[i] if i < len(prices) else None
        }
        restaurants_list.append(restaurant_data)
    
    
    return pd.DataFrame(restaurants_list)

# read saved urls and get data
all_restaurants_data = []
with open('urls.txt', 'r') as file:
    urls = file.readlines()
    
    for url in urls:
        url = url.strip()  # remove spaces line jumps
        print(f"Scraping URL: {url}")
        
        # scrap each page
        data = get_restaurants_data(url)
        if data is not None:
            all_restaurants_data.append(data)
        
        
        time.sleep(random.uniform(3, 7))  # Pausa de entre 3 y 7 segundos

# combine all  DataFrames
final_df = pd.concat(all_restaurants_data, ignore_index=True)
final_df.head()

Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa0-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa30-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa60-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa90-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa120-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa150-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa180-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa210-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa240-Haro_La_Rioja.html
Scraping URL: https://www.tripadvisor.es/Restaurants-g187512-oa270-Haro_La_Rioja.html


KeyboardInterrupt: 

In [None]:
# drop first row of first page (unwanted prmoted restaurant)
# final_df = final_df.drop(0).reset_index(drop=True)

In [None]:
# final_df.head()

In [None]:
# final_df.to_csv('df.csv', index=False)

### Top Things to Do (top activities) in La Rioja:

In [None]:
# create folder for urls
url_folder = 'tripadvisor_act_urls'
os.makedirs(url_folder, exist_ok=True)

for i in range(19):  # number of pages
    offset = i * 30  # offset range
    url_act = f'https://www.tripadvisor.com/Attractions-g187511-Activities-oa{offset}-La_Rioja.html'
    file_path = os.path.join(url_folder, f'urls_page_{i+1}.txt')
    with open(file_path, 'w') as file:
        file.write(url_act + '\n')

In [32]:
# get data and return DataFrame
def get_activity_data(url):
    soup = get_page_contents(url)
    if not soup:
        return None
    
    names = [re.sub(r'^\d+\.\s*', '', name.text.strip()) for name in soup.find_all('h3', {'class': 'biGQs _P fiohW ngXxk'})]
    num_reviews = [review.text.strip() for review in soup.find_all('div', {'class': 'jVDab W f u w JqMhy'})]
    num_reviews_cleaned = [
    re.search(r'bubbles(\d[,\d]*)', review).group(1)  # grab numbers that follow word  "bubbles"
    for review in num_reviews if 'bubbles' in review  
    ]
    ratings = [rating.text.strip()[:3] for rating in soup.find_all('div', {'class': 'jVDab W f u w JqMhy'})]
    categories = [categorie.text.strip() for categorie in soup.find_all('span', {'class': 'biGQs _P pZUbB avBIb hmDzD'})]
    locations = soup.find_all('a', href=lambda href: href and '/Attraction_Review' in href)
    location_clean= [
    re.search(r'-([A-Za-z_]+)_La_Rioja', link['href']).group(1)
    for link in locations
    if '#REVIEWS' not in link['href'] and re.search(r'-([A-Za-z_]+)_La_Rioja', link['href'])
    ]
    location_unique = list(set(location_clean))
    
    
# allign dictonary
    activity_list = []
    for i in range(len(names)):
        activity_data = {
            'actividad': names[i] if i < len(names) else None,
            'num_resenas': num_reviews_cleaned [i] if i < len(num_reviews_cleaned ) else None,
            'valoracion': ratings[i] if i < len(ratings) else None,
            'categoria': categories[i] if i < len(categories) else None,
            'ubicacion': location_unique[i] if i < len(location_unique) else None
        }
        activity_list.append(activity_data)
    
    return pd.DataFrame(activity_list)

# read saved urls and get data
all_activities_data = []
for file_name in os.listdir(url_folder):
    file_path = os.path.join(url_folder, file_name)
    with open(file_path, 'r') as file:
        url = file.readline().strip()  # read url file
        print(f"Scraping URL: {url}")
        data = get_activity_data(url)
        if data is not None:
            all_activities_data.append(data)
        time.sleep(random.uniform(3, 7))

# combine all  DataFrames
final_actdf = pd.concat(all_activities_data, ignore_index=True)

Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa150-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa180-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa510-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa120-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa90-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa540-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa0-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa60-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa30-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa330-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa

In [39]:
final_actdf.head(2)


Unnamed: 0,actividad,num_resenas,valoracion,categoria,ubicacion
0,Sala Amos Salvador,6,4.5,Art Museums,Torrecilla_en_Cameros
1,Oficina de Turismo de Arnedo,12,4.5,Visitor Centers,Viguera


In [53]:
final_actdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570 entries, 0 to 569
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   actividad    570 non-null    object 
 1   num_resenas  570 non-null    int64  
 2   valoracion   420 non-null    float64
 3   categoria    300 non-null    object 
 4   ubicacion    308 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 22.4+ KB


In [None]:
final_actdf['valoracion'] = final_actdf['valoracion'].astype(float)
final_actdf['valoracion']

In [52]:
final_actdf['num_resenas'] = final_actdf['num_resenas'].fillna('0')  
final_actdf['num_resenas'] = final_actdf['num_resenas'].str.replace(',', '').astype(int) 

In [56]:
import numpy as np
final_actdf['num_resenas'] = final_actdf['num_resenas'].replace('0', np.nan)

In [59]:
final_actdf.isnull().count()

actividad      570
num_resenas    570
valoracion     570
categoria      570
ubicacion      570
dtype: int64

In [69]:
final_actdf['ubicacion'].unique()

array(['Torrecilla en Cameros', 'Viguera', 'San Asensio', 'Navarrete',
       'Enciso', 'Arnedillo', 'Cenicero', 'Mansilla de la Sierra', 'Haro',
       'Cuzcurrita de Rio Tiron', 'Murillo del Rio Leza', 'Logrono',
       'El Villar', 'Briones', 'Ezcaray', 'Canas', 'Sojuela', 'Arnedo',
       'Ribabellosa', 'Aldeanueva de Ebro', None, 'Najera',
       'Albelda de Iregua', 'Sajazarra', 'Ollauri', 'Calahorra',
       'San Vicente de la Sonsierra', 'Fuenmayor',
       'Santo Domingo de la Calzada', 'Brinas', 'Alfaro', 'Pradejon',
       'Bambu', 'Berceo', 'Lanciego', 'Banos de Rio Tobia', 'Lumbreras',
       'Clavijo', 'Badaran', 'Alcanadre', 'Autol', 'Ventosa',
       'Cervera del Rio Alhama', 'Navajun', 'Gimileo', 'Torremontalbo',
       'Oficina de Turismo de', 'Tricio', 'Matute', 'Cihuri',
       'Aguilar del Rio Alhama', 'Galilea', 'Pradillo', 'Rodezno',
       'Enjoy the most beautiful places in', 'Hornos de Moncalvillo',
       'Villamediana de Iregua', 'Nalda', 'Fonzaleche', 'Casa

In [68]:
final_actdf['ubicacion'] = final_actdf['ubicacion'].str.replace('_', ' ')

In [None]:
# final_actdf.to_csv('top_activities_trip.csv', index=False)