In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import re

### Top Restaurants by City:

In [29]:
# Generate urls and save them
with open('urls.txt', 'w') as file:
    for i in range(20):
        offset = i * 30
        url = f'https://www.tripadvisor.es/Restaurants-g187512-oa{offset}-Haro_La_Rioja.html'
        file.write(url + '\n')

In [3]:
def get_page_contents(url):
    headers = {
        'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/87.0",
        
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        print(f"Error al cargar la página {url}: {response.status_code}")
        return None

In [None]:

def get_restaurants_data(url):
    soup = get_page_contents(url)
    if not soup:
        return None
    
    names = [re.sub(r'^\d+\.\s*', '', name.text.strip()) for name in soup.find_all('h3', {'class': 'biGQs _P fiohW ngXxk'})]
    num_reviews = [review.text.strip() for review in soup.find_all('span', {'class': 'yyzcQ'})]
    ratings = [rating['aria-label'].split(' ')[0].replace(',', '.') 
               for rating in soup.find_all('div', {'class': 'jVDab W f u w JqMhy'}) 
               if rating.has_attr('aria-label')]

   
    span_elements = soup.find_all('span', {'class': 'biGQs _P pZUbB hmDzD'})
    cuisine_type = [span.text.strip() for span in span_elements if "€" not in span.text.strip() and "Carta" not in span.text.strip()]
    prices = [span.text.strip() for span in span_elements if "€" in span.text.strip()]
    
    # allign dictonary
    restaurants_list = []
    for i in range(len(names)):
        restaurant_data = {
            'Restaurant/Tapas Bar': names[i] if i < len(names) else None,
            'Rating': ratings[i] if i < len(ratings) else None,
            'Number of Reviews': num_reviews[i] if i < len(num_reviews) else None,
            'Cuisine Type': cuisine_type[i] if i < len(cuisine_type) else None,
            'Price Range': prices[i] if i < len(prices) else None
        }
        restaurants_list.append(restaurant_data)
    
    
    return pd.DataFrame(restaurants_list)

# read saved urls and get data
all_restaurants_data = []
with open('urls.txt', 'r') as file:
    urls = file.readlines()
    
    for url in urls:
        url = url.strip()  # remove spaces line jumps
        print(f"Scraping URL: {url}")
        
        # scrap each page
        data = get_restaurants_data(url)
        if data is not None:
            all_restaurants_data.append(data)
        
        
        time.sleep(random.uniform(3, 7))  # Pausa de entre 3 y 7 segundos

# combine all  DataFrames
final_df = pd.concat(all_restaurants_data, ignore_index=True)
final_df.head()

In [None]:
# drop first row of first page (unwanted prmoted restaurant)
# final_df = final_df.drop(0).reset_index(drop=True)

In [None]:
# final_df.head()

In [None]:
# final_df.to_csv('df.csv', index=False)

### Top Things to Do (top activities) in La Rioja:

In [4]:
# create folder for urls
url_folder = 'tripadvisor_act_urls'
os.makedirs(url_folder, exist_ok=True)

for i in range(19):  # number of pages
    offset = i * 30  # offset range
    url_act = f'https://www.tripadvisor.com/Attractions-g187511-Activities-oa{offset}-La_Rioja.html'
    file_path = os.path.join(url_folder, f'urls_page_{i+1}.txt')
    with open(file_path, 'w') as file:
        file.write(url_act + '\n')

In [6]:
# get data and return DataFrame
def get_activity_data(url):
    soup = get_page_contents(url)
    if not soup:
        return None
    
    names = [re.sub(r'^\d+\.\s*', '', name.text.strip()) for name in soup.find_all('h3', {'class': 'biGQs _P fiohW ngXxk'})]
    num_reviews = [review.text.strip() for review in soup.find_all('div', {'class': 'jVDab W f u w JqMhy'})]
    num_reviews_cleaned = [
    re.search(r'bubbles(\d[,\d]*)', review).group(1)  # grab numbers that follow word  "bubbles"
    for review in num_reviews if 'bubbles' in review  
    ]
    ratings = [rating.text.strip()[:3] for rating in soup.find_all('div', {'class': 'jVDab W f u w JqMhy'})]
    categories = [categorie.text.strip() for categorie in soup.find_all('span', {'class': 'biGQs _P pZUbB avBIb hmDzD'})]
    
    # ------------------------------locations (un poco mas dificil de obtener)------------------------------
    location_href = soup.find_all('a', {'class': 'BMQDV _F Gv wSSLS SwZTJ', 'href': True})
    href_list = []
    pattern = r'([A-Za-z_]+)_La_Rioja'
        # Extraer y filtrar los valores de 'href' que comienzan con "/Attraction"
    for location in location_href:
            href = location['href']
            if href.startswith('/Attraction'):  # Verifica si comienza con "/Attraction"
                href_list.append(href)
    href_list = href_list# habian enlaces duplicados. Uso set para eliminarlos antes de seguir
        # Extraer ubicaciones
    locations = []
    for href in href_list[ : -5]:
            match = re.search(pattern, href)
            if match:
                locations.append(match.group(1).replace('_', ' '))  # Reemplaza "_" en las ubicaciones por espacio 
        
    
    # allign dictonary
    activity_list = []
    for i in range(len(names)):
        activity_data = {
            'actividad': names[i] if i < len(names) else None,
            'num_resenas': num_reviews_cleaned [i] if i < len(num_reviews_cleaned ) else None,
            'valoracion': ratings[i] if i < len(ratings) else None,
            'categoria': categories[i] if i < len(categories) else None,
            'ubicacion': locations[i] if i < len(locations) else None
        }
        activity_list.append(activity_data)
    
    return pd.DataFrame(activity_list)

# read saved urls and get data
all_activities_data = []
for file_name in os.listdir(url_folder):
    file_path = os.path.join(url_folder, file_name)
    with open(file_path, 'r') as file:
        url = file.readline().strip()  # read url file
        print(f"Scraping URL: {url}")
        data = get_activity_data(url)
        if data is not None:
            all_activities_data.append(data)
        time.sleep(random.uniform(3, 7))

# combine all  DataFrames
final_actdf = pd.concat(all_activities_data, ignore_index=True)

Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa150-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa180-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa510-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa120-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa90-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa540-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa0-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa60-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa30-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa330-La_Rioja.html
Scraping URL: https://www.tripadvisor.com/Attractions-g187511-Activities-oa

In [8]:
final_actdf.head(8)


Unnamed: 0,actividad,num_resenas,valoracion,categoria,ubicacion
0,Paseo de la Florida,13,4.0,Parks,Logrono
1,Castillo de Clavijo,27,4.0,Castles,Clavijo
2,Sala Amos Salvador,6,4.5,Art Museums,Logrono
3,Finca Ribavellosa,9,5.0,Hiking Trails,Ribabellosa
4,Centro de la Cultura del Rioja,52,3.5,Art Museums,Logrono
5,Chorrones de Pena Puerta,6,4.5,Waterfalls,Viguera
6,Monumento al Sagrado Corazón de Jesús,11,4.5,Monuments & Statues,Torrecilla en Cameros
7,Ermita de Santa Barbara,5,4.0,Churches & Cathedrals,Ezcaray


In [14]:
final_actdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570 entries, 0 to 569
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   actividad    570 non-null    object 
 1   num_resenas  570 non-null    int64  
 2   valoracion   423 non-null    float64
 3   categoria    360 non-null    object 
 4   ubicacion    297 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 22.4+ KB


In [10]:
final_actdf['valoracion'] = final_actdf['valoracion'].astype(float)
final_actdf['valoracion']

0      4.0
1      4.0
2      4.5
3      5.0
4      3.5
      ... 
565    NaN
566    NaN
567    NaN
568    NaN
569    NaN
Name: valoracion, Length: 570, dtype: float64

In [11]:
final_actdf['num_resenas'] = final_actdf['num_resenas'].fillna('0')  
final_actdf['num_resenas'] = final_actdf['num_resenas'].str.replace(',', '').astype(int) 

In [12]:
import numpy as np
final_actdf['num_resenas'] = final_actdf['num_resenas'].replace('0', np.nan)

In [13]:
final_actdf.isnull().count()

actividad      570
num_resenas    570
valoracion     570
categoria      570
ubicacion      570
dtype: int64

In [15]:
final_actdf['categoria'].unique()

array(['Parks', 'Castles', 'Art Museums', 'Hiking Trails', 'Waterfalls',
       'Monuments & Statues', 'Churches & Cathedrals', 'Religious Sites',
       'Sports Complexes', 'Theaters', 'Wineries & Vineyards',
       'Escape Games', 'Points of Interest & Landmarks',
       'Factory Outlets', 'Bodies of Water', 'Fountains', 'Golf Courses',
       'Bridges', 'Visitor Centers', 'Government Buildings',
       'Architectural Buildings', 'Piers & Boardwalks',
       'Speciality & Gift Shops', 'Speciality Museums', 'Spas',
       'Ancient Ruins', 'Taxis & Shuttles', 'Lessons & Workshops',
       'Nature & Wildlife Tours', 'Climbing Tours', None, 'Wine Bars',
       'History Museums', 'Game & Entertainment Centers',
       'Caverns & Caves', 'Shopping Malls', 'Gardens', 'Science Museums',
       'Water Parks', 'Bars & Clubs', 'Gear Rentals', 'Sightseeing Tours',
       'Dance Clubs & Discos', 'Horseback Riding Tours',
       'Convention Centers', 'Wine Tours & Tastings', 'Department Stores',
 

In [16]:
final_actdf.to_csv('top_activities_trip3.csv', index=False)