In [118]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [119]:
# Generar las URLs y escribirlas en un archivo de texto
with open('urls.txt', 'w') as file:
    for i in range(1):
        offset = i * 30
        # url = f'https://www.tripadvisor.es/Restaurants-g187512-oa{offset}-Haro_La_Rioja.html'
        url = f'https://www.tripadvisor.es/Restaurants-g1099799-oa0-Cenicero_La_Rioja.html'
        file.write(url + '\n')

In [120]:

# Función para obtener el contenido de una página
def get_page_contents(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        'Accept-Language': 'es-ES, es;q=0.5'
    }
    page = requests.get(url, headers=headers)
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    else:
        print(f"Error al cargar la página {url}: {page.status_code}")
        return None

# Función para extraer los datos y devolver un DataFrame
def get_restaurants_data(url):
    soup = get_page_contents(url)
    if not soup:
        return None
    
    names = [name.text.strip() for name in soup.find_all('div', {'class': 'biGQs _P fiohW alXOW oCpZu GzNcM nvOhm UTQMg ZTpaU mtnKn ngXxk'})]
    num_reviews = [review.text.strip() for review in soup.find_all('span', {'class': 'yyzcQ'})]
    ratings = [rating['aria-label'].split(' ')[0].replace(',', '.') 
               for rating in soup.find_all('div', {'class': 'jVDab W f u w JqMhy'}) 
               if rating.has_attr('aria-label')]

   
    span_elements = soup.find_all('span', {'class': 'biGQs _P pZUbB hmDzD'})
    cuisine_type = [span.text.strip() for span in span_elements if "€" not in span.text.strip() and "Carta" not in span.text.strip()]
    prices = [span.text.strip() for span in span_elements if "€" in span.text.strip()]
    
    # Alinear los datos en una lista de diccionarios
    restaurants_list = []
    for i in range(len(names)):
        restaurant_data = {
            'Restaurant/Tapas Bar': names[i] if i < len(names) else None,
            'Rating': ratings[i] if i < len(ratings) else None,
            'Number of Reviews': num_reviews[i] if i < len(num_reviews) else None,
            'Cuisine Type': cuisine_type[i] if i < len(cuisine_type) else None,
            'Price Range': prices[i] if i < len(prices) else None
        }
        restaurants_list.append(restaurant_data)
    
    # Convertir a DataFrame y retornar
    return pd.DataFrame(restaurants_list)

# Leer las URLs desde el archivo y realizar el scraping con pausas
all_restaurants_data = []
with open('urls.txt', 'r') as file:
    urls = file.readlines()
    
    for url in urls:
        url = url.strip()  # Quitar espacios en blanco o saltos de línea
        print(f"Scraping URL: {url}")
        
        # Realizar el scraping para cada página
        data = get_restaurants_data(url)
        if data is not None:
            all_restaurants_data.append(data)
        
        # Pausa aleatoria para evitar ser detectado
        time.sleep(random.uniform(3, 7))  # Pausa de entre 3 y 7 segundos

# Concatenar todos los DataFrames en uno solo
final_df = pd.concat(all_restaurants_data, ignore_index=True)
final_df.head()

Scraping URL: https://www.tripadvisor.es/Restaurants-g1099799-oa0-Cenicero_La_Rioja.html


Unnamed: 0,Restaurant/Tapas Bar,Rating,Number of Reviews,Cuisine Type,Price Range
0,1. Restaurante Olano,4.0,132,Restaurantes en Cenicero,€
1,2. La Vermuteria,4.5,26,"Mediterránea, Española",€
2,3. Bar cafetería San Fernando,4.0,16,"Bar, Marisco",€€ - €€€
3,4. Bodegon El Parque,3.5,14,Española,
4,5. El Mayo,0.0,0 opiniones,"Mediterránea, Española",


In [121]:
# final_df = final_df.drop(0).reset_index(drop=True)

In [122]:
final_df.head()

Unnamed: 0,Restaurant/Tapas Bar,Rating,Number of Reviews,Cuisine Type,Price Range
0,1. Restaurante Olano,4.0,132,Restaurantes en Cenicero,€
1,2. La Vermuteria,4.5,26,"Mediterránea, Española",€
2,3. Bar cafetería San Fernando,4.0,16,"Bar, Marisco",€€ - €€€
3,4. Bodegon El Parque,3.5,14,Española,
4,5. El Mayo,0.0,0 opiniones,"Mediterránea, Española",


In [123]:
# Crear la columna 'Ranking Tripadvisor' extrayendo el número al inicio
final_df['Tripadvisor Ranking'] = final_df['Restaurant/Tapas Bar'].str.extract(r'^(\d+)').astype(float)

# Limpiar la columna 'Restaurant/Tapas Bar' eliminando el número y el punto
final_df['Restaurant/Tapas Bar'] = final_df['Restaurant/Tapas Bar'].str.replace(r'^\d+\.\s*', '', regex=True)

final_df.head()

Unnamed: 0,Restaurant/Tapas Bar,Rating,Number of Reviews,Cuisine Type,Price Range,Tripadvisor Ranking
0,Restaurante Olano,4.0,132,Restaurantes en Cenicero,€,1.0
1,La Vermuteria,4.5,26,"Mediterránea, Española",€,2.0
2,Bar cafetería San Fernando,4.0,16,"Bar, Marisco",€€ - €€€,3.0
3,Bodegon El Parque,3.5,14,Española,,4.0
4,El Mayo,0.0,0 opiniones,"Mediterránea, Española",,5.0


In [124]:
final_df.describe

<bound method NDFrame.describe of          Restaurant/Tapas Bar Rating Number of Reviews  \
0           Restaurante Olano    4.0               132   
1               La Vermuteria    4.5                26   
2  Bar cafetería San Fernando    4.0                16   
3           Bodegon El Parque    3.5                14   
4                     El Mayo      0       0 opiniones   
5          Bar Que Pasada Bar      0       0 opiniones   
6                 Bar Liberty      0       0 opiniones   

               Cuisine Type Price Range  Tripadvisor Ranking  
0  Restaurantes en Cenicero           €                  1.0  
1    Mediterránea, Española           €                  2.0  
2              Bar, Marisco    €€ - €€€                  3.0  
3                  Española        None                  4.0  
4    Mediterránea, Española        None                  5.0  
5                                  None                  6.0  
6     Mediterránea, Europea        None                  7.0

In [125]:
final_df.to_csv('df.csv', index=False)
