### Limpieza URL

In [1]:
import requests as req
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from fuzzywuzzy import fuzz

import time
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../src')



Importo los csv que contienen las url.

In [2]:
url1 = pd.read_csv('./datos/href1.csv')
url2 = pd.read_csv('./datos/href2.csv')

In [3]:
url1.head()

Unnamed: 0,0
0,https://www.tripadvisor.es/Restaurant_Review-g...
1,https://www.tripadvisor.es/Restaurant_Review-g...
2,https://www.tripadvisor.es/Restaurant_Review-g...
3,https://www.tripadvisor.es/Restaurant_Review-g...
4,https://www.tripadvisor.es/Restaurant_Review-g...


In [4]:
url2.head()

Unnamed: 0,0
0,https://www.tripadvisor.es/Restaurant_Review-g...
1,https://www.tripadvisor.es/Restaurant_Review-g...
2,https://www.tripadvisor.es/Restaurant_Review-g...
3,https://www.tripadvisor.es/Restaurant_Review-g...
4,https://www.tripadvisor.es/Restaurant_Review-g...


In [5]:
url = pd.concat([url1, url2], ignore_index = True)

In [6]:
url.head()

Unnamed: 0,0
0,https://www.tripadvisor.es/Restaurant_Review-g...
1,https://www.tripadvisor.es/Restaurant_Review-g...
2,https://www.tripadvisor.es/Restaurant_Review-g...
3,https://www.tripadvisor.es/Restaurant_Review-g...
4,https://www.tripadvisor.es/Restaurant_Review-g...


In [7]:
num_duplicates = url.duplicated().sum()
num_duplicates

30

In [8]:
url = url.drop_duplicates()

In [9]:
len(url)

300

- Voy a intentar unir las URL con sus respectivos restaurantes. 
- Para ello importo el .csv limpio de restaurantes.

In [10]:
data_rest = pd.read_csv('./datos/data_rest.csv')

In [11]:
data_rest.head()

Unnamed: 0,restaurante,gastronomia,precio,opinion,reseña
0,Yakiniku Rikyu,Japonesa,Medio,267 opiniones,Patrocinado
1,marmitón,Mediterránea,Medio,89 opiniones,MICHELIN
2,Vinoteca Moratín,Internacional,Medio,1221 opiniones,MICHELIN
3,Pilar Akaneya,Japonesa,Alto,276 opiniones,MICHELIN
4,Gioia,Italiana,Medio,387 opiniones,MICHELIN


In [12]:
len(data_rest)

300

In [13]:
# Extraer el nombre del restaurante de la URL
url['restaurante'] = url['0'].str.extract(r'-Reviews-(.*)-')

url['restaurante'].head()

0                      Bardero
1     Pizzart_Villa_Fuencarral
2            Ornella_Velazquez
3    Lettera_Trattoria_Moderna
4      Pizzart_Villa_Canalejas
Name: restaurante, dtype: object

In [14]:
url.head()

Unnamed: 0,0,restaurante
0,https://www.tripadvisor.es/Restaurant_Review-g...,Bardero
1,https://www.tripadvisor.es/Restaurant_Review-g...,Pizzart_Villa_Fuencarral
2,https://www.tripadvisor.es/Restaurant_Review-g...,Ornella_Velazquez
3,https://www.tripadvisor.es/Restaurant_Review-g...,Lettera_Trattoria_Moderna
4,https://www.tripadvisor.es/Restaurant_Review-g...,Pizzart_Villa_Canalejas


In [15]:

# Crear una función para encontrar la mejor coincidencia

def find_best_match(restaurante, url_restaurantes):
    return max(url_restaurantes, key=lambda x: fuzz.ratio(restaurante, x))

# Aplicar la función para encontrar la mejor coincidencia para cada restaurante en data_rest
data_rest['best_match'] = data_rest['restaurante'].apply(find_best_match, url_restaurantes=url['restaurante'])

In [16]:
data_rest['best_match'].head()

0              Yakiniku_Rikyu
1                    Marmiton
2    Moratin_Vinoteca_Bistrot
3               Pilar_Akaneya
4                       Gioia
Name: best_match, dtype: object

In [17]:
len(data_rest['best_match'])

300

In [18]:
# Unir los DataFrames por la mejor coincidencia

merged_data = data_rest.merge(url, left_on='best_match', right_on='restaurante', how='left')

In [19]:
merged_data.head()

Unnamed: 0,restaurante_x,gastronomia,precio,opinion,reseña,best_match,0,restaurante_y
0,Yakiniku Rikyu,Japonesa,Medio,267 opiniones,Patrocinado,Yakiniku_Rikyu,https://www.tripadvisor.es/Restaurant_Review-g...,Yakiniku_Rikyu
1,marmitón,Mediterránea,Medio,89 opiniones,MICHELIN,Marmiton,https://www.tripadvisor.es/Restaurant_Review-g...,Marmiton
2,Vinoteca Moratín,Internacional,Medio,1221 opiniones,MICHELIN,Moratin_Vinoteca_Bistrot,https://www.tripadvisor.es/Restaurant_Review-g...,Moratin_Vinoteca_Bistrot
3,Pilar Akaneya,Japonesa,Alto,276 opiniones,MICHELIN,Pilar_Akaneya,https://www.tripadvisor.es/Restaurant_Review-g...,Pilar_Akaneya
4,Gioia,Italiana,Medio,387 opiniones,MICHELIN,Gioia,https://www.tripadvisor.es/Restaurant_Review-g...,Gioia


In [20]:

# Eliminar columnas innecesarias
merged_data.drop(['best_match', 'restaurante_y'], axis=1, inplace=True)
merged_data.rename(columns={'restaurante_x': 'restaurante'}, inplace=True)


In [21]:
merged_data.rename(columns={'0': 'url'}, inplace=True)

In [22]:
merged_data.head()

Unnamed: 0,restaurante,gastronomia,precio,opinion,reseña,url
0,Yakiniku Rikyu,Japonesa,Medio,267 opiniones,Patrocinado,https://www.tripadvisor.es/Restaurant_Review-g...
1,marmitón,Mediterránea,Medio,89 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
2,Vinoteca Moratín,Internacional,Medio,1221 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
3,Pilar Akaneya,Japonesa,Alto,276 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
4,Gioia,Italiana,Medio,387 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...


In [29]:
len(merged_data)

306

In [28]:
merged_data.head()

Unnamed: 0,restaurante,gastronomia,precio,opinion,reseña,url
0,Yakiniku Rikyu,Japonesa,Medio,267 opiniones,Patrocinado,https://www.tripadvisor.es/Restaurant_Review-g...
1,marmitón,Mediterránea,Medio,89 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
2,Vinoteca Moratín,Internacional,Medio,1221 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
3,Pilar Akaneya,Japonesa,Alto,276 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
4,Gioia,Italiana,Medio,387 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...


In [30]:
num_duplicates = merged_data.url.duplicated().sum()
num_duplicates

18

In [31]:
merged_data_no_duplicates = merged_data.drop_duplicates(subset='url', keep='first')

In [32]:
len(merged_data_no_duplicates)

288

In [33]:
merged_data_no_duplicates.head()

Unnamed: 0,restaurante,gastronomia,precio,opinion,reseña,url
0,Yakiniku Rikyu,Japonesa,Medio,267 opiniones,Patrocinado,https://www.tripadvisor.es/Restaurant_Review-g...
1,marmitón,Mediterránea,Medio,89 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
2,Vinoteca Moratín,Internacional,Medio,1221 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
3,Pilar Akaneya,Japonesa,Alto,276 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...
4,Gioia,Italiana,Medio,387 opiniones,MICHELIN,https://www.tripadvisor.es/Restaurant_Review-g...


In [34]:
reserva = merged_data_no_duplicates.copy()

In [35]:
reserva.to_csv('../restaurantes/datos/reserva.csv', index=False, encoding='utf-8')