In [3]:
from langdetect import detect, DetectorFactory
import pandas as pd
import os

In [4]:
def read_tripadvisor_data(path: str) -> pd.DataFrame:
    """
    Load every ``Chile_all1_part<N>.csv`` file found in ``path`` into one DataFrame.

    Parts are read in ascending numeric order of ``<N>`` and stacked under a
    fresh 0..n-1 index. Directory entries that do not follow the naming
    pattern (checkpoint folders, OS metadata files, other data sets) are
    ignored instead of being counted as parts.

    Parameters
    ----------
    path : str
        Directory that contains the semicolon-separated part files.

    Returns
    -------
    pd.DataFrame
        Concatenation of all parts, or an empty DataFrame when the directory
        holds no matching file.
    """
    prefix, suffix = "Chile_all1_part", ".csv"

    # Collect (part number, file name) pairs so the sort is numeric
    # (part2 before part10) while the real on-disk name is what gets opened.
    parts = []
    for entry in os.listdir(path):
        if entry.startswith(prefix) and entry.endswith(suffix):
            number = entry[len(prefix):-len(suffix)]
            if number.isdecimal():
                parts.append((int(number), entry))

    frames = [
        pd.read_csv(os.path.join(path, name), sep=';')
        for _, name in sorted(parts)
    ]

    # pd.concat raises on an empty list; keep the "no files -> empty frame" result.
    if not frames:
        return pd.DataFrame()

    # Single concat instead of re-copying the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)

In [5]:
def safe_detect(text):
    """
    Detect the language of ``text``, falling back to ``'unknown'``.

    Returns ``'unknown'`` when ``text`` is not a string, is empty or
    whitespace-only, or when ``detect`` raises an ``Exception``; otherwise
    returns whatever ``detect(text)`` returns.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if has_content:
        try:
            return detect(text)
        except Exception:
            pass  # best-effort: fall through to the 'unknown' label
    return 'unknown'

In [6]:
def filter_language_reviews(
        df: pd.DataFrame, 
        languages: tuple = ('es',), 
        text_col: str | None = None
) -> pd.DataFrame:
    """
    Filtra reseñas por uno o varios idiomas usando langdetect.
    - df: DataFrame original
    - languages: iterable de códigos de idioma (p. ej., ('es', 'en')); también acepta string único
    - text_col: nombre de la columna de texto (si None, intenta 'review_text' y luego 'Review')

    Retorna un nuevo DataFrame filtrado (copia).
    """
    # Reproducibilidad
    DetectorFactory.seed = 0

    # Selección de columna de texto
    if text_col is None:
        if 'review_text' in df.columns:
            text_col = 'review_text'
        elif 'Review' in df.columns:
            text_col = 'Review'
        else:
            raise ValueError("No se encontró la columna de reviews")
        
    lang_set = set(languages)

    tmp_lang = df[text_col].apply(safe_detect)
    mask = tmp_lang.isin(lang_set)
    filtered_df = df[mask].copy()
    return filtered_df

In [7]:
# Load the TripAdvisor CSV parts stored under ./data into one DataFrame.
df = read_tripadvisor_data(path='./data')

In [8]:
# Pin langdetect's seed so repeated runs of this cell yield the same detections.
DetectorFactory.seed = 0

# NOTE(review): this redefines the safe_detect from an earlier cell and shadows
# it once this cell runs; consider keeping a single definition.
def safe_detect(text: str) -> str:
    """
    Detect the language of ``text``, falling back to ``'unknown'``.

    Returns ``'unknown'`` when ``text`` is not a string, is empty or
    whitespace-only, or when ``detect`` raises an ``Exception``; otherwise
    returns whatever ``detect(text)`` returns.
    """
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Deliberately not a bare ``except:``, which would also trap
        # KeyboardInterrupt/SystemExit and make a long ``.apply`` over the
        # reviews impossible to interrupt.
        return 'unknown'

# Make sure the text column exists before running detection on it.
if 'review_text' not in df.columns:
    raise ValueError("La columna 'review_text' no existe en el DataFrame.")

# Approach 1: detect inline, store the code on df, keep only 'es'/'en' rows.
# NOTE(review): this assignment mutates df in place and replaces any existing
# 'language' column. The recorded output for filtered_df_2 (values such as
# 'spanish'/'english') suggests the raw data already ships one; confirm the
# overwrite is intended.
df['language'] = df['review_text'].apply(safe_detect)
filtered_df = df[df['language'].isin(['es', 'en'])].copy()
print('Total original:', len(df))
print('Filtrados (es/en):', len(filtered_df))
print(filtered_df['language'].value_counts())

# Approach 2: the same filter through filter_language_reviews, applied to a
# freshly re-read copy of the data (df itself was modified above).
df_2 = read_tripadvisor_data('./data')
filtered_df_2 = filter_language_reviews(df_2, languages=('es', 'en'), text_col='review_text')
print('Total original:', len(df_2))
print('Filtrados (es/en):', len(filtered_df_2))
# NOTE(review): filter_language_reviews does not add a 'language' column, so
# this counts whatever 'language' column the CSVs provide, not the detected
# codes that were used for filtering.
print(filtered_df_2['language'].value_counts())

Total original: 565350
Filtrados (es/en): 462695
language
es    310430
en    152265
Name: count, dtype: int64
Total original: 565350
Filtrados (es/en): 462695
language
spanish       307284
english       155239
portuguese       172
Name: count, dtype: int64
Total original: 565350
Filtrados (es/en): 462695
language
spanish       307284
english       155239
portuguese       172
Name: count, dtype: int64


In [5]:
# Preview the first rows of the es/en-filtered reviews.
filtered_df.head()

Unnamed: 0,region_name,attraction_name,language,username,rating_review,title,review_text,written_date,visit_date,companion_type,contribution,sentiment,sentiment_score,location
0,I- Región de Tarapacá,Museo Corbeta Esmeralda,en,ARMANDO H,5,Remarkable Museum with the history of our sail...,Very well documented guide. Excellent descript...,2025-05-16,2025-05-01,Couples,6,VERY_POSITIVE,4,NO INFO
1,I- Región de Tarapacá,Museo Corbeta Esmeralda,en,Flyer26777231658,5,Excellent!,Excellent tour. The guide (Brian) showed off w...,2025-05-16,2025-05-01,Family,1,VERY_POSITIVE,4,NO INFO
2,I- Región de Tarapacá,Museo Corbeta Esmeralda,en,Henry R,5,Great family experience,"It was an extraordinary, exciting and very int...",2025-05-03,2025-05-01,Family,5,VERY_POSITIVE,4,NO INFO
3,I- Región de Tarapacá,Museo Corbeta Esmeralda,en,Freedom28030667115,5,Excellent experience,"Very nice place to visit, highlight the work o...",2025-04-10,2025-04-01,Family,1,VERY_POSITIVE,4,NO INFO
4,I- Región de Tarapacá,Museo Corbeta Esmeralda,en,Italia Q,5,Lovely experience,"The excellent Byron guide, you can see that he...",2025-02-22,2025-02-01,Family,5,VERY_POSITIVE,4,"Santiago, Chile"
