# 🎯 TrustPilot Monitor Inteligente de Opinión
## 💎 Herramienta de Inteligencia Comercial Unificada y Mejorada

Este notebook representa la unión definitiva de los prototipos iniciales, optimizado para ofrecer una solución robusta, escalable y profesional para el análisis de la reputación digital.

### 👤 Créditos y Autoría:
- **Base del Proyecto**: [Juanes] (MONITOR_INTELIGENCIA_OPINION / Prototipos Iniciales)
- **Mejoras y Nuevos Módulos**: [Rubén] (Búsqueda dinámica, Modo Stealth, IA Híbrida, Dashboard Temporal)

### 🔬 Fases del Monitor:
1.  📥 **ADQUISICIÓN DE DATOS**: Scraping inteligente con rotación de identidad y búsqueda dinámica. [Base: Juanes | Mejoras: Rubén]
2.  🧹 **PREPROCESAMIENTO NLP**: Limpieza profunda y tokenización de textos en español. [Autor: Juanes]
3.  💎 **ANÁLISIS DE SENTIMIENTO**: Motor híbrido diccionario + IA avanzada. [Base: Juanes | Refuerzo AI: Rubén]
4.  📊 **DASHBOARD BI**: Visualización de impacto para toma de decisiones. [Visuales: Juanes | Eje Temporal: Rubén]

In [None]:
# =============================================================================
# PHASE 0: ENVIRONMENT SETUP
# [Compatibility improvements: Rubén / Original: Juanes]
# =============================================================================
import sys
import subprocess

# Progress banner printed before the dependency installation below starts.
print("üîß Inicializando entorno del monitor...")

def setup_environment():
    """Install the monitor's dependencies and download its NLP resources.

    Each package is installed with ``sys.executable -m pip`` so that it lands
    in the interpreter running this kernel. Installation is best effort: a
    package that fails to install is reported and the remaining ones are still
    attempted. After the packages, the NLTK tokenizer/stopword data and the
    spaCy Spanish model ``es_core_news_sm`` are downloaded.

    Returns:
        None.

    Raises:
        ImportError: if ``nltk`` is still not importable after the install loop.
    """
    # Unified list of essential libraries
    libs = [
        "requests", "beautifulsoup4", "lxml", "nltk", "textblob",
        "googletrans==4.0.0-rc1", "wordcloud", "matplotlib",
        "seaborn", "plotly", "fake-useragent", "pandas",
        "numpy", "regex", "tqdm", "spacy"
    ]

    for lib in libs:
        try:
            # [Rubén] sys.executable guarantees installation into the active kernel
            subprocess.check_call([sys.executable, "-m", "pip", "install", lib, "-q"])
        except (subprocess.CalledProcessError, OSError) as exc:
            # Keep going, but make the failure visible instead of hiding it;
            # only pip/process errors are caught so Ctrl-C still interrupts.
            print(f"Aviso: no se pudo instalar '{lib}' ({exc})")

    # Download NLP resources (imported here because nltk may have just been installed)
    import nltk
    nltk.download(['punkt', 'stopwords', 'punkt_tab'], quiet=True)

    try:
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "es_core_news_sm", "-q"])
    except (subprocess.CalledProcessError, OSError) as exc:
        print(f"Aviso: no se pudo descargar el modelo spaCy 'es_core_news_sm' ({exc})")

# Run the installation when this cell executes, so the imports in the
# following cell can resolve.
setup_environment()
# NOTE: printed whenever setup_environment() returns; pip failures inside it
# do not raise, so this line does not prove that every package was installed.
print("‚úÖ Entorno listo para procesamiento.")

In [None]:
import pandas as pd
import numpy as np
import requests
import re
import time
import random
from datetime import datetime
from typing import List, Dict, Optional
from collections import Counter
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from textblob import TextBlob
from googletrans import Translator
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import warnings

# Install a global "ignore" filter: every warning category is suppressed from
# here on, including deprecation notices raised by the plotting libraries.
warnings.filterwarnings("ignore")
# Apply matplotlib's 'seaborn-v0_8' style sheet to the figures created later.
plt.style.use('seaborn-v0_8')
print("üìö Librer√≠as de an√°lisis cargadas.")

# 📥 FASE 1: SCRAPER INTELIGENTE DE TRUSTPILOT
### [Base: Juanes | Mejoras Rubén: Búsqueda y Modo Stealth]

In [None]:
class TrustpilotScraper:
    """Scrape review text, star rating and date for one business on es.trustpilot.com.

    [Base: Juanes | Improvements: Rubén]

    The target is either a full ``trustpilot.com/review/...`` URL or a
    free-text business name that is resolved through Trustpilot's search page.

    Attributes:
        query: business name or review-page URL given by the caller.
        max_pages: number of review pages requested by ``run``.
        target_url: review-page URL once it is known, otherwise ``None``.
        reviews_data: list of collected review records (dicts).
    """

    # Seconds to wait for a response; requests has no default timeout, so a
    # stalled connection would otherwise block the notebook indefinitely.
    REQUEST_TIMEOUT = 15

    def __init__(self, business_query: Optional[str] = None, max_pages: int = 5):
        self.query = business_query
        self.max_pages = max_pages
        self.ua = UserAgent()  # [Rubén] identity rotation
        self.session = requests.Session()
        self.base_url = "https://es.trustpilot.com"
        self.target_url = None
        self.reviews_data = []

    def _get_headers(self):
        """[Rubén] Stealth mode: realistic headers with a random User-Agent."""
        return {
            'User-Agent': self.ua.random,
            'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/'
        }

    def search_business(self) -> bool:
        """[Rubén] Resolve ``self.query`` into ``self.target_url``.

        A query that already contains ``trustpilot.com/review/`` is used as
        the target directly. Anything else is sent to the site search and the
        first business card link found becomes the target.

        Returns:
            True when ``self.target_url`` was set, False otherwise (empty
            query, network/HTTP error, or no matching business card).
        """
        if not self.query:
            return False
        if "trustpilot.com/review/" in self.query:
            self.target_url = self.query
            return True

        print(f"üîç Buscando: '{self.query}'...")
        try:
            # Passing the query through ``params`` lets requests URL-encode it
            # (spaces, '&', '#', accents) instead of only replacing spaces.
            res = self.session.get(
                f"{self.base_url}/search",
                params={'query': self.query},
                headers=self._get_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f"Aviso: error de red al buscar la empresa ({exc})")
            return False

        soup = BeautifulSoup(res.content, 'html.parser')
        link = soup.select_one('a[data-business-unit-card-link="true"]')
        href = link.get('href') if link else None
        if not href:
            return False

        self.target_url = self.base_url + href
        print(f"‚úÖ Encontrado: {self.target_url}")
        return True

    def safe_extract(self, element, select_list: List[str], attr: str = None) -> str:
        """[Juanes] Return the first match among several CSS selectors.

        Trying a list of selectors keeps extraction working when the page
        markup changes between variants.

        Args:
            element: parsed node exposing ``select_one``.
            select_list: CSS selectors tried in order.
            attr: attribute to read from the match; when omitted the stripped
                text of the match is returned.

        Returns:
            The attribute value or text of the first match, or ``""`` when no
            selector matches.
        """
        for selector in select_list:
            found = element.select_one(selector)
            if found:
                return found.get(attr, "") if attr else found.get_text(strip=True)
        return ""

    @staticmethod
    def _parse_rating(rating_alt: str) -> int:
        """Return the first digit found in the rating image alt text, or 0."""
        # The alt text may be empty or contain no digit at all; in that case
        # the rating is reported as 0 instead of raising on a missing match.
        match = re.search(r'\d', rating_alt or "")
        return int(match.group()) if match else 0

    def _parse_card(self, card) -> Optional[dict]:
        """Turn one review card into a record, or ``None`` when it has no text."""
        text = self.safe_extract(
            card,
            ['p[data-service-review-text-typography="true"]',
             'p[data-review-content-typography="true"]'],
        )
        if not text:
            return None

        rating_alt = self.safe_extract(card, ['.styles_reviewHeader__iU9_n img'], 'alt')
        date_iso = self.safe_extract(card, ['time'], 'datetime')
        return {
            'texto': text,
            'puntuacion': self._parse_rating(rating_alt),
            # Keep only the YYYY-MM-DD part; fall back to today when the card has no date.
            'fecha': date_iso[:10] if date_iso else datetime.now().strftime('%Y-%m-%d'),
        }

    def run(self) -> pd.DataFrame:
        """Download up to ``max_pages`` review pages and return the reviews.

        Returns:
            DataFrame with columns ``texto``, ``puntuacion`` and ``fecha``;
            an empty DataFrame when the business cannot be resolved or no
            review was collected. Scraping stops at the first network/HTTP
            error and whatever was collected so far is returned.
        """
        if not self.target_url and not self.search_business():
            return pd.DataFrame()

        for page in range(1, self.max_pages + 1):
            print(f"üìÑ P√°gina {page}...")
            time.sleep(random.uniform(2, 4))  # [Rubén] anti-detection delay
            try:
                # ``params`` merges correctly with a target URL that already
                # carries its own query string.
                res = self.session.get(
                    self.target_url,
                    params={'page': page},
                    headers=self._get_headers(),
                    timeout=self.REQUEST_TIMEOUT,
                )
                res.raise_for_status()
            except requests.RequestException as exc:
                print(f"Aviso: se detiene la descarga por un error de red ({exc})")
                break

            soup = BeautifulSoup(res.content, 'html.parser')
            for card in soup.select('article[data-service-review-card-paper="true"]'):
                review = self._parse_card(card)
                if review is not None:
                    self.reviews_data.append(review)
        return pd.DataFrame(self.reviews_data)

In [None]:
# [Rubén] Scraper execution
# Surrounding whitespace is stripped so that a blank answer falls back to the
# default business and a pasted URL is not sent with stray spaces.
empresa = input("Ingrese nombre de empresa o URL de Trustpilot: ").strip() or "Amazon Spain"
df_raw = TrustpilotScraper(empresa, max_pages=3).run()

if df_raw.empty:
    # Courtesy dataset for quick tests when there is no internet or the site blocks us
    df_raw = pd.DataFrame({'texto': ["Muy bueno", "Mal servicio", "Perfecto", "No lleg√≥"], 'puntuacion': [5, 1, 5, 1], 'fecha': ["2024-01-01"]*4})

print(f"üìå {len(df_raw)} rese√±as capturadas.")

# 🧹 FASE 2: PREPROCESAMIENTO NLP
### [Autor Principal: Juanes]

In [None]:
class TextCleaner:
    """[Juanes] Normalise Spanish review text for the sentiment and word-cloud steps.

    Attributes:
        stop: set of tokens to drop (NLTK Spanish stopwords plus a few domain
            words added in ``__init__``).
    """

    def __init__(self):
        self.stop = set(stopwords.words('spanish'))
        # Domain words added on top of the NLTK list so they are dropped too.
        self.stop.update(['amazon', 'servicio', 'producto', 'pedido', 'envío'])

    def clean(self, text):
        """Lower-case, strip noise, tokenize and filter one review.

        Args:
            text: raw review text. Non-string values (``None``, ``NaN``) are
                treated as an empty review.

        Returns:
            Space-joined tokens that are longer than two characters and not
            in ``self.stop``; ``''`` for non-string input.
        """
        if not isinstance(text, str):
            return ''
        # [Juanes] Normalisation and noise removal
        text = text.lower()
        # Keep only lower-case letters (including Spanish accented vowels,
        # 'ü' and 'ñ') and whitespace. The character class previously held
        # mis-encoded bytes, which removed every accented letter from the text.
        text = re.sub(r'[^a-záéíóúüñ\s]', '', text)
        tokens = word_tokenize(text)
        return ' '.join([t for t in tokens if t not in self.stop and len(t) > 2])

# Build the cleaner once and reuse it for every review.
cleaner = TextCleaner()

# Work on a copy so the raw scrape in df_raw keeps its original columns.
df_proc = df_raw.copy()
df_proc['texto_limpio'] = df_proc['texto'].map(cleaner.clean)

print("üßπ Textos procesados y limpios.")

# 💎 FASE 3: ANÁLISIS DE SENTIMIENTO
### [Base Juanes | Mejora AI Rubén]

In [None]:
class SentimentEngine:
    """Hybrid sentiment classifier: keyword dictionaries with a translation fallback.

    [Base: Juanes | AI fallback: Rubén]

    Attributes:
        pos: set of words counted as positive evidence.
        neg: set of words counted as negative evidence.
        trans: translator used to obtain English text for TextBlob.
    """

    def __init__(self):
        # [Juanes] Quick-reference dictionaries. The accented entries were
        # previously stored mis-encoded and could never match a real token.
        self.pos = {'excelente', 'perfecto', 'bueno', 'rápido', 'recomiendo'}
        self.neg = {'malo', 'pésimo', 'lento', 'estafa', 'horrible'}
        self.trans = Translator()

    @staticmethod
    def _label(score):
        """Map a signed score to 'positivo', 'negativo' or 'neutral' (zero)."""
        if score > 0:
            return 'positivo'
        if score < 0:
            return 'negativo'
        return 'neutral'

    def analyze(self, text):
        """Classify one cleaned review.

        Distinct words of ``text`` are matched against the two dictionaries.
        When the dictionaries tie and the text is longer than 50 characters,
        the text is translated to English and TextBlob's polarity decides.

        Args:
            text: review text; non-string values are classified as 'neutral'.

        Returns:
            'positivo', 'negativo' or 'neutral'.
        """
        if not isinstance(text, str):
            return 'neutral'

        words = set(text.split())
        p_score = len(words & self.pos)
        n_score = len(words & self.neg)

        # [Rubén] Fall back to the translation + TextBlob path for longer texts
        if len(text) > 50 and p_score == n_score:
            try:
                en_text = self.trans.translate(text, dest='en').text
                polarity = TextBlob(en_text).sentiment.polarity
            except Exception:
                # Any translation/analysis failure falls back to the
                # dictionary verdict. ``Exception`` (not a bare except) lets
                # KeyboardInterrupt stop a long-running batch.
                polarity = None
            if polarity is not None:
                return self._label(polarity)

        return self._label(p_score - n_score)

# One engine instance classifies every cleaned review.
engine = SentimentEngine()
df_proc['sentimiento'] = df_proc['texto_limpio'].map(engine.analyze)

# Show the share of each sentiment label (fractions, not counts).
print(df_proc['sentimiento'].value_counts(normalize=True))

# 📊 FASE 4: VISUALIZACIÓN GRÁFICA (BI)
### [Diseño Juanes | Rubén: Visión Temporal]

In [None]:
# Dashboard canvas: one figure holding a 2x2 grid of panels, each drawn on
# an explicitly named axes object.
fig = plt.figure(figsize=(15, 10))

# Panel 1 - sentiment distribution [Juanes]
ax_sentiment = fig.add_subplot(2, 2, 1)
sns.countplot(data=df_proc, x='sentimiento', palette='viridis', ax=ax_sentiment)
ax_sentiment.set_title("Distribuci√≥n de Sentimiento [Juanes]")

# Panel 2 - word cloud built from all cleaned reviews [Juanes]
ax_cloud = fig.add_subplot(2, 2, 2)
corpus = ' '.join(df_proc['texto_limpio'])
wc = WordCloud(background_color='white').generate(corpus)
ax_cloud.imshow(wc)
ax_cloud.axis('off')
ax_cloud.set_title("Nube de Temas [Juanes]")

# Panel 3 - mean star rating per date [Rubén]
ax_trend = fig.add_subplot(2, 2, 3)
# The 'fecha' column is converted in place so the grouping uses real dates.
df_proc['fecha'] = pd.to_datetime(df_proc['fecha'])
mean_by_date = df_proc.groupby('fecha')['puntuacion'].mean()
mean_by_date.plot(marker='o', color='red', ax=ax_trend)
ax_trend.set_title("Evoluci√≥n de Satisfacci√≥n [Rub√©n]")

# Panel 4 - star rating spread within each predicted sentiment [Rubén]
ax_box = fig.add_subplot(2, 2, 4)
sns.boxplot(data=df_proc, x='sentimiento', y='puntuacion', ax=ax_box)
ax_box.set_title("Consistencia IA vs Rating [Rub√©n]")

fig.tight_layout()
plt.show()