# IMPORTS

In [1]:
import os

import pandas as pd
pd.options.display.max_columns = None

import numpy as np
import plotly.express as px

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from statistics import mean
from scipy import signal

from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
from scipy.fft import fft

import nltk
try:
    nltk.data.find('corpora/stopwords')
except nltk.downloader.DownloadError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
spanish_stop_words = stopwords.words('spanish')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

from dateutil.relativedelta import relativedelta
from datetime import datetime

import sys

import re

from matplotlib import pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# OBTENCIÓN DE DATOS

In [2]:
# Cargar el archivo csv
df_noticias_pais = pd.read_csv('Data/df_noticias_pais.csv')
df_noticias_region = pd.read_csv('Data/df_noticias_region.csv')  

In [3]:
df_noticias_pais.head(2)

Unnamed: 0,id,title,slug,subcategory,category,published,tags,País,Usuarios activos,Usuarios recurrentes,Sesiones,Vistas,Vistas por usuario activo,Vistas por sesión,Tiempo de interacción medio por usuario activo,Duración media de la sesión,Porcentaje de rebote
0,168132,Cerco a las viviendas turísticas: las nuevas n...,168132_cerco-a-las-viviendas-turisticas-las-nu...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Argentina,12.0,10.0,19.0,22.0,1.833333,1.157895,158.75,56.680664,0.157895
1,168132,Cerco a las viviendas turísticas: las nuevas n...,168132_cerco-a-las-viviendas-turisticas-las-nu...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Australia,2.0,1.0,3.0,4.0,2.0,1.333333,28.5,101.715149,0.0


In [4]:
df_noticias_pais.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448039 entries, 0 to 448038
Data columns (total 17 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   id                                              448039 non-null  int64  
 1   title                                           448039 non-null  object 
 2   slug                                            448039 non-null  object 
 3   subcategory                                     448039 non-null  object 
 4   category                                        448039 non-null  object 
 5   published                                       448039 non-null  object 
 6   tags                                            445467 non-null  object 
 7   País                                            448039 non-null  object 
 8   Usuarios activos                                448039 non-null  float64
 9   Usuarios recurrentes      

In [5]:
df_noticias_region.head(2)

Unnamed: 0,id,title,slug,subcategory,category,published,tags,Región,Usuarios activos,Usuarios recurrentes,Sesiones,Vistas,Vistas por usuario activo,Vistas por sesión,Tiempo de interacción medio por usuario activo,Duración media de la sesión,Porcentaje de rebote
0,168132,Cerco a las viviendas turísticas: las nuevas n...,168132_cerco-a-las-viviendas-turisticas-las-nu...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Andalusia,137.0,83.0,170.0,169.0,1.221253,0.914434,82.321178,74.193478,0.07714
1,168132,Cerco a las viviendas turísticas: las nuevas n...,168132_cerco-a-las-viviendas-turisticas-las-nu...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Aragon,11.0,7.0,17.0,13.0,1.37037,0.633333,69.777778,61.345131,0.333333


In [6]:
df_noticias_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325793 entries, 0 to 325792
Data columns (total 17 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   id                                              325793 non-null  int64  
 1   title                                           325793 non-null  object 
 2   slug                                            325793 non-null  object 
 3   subcategory                                     325793 non-null  object 
 4   category                                        325793 non-null  object 
 5   published                                       325793 non-null  object 
 6   tags                                            323528 non-null  object 
 7   Región                                          325793 non-null  object 
 8   Usuarios activos                                325793 non-null  float64
 9   Usuarios recurrentes      

In [7]:
# Convertir la columna a datetime
df_noticias_pais['published'] = pd.to_datetime(df_noticias_pais['published'])
df_noticias_region['published'] = pd.to_datetime(df_noticias_region['published'])

# MODELO IA

Desarrollamos un modelo para predecir las visitas a noticias por país y, para España, por región.

Para el entrenamiento del modelo, utilizaremos exclusivamente noticias maduras, es decir, aquellas cuya curva de visitas ya ha descendido y el número de visitas se ha estabilizado. Esto asegura que el modelo aprenda de datos consistentes y representativos del comportamiento final de la noticia.

El input para la predicción es: fecha, título, categoría y subcategoría. El output serán las visitas esperadas por país y, si es España, por región.

## Tratamiento de datos del modelo ia

In [8]:
def preparacion_dataset(df_region, df_pais):

    # Seleccionar columnas relevantes
    df_region = df_region[['id', 'title', 'subcategory', 'category', 'published', 'tags', 'Región', 'Vistas']]
    df_pais = df_pais[['id', 'title', 'subcategory', 'category', 'published', 'tags', 'País', 'Vistas']]
    
    # Eliminar filas que contenga nulos en Visitas, País o Región
    df_region = df_region.dropna(subset=['Vistas', 'Región'])
    df_pais = df_pais.dropna(subset=['Vistas', 'País'])
    
    # Encontrar la fecha de publicación máxima en el dataset actual
    df_region_max_date = df_region['published'].max()
    df_pais_max_date = df_pais['published'].max()
    
    # Restar 2 meses a la fecha maxima (para que las noticias sean "maduras")
    df_region_menos_2meses = df_region_max_date - relativedelta(months=2)
    df_pais_menos_2meses = df_pais_max_date - relativedelta(months=2)
    
    # Eliminar noticias no maduras del dataset
    df_region = df_region[df_region["published"] >= df_region_menos_2meses]
    df_pais = df_pais[df_pais["published"] >= df_pais_menos_2meses]
    print(f"Número de noticias por region: {len(df_region)}")
    print(f"Número de noticias por pais: {len(df_pais)}")
    
    
    return df_region, df_pais

In [9]:
df_noticias_region, df_noticias_pais = preparacion_dataset(df_noticias_region, df_noticias_pais)

Número de noticias por region: 14953
Número de noticias por pais: 21468


In [10]:
df_noticias_pais.head(2)

Unnamed: 0,id,title,subcategory,category,published,tags,País,Vistas
0,168132,Cerco a las viviendas turísticas: las nuevas n...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Argentina,22.0
1,168132,Cerco a las viviendas turísticas: las nuevas n...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Australia,4.0


In [11]:
df_noticias_region.head(2)

Unnamed: 0,id,title,subcategory,category,published,tags,Región,Vistas
0,168132,Cerco a las viviendas turísticas: las nuevas n...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Andalusia,169.0
1,168132,Cerco a las viviendas turísticas: las nuevas n...,Actualidad hotelera,Hoteles y Alojamientos,2025-01-31,"alquiler turístico, Ayuntamiento de Granada, A...",Aragon,13.0


In [12]:
#df_noticias_pais.to_csv("df_noticias_pais.csv", index=False)
#df_noticias_region.to_csv("df_noticias_region.csv", index=False)

In [13]:
def preprocess_fecha(df):
    # Convert 'fecha' to datetime, coercing errors to NaT
    df['published'] = pd.to_datetime(df['published'], errors='coerce')

    # Drop rows where 'fecha' is NaT (invalid dates)
    df.dropna(subset=['published'], inplace=True)

    # Feature Engineering from 'fecha'
    df['year'] = df['published'].dt.year.astype(int)
    df['month'] = df['published'].dt.month.astype(int)
    df['day'] = df['published'].dt.day.astype(int)
    df['day_of_week'] = df['published'].dt.dayofweek.astype(int) # Monday=0, Sunday=6
    df['day_of_year'] = df['published'].dt.dayofyear.astype(int)
    # isocalendar().week returns a UInt32, explicitly cast to int for consistency
    df['week_of_year'] = df['published'].dt.isocalendar().week.astype(int)

    # Drop original 'fecha' as we have extracted features
    df = df.drop('published', axis=1)

    return df

In [14]:
def clean_text(text):
    """
    Basic text cleaning: lowercase, remove punctuation, remove numbers.
    You might want to add more sophisticated cleaning like stop words removal, stemming, etc.
    """
    text = str(text).lower()
    text = re.sub(r'[^a-zñáéíóúü\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [15]:
# Clean the 'titulo' column
df_noticias_pais['title_cleaned'] = df_noticias_pais['title'].apply(clean_text)
df_noticias_region['title_cleaned'] = df_noticias_region['title'].apply(clean_text)

In [16]:
# Apply preprocessing
df_pais_processed = preprocess_fecha(df_noticias_pais.copy())
df_region_processed = preprocess_fecha(df_noticias_region.copy())

In [17]:
# Define features and target for country-level prediction
features_pais = ['category', 'subcategory', 'title_cleaned', 'País', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year']
target_pais = 'Vistas'

X_pais = df_pais_processed[features_pais]
y_pais = df_pais_processed[target_pais]

## Modelo sin hiperparametros

### Entrenamiento

In [18]:
# Define features and target for region-level prediction (for Spain)
# Note: In a real scenario, df_noticias_region would only contain Spanish data
features_region = ['category', 'subcategory', 'title_cleaned', 'Región', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year']
target_region = 'Vistas'

X_region = df_region_processed[features_region]
y_region = df_region_processed[target_region]

In [19]:
# Identify categorical features for One-Hot Encoding
categorical_features_pais = ['category', 'subcategory', 'País']
text_features_pais = 'title_cleaned'
categorical_features_region = ['category', 'subcategory', 'Región']
text_features_region = 'title_cleaned'

In [20]:
# Create preprocessor for country model
preprocessor_pais = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_pais),
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words=spanish_stop_words), text_features_pais)
    ],
    remainder='passthrough'
)

# Create preprocessor for region model
preprocessor_region = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_region),
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words=spanish_stop_words), text_features_region)
    ],
    remainder='passthrough'
)

In [21]:
# --- MODEL TRAINING ---

# 1. Country-level Model
print("\n--- TRAINING COUNTRY-LEVEL MODEL ---")
X_train_pais, X_test_pais, y_train_pais, y_test_pais = train_test_split(
    X_pais, y_pais, test_size=0.2, random_state=42
)


--- TRAINING COUNTRY-LEVEL MODEL ---


In [22]:
# Create the country model pipeline
model_pais = Pipeline(steps=[
    ('preprocessor', preprocessor_pais),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [23]:
# Train the country model
model_pais.fit(X_train_pais, y_train_pais)

In [24]:
# 2. Region-level Model (for Spain)
print("\n--- TRAINING REGION-LEVEL MODEL (FOR SPAIN) ---")
X_train_region, X_test_region, y_train_region, y_test_region = train_test_split(
    X_region, y_region, test_size=0.2, random_state=42
)


--- TRAINING REGION-LEVEL MODEL (FOR SPAIN) ---


In [25]:
# Create the region model pipeline
model_region = Pipeline(steps=[
    ('preprocessor', preprocessor_region),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [26]:
# Train the region model
model_region.fit(X_train_region, y_train_region)

### Evaluación

Mean Absolute Error (MAE)
El Mean Absolute Error (MAE), o Error Absoluto Medio, mide el promedio de las diferencias absolutas entre las predicciones y los valores reales.
- Valores más bajos son mejores: Un MAE de 0 indica un modelo perfecto (las predicciones son exactamente iguales a los valores reales).
- Misma unidad que la variable objetivo: El MAE se expresa en la misma unidad que la variable que estás prediciendo (en tu caso, "visitas"). Esto lo hace muy interpretable. Por ejemplo, si tu MAE es de 50, significa que, en promedio, tus predicciones se desvían de los valores reales en 50 visitas.

El R-squared (R2), o Coeficiente de Determinación, es una métrica que representa la proporción de la varianza en la variable dependiente (las visitas) que es predecible a partir de las variables independientes (tus características).
- Un R2 de 1.0 significa que el modelo explica el 100% de la variabilidad en la variable objetivo; es un ajuste perfecto.
- Un R2 de 0.0 significa que el modelo no explica ninguna de la variabilidad de la variable objetivo. Es tan bueno como simplemente predecir la media de los datos reales.
- Un R2 negativo es posible cuando el modelo es peor que simplemente predecir la media, lo que indica un modelo muy pobre.



In [27]:
# Overall typical visits for country data
overall_mean_pais = df_noticias_pais['Vistas'].mean()
overall_mean_region  = df_noticias_region['Vistas'].mean()
print(f"Número promedio de visitas en todas las noticias (País): {overall_mean_pais:.2f}")
print(f"Número promedio de visitas en todas las noticias (Región): {overall_mean_region:.2f}")

Número promedio de visitas en todas las noticias (País): 5.79
Número promedio de visitas en todas las noticias (Región): 55.02


In [28]:
# Evaluate Country Model
y_pred_pais = model_pais.predict(X_test_pais)
mae_pais = mean_absolute_error(y_test_pais, y_pred_pais)
r2_pais = r2_score(y_test_pais, y_pred_pais)

In [29]:
print(f"\nCountry Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae_pais:.2f}")
print(f"R-squared (R2): {r2_pais:.2f}")


Country Model Evaluation:
Mean Absolute Error (MAE): 5.15
R-squared (R2): 0.01


In [30]:
# Evaluate Region Model
y_pred_region = model_region.predict(X_test_region)
mae_region = mean_absolute_error(y_test_region, y_pred_region)
r2_region = r2_score(y_test_region, y_pred_region)

In [31]:
print(f"\nRegion Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae_region:.2f}")
print(f"R-squared (R2): {r2_region:.2f}")


Region Model Evaluation:
Mean Absolute Error (MAE): 46.49
R-squared (R2): 0.12


### Función de predicción

In [32]:
def predict_news_visits(published: str, title: str, category: str, subcategory: str):
    """
    Predicts news visits by country and, if Spain, by region.

    Args:
        published (str): Date of the news (e.g., 'YYYY-MM-DD').
        title (str): Title of the news.
        categoria (str): Category of the news.
        subcategoria (str): Subcategory of the news.

    Returns:
        dict: A dictionary containing predicted visits by country, and by region if Spain.
    """
    input_data = pd.DataFrame([{
        'published': published,
        'title': title,
        'category': category,
        'subcategory': subcategory
    }])
    
    # Clean the title for prediction, consistent with training
    input_data['title_cleaned'] = input_data['title'].apply(clean_text)

    # Convert 'published' to datetime, coercing errors
    input_data['published'] = pd.to_datetime(input_data['published'], errors='coerce')
    # Drop if date is invalid, though for single prediction, you might raise an error
    if input_data['published'].isna().any():
        raise ValueError("Formato de fecha no valido para la predicción.")

    # Feature Engineering from 'fecha'
    input_data['year'] = input_data['published'].dt.year.astype(int)
    input_data['month'] = input_data['published'].dt.month.astype(int)
    input_data['day'] = input_data['published'].dt.day.astype(int)
    input_data['day_of_week'] = input_data['published'].dt.dayofweek.astype(int)
    input_data['day_of_year'] = input_data['published'].dt.dayofyear.astype(int)
    input_data['week_of_year'] = input_data['published'].dt.isocalendar().week.astype(int)

    # Drop original 'published' and 'title' (consistent with training preprocessing)
    input_data = input_data.drop(['published', 'title'], axis=1)

    predictions = {}

    # --- Predict for each country ---
    unique_countries = X_pais['País'].unique() # Get unique countries from training data
    country_predictions_list = []

    for country in unique_countries:
        country_input = input_data.copy()
        country_input['País'] = country

        # Ensure the columns match the order of features_pais used in X_pais
        # before passing to the preprocessor
        country_input_reordered = country_input[
            ['category', 'subcategory', 'title_cleaned', 'País', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year']
        ]

        # Use .iloc[0] to ensure it's a Series, not a DataFrame with one row
        predicted_visits = model_pais.predict(country_input_reordered)[0]
        country_predictions_list.append({'country': country, 'predicted_visits': max(0, int(predicted_visits))})

    predictions['country_visits'] = country_predictions_list

    # --- Predict for regions if the country is Spain ---
    unique_regions = X_region['Región'].unique() # Get unique regions from training data
    region_predictions_list = []

    for region in unique_regions:
        region_input = input_data.copy()
        region_input['Región'] = region

        # Ensure the columns match the order of features_region used in X_region
        region_input_reordered = region_input[
            ['category', 'subcategory', 'title_cleaned', 'Región', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year']
        ]

        # Use .iloc[0] to ensure it's a Series, not a DataFrame with one row
        predicted_visits_region = model_region.predict(region_input_reordered)[0]
        region_predictions_list.append({'Región': region, 'predicted_visits': max(0, int(predicted_visits_region))})

    predictions['spain_region_visits'] = region_predictions_list

    return predictions

In [33]:
# --- EXAMPLE USAGE OF THE PREDICTION FUNCTION ---
print("\n--- EXAMPLE PREDICTION ---")
example_fecha = '2023-01-05'
example_titulo = 'Cerco a las viviendas turísticas'
example_categoria = 'Hoteles y Alojamientos'
example_subcategoria = 'Actualidad hotelera'

try:
    predicted_results = predict_news_visits(example_fecha, example_titulo, example_categoria, example_subcategoria)
    import json
    print(json.dumps(predicted_results, indent=4))
except ValueError as e:
    print(f"Prediction Error: {e}")


--- EXAMPLE PREDICTION ---
{
    "country_visits": [
        {
            "country": "Argentina",
            "predicted_visits": 8
        },
        {
            "country": "Australia",
            "predicted_visits": 1
        },
        {
            "country": "Belgium",
            "predicted_visits": 1
        },
        {
            "country": "Canada",
            "predicted_visits": 1
        },
        {
            "country": "Chile",
            "predicted_visits": 2
        },
        {
            "country": "Colombia",
            "predicted_visits": 6
        },
        {
            "country": "Cuba",
            "predicted_visits": 2
        },
        {
            "country": "Czechia",
            "predicted_visits": 1
        },
        {
            "country": "El Salvador",
            "predicted_visits": 1
        },
        {
            "country": "France",
            "predicted_visits": 4
        },
        {
            "country": "Germany",
          

In [34]:
# Example with a potentially problematic date
print("\n--- EXAMPLE PREDICTION (with invalid date) ---")
example_fecha_invalid = '2023-XX-05'
try:
    predicted_results_invalid = predict_news_visits(example_fecha_invalid, example_titulo, example_categoria, example_subcategoria)
    import json
    print(json.dumps(predicted_results_invalid, indent=4))
except ValueError as e:
    print(f"Prediction Error: {e}")


--- EXAMPLE PREDICTION (with invalid date) ---
Prediction Error: Formato de fecha no valido para la predicción.


## Modelo con hiperparametros

### Entrenamiento

In [35]:
# Define features and target for region-level prediction (for Spain)
# Note: In a real scenario, df_noticias_region would only contain Spanish data
features_region = ['category', 'subcategory', 'title_cleaned', 'Región', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year']
target_region = 'Vistas'

X_region = df_region_processed[features_region]
y_region = df_region_processed[target_region]

In [36]:
# Identify categorical features for One-Hot Encoding
categorical_features_pais = ['category', 'subcategory', 'País']
text_features_pais = 'title_cleaned'
categorical_features_region = ['category', 'subcategory', 'Región']
text_features_region = 'title_cleaned'

In [37]:
# Create preprocessor for country model
preprocessor_pais = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_pais),
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words=spanish_stop_words), text_features_pais)
    ],
    remainder='passthrough'
)

# Create preprocessor for region model
preprocessor_region = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_region),
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words=spanish_stop_words), text_features_region)
    ],
    remainder='passthrough'
)

In [38]:
# --- Model Pipelines (Base Models) ---
base_model_pais = Pipeline(steps=[
    ('preprocessor', preprocessor_pais),
    ('regressor', RandomForestRegressor(random_state=42)) 
])

base_model_region = Pipeline(steps=[
    ('preprocessor', preprocessor_region),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [39]:
# --- Hyperparameter Tuning (GridSearchCV) ---
print("\n--- Iniciando la búsqueda de hiperparámetros con GridSearchCV ---")
print("GridSearchCV explora sistemáticamente combinaciones de hiperparámetros para encontrar las que dan el mejor rendimiento.")
print("La métrica de optimización es el Mean Absolute Error (MAE) negado, ya que GridSearchCV busca maximizar un score.")

# Define parameter grid for RandomForestRegressor
# This is an illustrative grid; in a real-world scenario, you might explore more values
# or use RandomizedSearchCV for a wider search space if computation time is a concern.
param_grid = {
    'regressor__n_estimators': [50, 100], # Número de árboles en el bosque
    'regressor__max_features': ['sqrt', 'log2'], # Número de características a considerar para la mejor división
    'regressor__max_depth': [10, 20], # Profundidad máxima de cada árbol (None significa profundidad completa)
    'regressor__min_samples_split': [2, 5], # Mínimo de muestras requeridas para dividir un nodo interno
    'regressor__min_samples_leaf': [1, 2] # Mínimo de muestras requeridas para estar en un nodo hoja
}


--- Iniciando la búsqueda de hiperparámetros con GridSearchCV ---
GridSearchCV explora sistemáticamente combinaciones de hiperparámetros para encontrar las que dan el mejor rendimiento.
La métrica de optimización es el Mean Absolute Error (MAE) negado, ya que GridSearchCV busca maximizar un score.


In [40]:
# --- For Country Model ---
print("\nBuscando los mejores hiperparámetros para el Modelo de País (Country Model)...")
grid_search_pais = GridSearchCV(
    base_model_pais,
    param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)
try:
    grid_search_pais.fit(X_pais, y_pais)
    print("\n--- Resultados del Ajuste de Hiperparámetros para el Modelo de País ---")
    print("Mejores hiperparámetros encontrados:")
    print(grid_search_pais.best_params_)
    print(f"Mejor MAE (validación cruzada): {-grid_search_pais.best_score_:.2f} visitas")
    model_pais_tuned = grid_search_pais.best_estimator_
except Exception as e:
    print(f"Error durante el ajuste del Modelo de País: {e}")
    print("No se pudo obtener el mejor estimador. Podría ser un problema con los datos simulados o la configuración del GridSearchCV.")
    # Assign a default model if tuning fails
    model_pais_tuned = base_model_pais
    model_pais_tuned.fit(X_pais, y_pais) # Fit base model for demonstration


Buscando los mejores hiperparámetros para el Modelo de País (Country Model)...
Fitting 3 folds for each of 32 candidates, totalling 96 fits

--- Resultados del Ajuste de Hiperparámetros para el Modelo de País ---
Mejores hiperparámetros encontrados:
{'regressor__max_depth': 20, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Mejor MAE (validación cruzada): 5.48 visitas


In [41]:
# --- For Region Model ---
print("\nBuscando los mejores hiperparámetros para el Modelo de Región (Region Model - para España)...")
grid_search_region = GridSearchCV(
    base_model_region,
    param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)
try:
    grid_search_region.fit(X_region, y_region)
    print("\n--- Resultados del Ajuste de Hiperparámetros para el Modelo de Región ---")
    print("Mejores hiperparámetros encontrados:")
    print(grid_search_region.best_params_)
    print(f"Mejor MAE (validación cruzada): {-grid_search_region.best_score_:.2f} visitas")
    model_region_tuned = grid_search_region.best_estimator_
except Exception as e:
    print(f"Error durante el ajuste del Modelo de Región: {e}")
    print("No se pudo obtener el mejor estimador. Podría ser un problema con los datos simulados o la configuración del GridSearchCV.")
    # Assign a default model if tuning fails
    model_region_tuned = base_model_region
    model_region_tuned.fit(X_region, y_region) 


Buscando los mejores hiperparámetros para el Modelo de Región (Region Model - para España)...
Fitting 3 folds for each of 32 candidates, totalling 96 fits

--- Resultados del Ajuste de Hiperparámetros para el Modelo de Región ---
Mejores hiperparámetros encontrados:
{'regressor__max_depth': 20, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Mejor MAE (validación cruzada): 60.82 visitas


In [42]:
print("\n--- Entrenamiento de modelos con hiperparámetros ajustados (o base) ---")


--- Entrenamiento de modelos con hiperparámetros ajustados (o base) ---


### Evaluación

In [43]:
# Re-training and Evaluation of Tuned Country Model
X_train_pais, X_test_pais, y_train_pais, y_test_pais = train_test_split(
    X_pais, y_pais, test_size=0.2, random_state=42
)
# Ensure the model is fitted on the training split for fair evaluation
model_pais_tuned.fit(X_train_pais, y_train_pais)
y_pred_pais_tuned = model_pais_tuned.predict(X_test_pais)
mae_pais_tuned = mean_absolute_error(y_test_pais, y_pred_pais_tuned)
r2_pais_tuned = r2_score(y_test_pais, y_pred_pais_tuned)

print(f"\nEvaluación del Modelo de País (Ajustado):")
print(f"  Mean Absolute Error (MAE): {mae_pais_tuned:.2f} visitas")
print(f"  R-squared (R2): {r2_pais_tuned:.2f}")


Evaluación del Modelo de País (Ajustado):
  Mean Absolute Error (MAE): 6.25 visitas
  R-squared (R2): 0.06


In [44]:
# Re-training and Evaluation of Tuned Region Model
X_train_region, X_test_region, y_train_region, y_test_region = train_test_split(
    X_region, y_region, test_size=0.2, random_state=42
)
# Ensure the model is fitted on the training split for fair evaluation
model_region_tuned.fit(X_train_region, y_train_region)
y_pred_region_tuned = model_region_tuned.predict(X_test_region)
mae_region_tuned = mean_absolute_error(y_test_region, y_pred_region_tuned)
r2_region_tuned = r2_score(y_test_region, y_pred_region_tuned)

print(f"\nEvaluación del Modelo de Región (Ajustado):")
print(f"  Mean Absolute Error (MAE): {mae_region_tuned:.2f} visitas")
print(f"  R-squared (R2): {r2_region_tuned:.2f}")


Evaluación del Modelo de Región (Ajustado):
  Mean Absolute Error (MAE): 50.82 visitas
  R-squared (R2): 0.24


### Función de predicción

In [45]:
def predict_news_visits(published: str, title: str, category: str, subcategory: str):
    """
    Predicts news visits by country and, if Spain, by region.

    Args:
        published (str): Date of the news (e.g., 'YYYY-MM-DD').
        title (str): Title of the news.
        categoria (str): Category of the news.
        subcategoria (str): Subcategory of the news.

    Returns:
        dict: A dictionary containing predicted visits by country, and by region if Spain.
    """
    input_data = pd.DataFrame([{
        'published': published,
        'title': title,
        'category': category,
        'subcategory': subcategory
    }])
    
    # Clean the title for prediction, consistent with training
    input_data['title_cleaned'] = input_data['title'].apply(clean_text)

    # Convert 'published' to datetime, coercing errors
    input_data['published'] = pd.to_datetime(input_data['published'], errors='coerce')
    # Drop if date is invalid, though for single prediction, you might raise an error
    if input_data['published'].isna().any():
        raise ValueError("Formato de fecha no valido para la predicción.")

    # Feature Engineering from 'fecha'
    input_data['year'] = input_data['published'].dt.year.astype(int)
    input_data['month'] = input_data['published'].dt.month.astype(int)
    input_data['day'] = input_data['published'].dt.day.astype(int)
    input_data['day_of_week'] = input_data['published'].dt.dayofweek.astype(int)
    input_data['day_of_year'] = input_data['published'].dt.dayofyear.astype(int)
    input_data['week_of_year'] = input_data['published'].dt.isocalendar().week.astype(int)

    # Drop original 'published' and 'title' (consistent with training preprocessing)
    input_data = input_data.drop(['published', 'title'], axis=1)

    predictions = {}

    # --- Predict for each country ---
    unique_countries = X_pais['País'].unique() # Get unique countries from training data
    country_predictions_list = []

    for country in unique_countries:
        country_input = input_data.copy()
        country_input['País'] = country

        # Ensure the columns match the order of features_pais used in X_pais
        # before passing to the preprocessor
        country_input_reordered = country_input[
            ['category', 'subcategory', 'title_cleaned', 'País', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year']
        ]

        # Use .iloc[0] to ensure it's a Series, not a DataFrame with one row
        predicted_visits = model_pais_tuned.predict(country_input_reordered)[0]
        country_predictions_list.append({'country': country, 'predicted_visits': max(0, int(predicted_visits))})

    predictions['country_visits'] = country_predictions_list

    # --- Predict for regions if the country is Spain ---
    unique_regions = X_region['Región'].unique() # Get unique regions from training data
    region_predictions_list = []

    for region in unique_regions:
        region_input = input_data.copy()
        region_input['Región'] = region

        # Ensure the columns match the order of features_region used in X_region
        region_input_reordered = region_input[
            ['category', 'subcategory', 'title_cleaned', 'Región', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year']
        ]

        # Use .iloc[0] to ensure it's a Series, not a DataFrame with one row
        predicted_visits_region = model_region_tuned.predict(region_input_reordered)[0]
        region_predictions_list.append({'Región': region, 'predicted_visits': max(0, int(predicted_visits_region))})

    predictions['spain_region_visits'] = region_predictions_list

    return predictions

In [46]:
# --- EXAMPLE USAGE OF THE PREDICTION FUNCTION ---
print("\n--- EXAMPLE PREDICTION ---")
example_fecha = '2023-01-05'
example_titulo = 'Cerco a las viviendas turísticas'
example_categoria = 'Hoteles y Alojamientos'
example_subcategoria = 'Actualidad hotelera'

try:
    predicted_results = predict_news_visits(example_fecha, example_titulo, example_categoria, example_subcategoria)
    import json
    print(json.dumps(predicted_results, indent=4))
except ValueError as e:
    print(f"Prediction Error: {e}")


--- EXAMPLE PREDICTION ---
{
    "country_visits": [
        {
            "country": "Argentina",
            "predicted_visits": 4
        },
        {
            "country": "Australia",
            "predicted_visits": 3
        },
        {
            "country": "Belgium",
            "predicted_visits": 3
        },
        {
            "country": "Canada",
            "predicted_visits": 3
        },
        {
            "country": "Chile",
            "predicted_visits": 3
        },
        {
            "country": "Colombia",
            "predicted_visits": 3
        },
        {
            "country": "Cuba",
            "predicted_visits": 3
        },
        {
            "country": "Czechia",
            "predicted_visits": 3
        },
        {
            "country": "El Salvador",
            "predicted_visits": 3
        },
        {
            "country": "France",
            "predicted_visits": 3
        },
        {
            "country": "Germany",
          

In [47]:
# Example with a potentially problematic date
print("\n--- EXAMPLE PREDICTION (with invalid date) ---")
example_fecha_invalid = '2023-XX-05'
try:
    predicted_results_invalid = predict_news_visits(example_fecha_invalid, example_titulo, example_categoria, example_subcategoria)
    import json
    print(json.dumps(predicted_results_invalid, indent=4))
except ValueError as e:
    print(f"Prediction Error: {e}")


--- EXAMPLE PREDICTION (with invalid date) ---
Prediction Error: Formato de fecha no valido para la predicción.


despues de hacer modelo, el rdo no es concluyendo ya que hay un problema en el dato. se tiene qe recoger mejor los datos igual que se ha ido explicando en los apartados anteiores. 