In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np
import joblib
import os

# === Load the base dataset ===
# This cell builds one preprocessed feature matrix per
# (max_features, n_components) combination and writes it, together with the
# labels, to experiments/X_<max_features>_<n_components>/.
df = pd.read_csv("../data/datasets/publications_training_dataset.csv", index_col='gb_id')

# Text feature: title and keywords joined into a single string per row
# (missing values become empty strings first so the concatenation never yields NaN).
df['title'] = df['title'].fillna('')
df['keywords'] = df['keywords'].fillna('')
df['text'] = df['title'] + ' ' + df['keywords']

# Keep only the first ';'-separated entry of jcr_materias; a missing or empty
# value is mapped to the placeholder category 'unknown'.
df['jcr_materia_main'] = df['jcr_materias'].fillna('').apply(lambda x: x.split(';')[0] if x else 'unknown')

# Missing citation counts are treated as 0; log1p(citations) is the feature used below.
df['citations'] = df['citations'].fillna(0)
df['citations_log'] = np.log1p(df['citations'])

# NOTE(review): 'year' is used as a numeric feature below but is the only one
# that is not NaN-filled here; any missing year would reach the scaler as NaN.
# Confirm the column is complete, or pick a deliberate fill value.
for col in ['impact_factor', 'percentile', 'international_collab', 'num_countries', 'num_foreign_affils', 'num_spanish_affils']:
    df[col] = df[col].fillna(0)

# Labels: drop every class that has 3 or fewer rows, then take the target column.
label_counts = df['label'].value_counts()
df = df[df['label'].isin(label_counts[label_counts > 3].index)]
y = df['label'].copy()

# Feature columns
numeric_cols = ['citations_log', 'impact_factor', 'percentile', 'year',
                'international_collab', 'num_countries', 'num_foreign_affils', 'num_spanish_affils']

# The input frame does not depend on the grid parameters, so it is selected
# once here instead of on every iteration.
X = df[['text', 'language', 'jcr_materia_main'] + numeric_cols]

# The categorical and numeric branches are identical for every grid point, so
# their (unfitted) definitions are created once. ColumnTransformer fits clones
# of the transformers it is given, so sharing these templates across
# iterations does not carry fitted state from one grid point to the next.
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# === Experiment grid ===
max_features_list = [2000, 3000, 5000]
n_components_list = [500, 750, 850, 1000]

for max_f in max_features_list:
    for n_comp in n_components_list:
        print(f"\n🔍 Proceso: max_features={max_f}, n_components={n_comp}")

        # Only the text branch varies: TF-IDF vocabulary size and the number
        # of SVD components kept. random_state is fixed for reproducibility.
        text_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=max_f, stop_words='english')),
            ('svd', TruncatedSVD(n_components=n_comp, random_state=42))
        ])

        # Output column order is text components, one-hot categories, scaled numerics.
        preprocessor = ColumnTransformer(transformers=[
            ('text', text_pipeline, 'text'),
            ('cat', categorical_pipeline, ['language', 'jcr_materia_main']),
            ('num', numeric_pipeline, numeric_cols)
        ])

        # NOTE(review): the preprocessor is fitted on every row of df; if a
        # train/test split happens later, the test rows have already
        # influenced the TF-IDF/SVD/scaler statistics.
        X_processed = preprocessor.fit_transform(X)

        # The stacked result may come back as a sparse matrix (the one-hot
        # block is sparse); densify it so it can be written as a plain CSV.
        if hasattr(X_processed, 'toarray'):
            X_processed = X_processed.toarray()

        # Save the feature matrix and the labels, both indexed by gb_id
        dir_out = f"experiments/X_{max_f}_{n_comp}"
        os.makedirs(dir_out, exist_ok=True)
        pd.DataFrame(X_processed, index=df.index).to_csv(f"{dir_out}/X_ready.csv")
        y.to_csv(f"{dir_out}/y_ready.csv")
        print(f"✅ Guardado en {dir_out}")



🔍 Proceso: max_features=2000, n_components=500
✅ Guardado en experiments/X_2000_500

🔍 Proceso: max_features=2000, n_components=750
✅ Guardado en experiments/X_2000_750

🔍 Proceso: max_features=2000, n_components=850
✅ Guardado en experiments/X_2000_850

🔍 Proceso: max_features=2000, n_components=1000
✅ Guardado en experiments/X_2000_1000

🔍 Proceso: max_features=3000, n_components=500
✅ Guardado en experiments/X_3000_500

🔍 Proceso: max_features=3000, n_components=750
✅ Guardado en experiments/X_3000_750

🔍 Proceso: max_features=3000, n_components=850
✅ Guardado en experiments/X_3000_850

🔍 Proceso: max_features=3000, n_components=1000
✅ Guardado en experiments/X_3000_1000

🔍 Proceso: max_features=5000, n_components=500
✅ Guardado en experiments/X_5000_500

🔍 Proceso: max_features=5000, n_components=750
✅ Guardado en experiments/X_5000_750

🔍 Proceso: max_features=5000, n_components=850
✅ Guardado en experiments/X_5000_850

🔍 Proceso: max_features=5000, n_components=1000
✅ Guardado e