In [5]:
import json
from collections import defaultdict, Counter
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
json_path = "../data/data/IPBLN/json/gesbib_authors_ipbln_info.json"

# Cargar JSON
with open(json_path, encoding="utf-8") as f:
    author_data = json.load(f)

counter = 0
for id, author_info in author_data.items():
    if author_info["numPubs"] == 0:
        counter += 1
print(f"Autores sin publicaciones: {counter}")

# Crear estructura para almacenar datos
rows = []
for author_id, data in author_data.items():
    materias = defaultdict(int)
    
    # Añadir materias JCR
    for m, v in data.get("materiasJcr", {}).items():
        materias[m.lower().strip()] += v
        
    # Añadir materias CS
    for m, v in data.get("materiasCs", {}).items():
        materias[m.lower().strip()] += v

    # Crear fila
    row = {"author_id": author_id}
    row.update(materias)
    rows.append(row)

# Crear DataFrame
df = pd.DataFrame(rows)
df = df.fillna(0)  # reemplazar NaNs con 0
# Mostrar resumen
print(f"Autores: {df.shape[0]} | Materias distintas: {df.shape[1] - 1}")
df.head()

Autores sin publicaciones: 25
Autores: 724 | Materias distintas: 335


Unnamed: 0,author_id,"chemistry, organic","chemistry, medicinal","chemistry, multidisciplinary",materials chemistry,molecular biology,pharmaceutical science,general medicine,clinical biochemistry,metals and alloys,...,regional & urban planning,"management, monitoring, policy and law",nature and landscape conservation,nuclear science & technology,radiation,otorhinolaryngology,"medicine, legal",mathematical physics,small animals,urban studies
0,2078,3.0,2.0,4.0,1.0,1.0,1.0,4.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2105,3.0,0.0,6.0,2.0,2.0,5.0,7.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4243,1.0,0.0,1.0,0.0,29.0,1.0,38.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4244,7.0,4.0,4.0,0.0,34.0,3.0,67.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13157,6.0,6.0,82.0,50.0,4.0,11.0,28.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Descargar recursos necesarios (solo la primera vez)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Stopwords estándar + personalizadas
stop_words = set(stopwords.words('english')).union({
    "general", "miscellaneous", "misc", "applied", "special", "multidisciplinary",
    "science", "sciences", "studies", "engineering", "technology", "systems",
    "methods", "interdisciplinary", "other", "clinical", "medical"
})

# Lematizador
lemmatizer = WordNetLemmatizer()

def clean_subject_nltk(subject):
    subject = subject.lower()
    subject = re.sub(r"[^\w\s]", " ", subject)
    tokens = nltk.word_tokenize(subject)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return " ".join(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pablo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Suponiendo que df es tu DataFrame original y ya tiene una columna 'author_id'
df_original = df.copy()
columnas_materias = [col for col in df.columns if col != 'author_id']

# Paso 1: crear un mapeo columna original → nombre limpio
col_mapping = {
    original: clean_subject_nltk(original)
    for original in columnas_materias
}

# Paso 2: agrupar columnas que se limpian al mismo nombre
grupo_materias = defaultdict(list)
for original, cleaned in col_mapping.items():
    grupo_materias[cleaned].append(original)

# Paso 3: construir nuevo DataFrame con columnas fusionadas
df_fusionado = pd.DataFrame()
df_fusionado["author_id"] = df["author_id"]

for nombre_limpio, columnas in grupo_materias.items():
    df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)

print(f"✅ Columnas antes: {len(columnas_materias)} | Después de fusionar: {df_fusionado.shape[1] - 1}")

✅ Columnas antes: 335 | Después de fusionar: 280


  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] = df[columnas].sum(axis=1)
  df_fusionado[nombre_limpio] =

In [None]:
# # --- FILTRAR COLUMNAS ---
# min_authors = 2
# min_weight = 2

# # Calculamos cuántos autores usan cada materia
# col_author_counts = (df_cleaned.drop(columns='author_id') > 0).sum(axis=0)
# col_total_weights = df_cleaned.drop(columns='author_id').sum(axis=0)

# # Criterio combinado
# columns_to_keep = [
#     col for col in df_cleaned.columns if col == 'author_id' or (
#         col_author_counts.get(col, 0) >= min_authors and col_total_weights.get(col, 0) >= min_weight
#     )
# ]

# df_filtered = df_cleaned[columns_to_keep].copy()
# print(f"✅ Columnas tras filtrar: {df_filtered.shape[1]}")
# print(f"Diferencia de columnas: {df.shape[1] - df_filtered.shape[1]}")

✅ Columnas tras filtrar: 275
Diferencia de columnas: 61


In [9]:
# Separar ID y materias
author_ids = df_fusionado["author_id"]
df_materias = df_fusionado.drop(columns=["author_id"])

In [10]:
# Filas donde todas las materias son 0
mask_empty = df_materias.sum(axis=1) == 0

# Autores sin materias (para asignar "Unknown" después)
authors_unknown = author_ids[mask_empty].tolist()

# Filtrar autores válidos (que tienen al menos una materia)
df_valid = df_fusionado.loc[~mask_empty].reset_index(drop=True)

print(f"👉 Autores válidos: {df_valid.shape[0]} | Autores sin materias: {len(authors_unknown)}")

👉 Autores válidos: 608 | Autores sin materias: 116


In [6]:
# Crear un DataFrame para ellos con la etiqueta
import pandas as pd

df_unknown = pd.DataFrame({
    "author_id": authors_unknown,
    "assigned_topic": "Unknown"
})

## Comienzo del clustering por autores

In [21]:
vectors = df_valid.to_numpy()

In [56]:
from sklearn.preprocessing import StandardScaler

# Quitamos el author_id para el clustering
X = df_valid.drop(columns=["author_id"])

# Estandarizamos las columnas
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [81]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)  # puedes probar otros valores también
X_reduced = pca.fit_transform(X_scaled)

In [101]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# === GRID SEARCH MANUAL ===

k_range = range(3, 16)
results = []
print(f"▶ Probando PCA con 10 componentes...")

for k in k_range:
    kmeans = KMeans(
        n_clusters=k,
        init="k-means++",
        random_state=42
    )
    labels = kmeans.fit_predict(X_reduced)

    # Evaluación
    sil = silhouette_score(X_reduced, labels)
    ch = calinski_harabasz_score(X_reduced, labels)
    db = davies_bouldin_score(X_reduced, labels)

    results.append({
        "pca_components": 10,
        "k": k,
        "silhouette": sil,
        "calinski_harabasz": ch,
        "davies_bouldin": db,
        "model": kmeans,
        "labels": labels
    })

▶ Probando PCA con 10 componentes...


In [102]:
# === RESULTADOS COMO DATAFRAME ===
df_results = pd.DataFrame(results)

# Ordenar por Silhouette como criterio principal
df_sorted = df_results.sort_values(by="silhouette", ascending=False).reset_index(drop=True)

# Mostrar top 5
print(df_sorted.head(10))

# === MEJOR COMBINACIÓN ===
best_model = df_sorted.iloc[0]["model"]
best_labels = df_sorted.iloc[0]["labels"]
best_pca = df_sorted.iloc[0]["pca_components"]
best_k = df_sorted.iloc[0]["k"]
print(f"\n✅ Mejor combinación: PCA={best_pca}, k={best_k}, Silhouette={df_sorted.iloc[0]['silhouette']:.4f}, Calinski-Harabasz={df_sorted.iloc[0]['calinski_harabasz']:.4f}, Davies-Bouldin={df_sorted.iloc[0]['davies_bouldin']:.4f}")

   pca_components   k  silhouette  calinski_harabasz  davies_bouldin  \
0              10   4    0.929936         121.296522        0.716332   
1              10   3    0.928693         109.056594        0.786839   
2              10   9    0.850919         240.062038        0.476180   
3              10   8    0.850331         222.801679        0.618446   
4              10  11    0.838357         366.820586        0.364881   
5              10  12    0.837833         376.879188        0.235939   
6              10  10    0.835940         307.480700        0.524229   
7              10   7    0.830523         187.792156        0.616576   
8              10   6    0.829488         145.950858        0.855987   
9              10   5    0.824916         132.523498        0.912675   

                                    model  \
0   KMeans(n_clusters=4, random_state=42)   
1   KMeans(n_clusters=3, random_state=42)   
2   KMeans(n_clusters=9, random_state=42)   
3                 KMeans(ra

In [103]:
from collections import defaultdict
import numpy as np

# Supongamos que tienes tu dataframe de materias normalizadas llamado `df_final`
# con filas = autores y columnas = materias, y que tienes acceso a:
# df_sorted -> resultados del GridSearch con silhouette, CH, DB...

# === Recuperar los mejores modelos por k ===
labels_k9 = df_sorted[df_sorted["k"] == 4].iloc[0]["labels"]
labels_k12 = df_sorted[df_sorted["k"] == 12].iloc[0]["labels"]

# === Función para agrupar materias dominantes por cluster ===
def describe_clusters(labels, df_data, top_n=10):
    cluster_subjects = defaultdict(lambda: defaultdict(float))

    for idx, cluster_id in enumerate(labels):
        for subj in df_data.columns:
            weight = float(df_data.iloc[idx][subj])
            if weight > 0:
                cluster_subjects[cluster_id][subj] += weight

    # Mostrar top materias por cluster
    for cluster_id, subject_weights in cluster_subjects.items():
        sorted_subjects = sorted(subject_weights.items(), key=lambda x: x[1], reverse=True)
        top_subjects = [s for s, _ in sorted_subjects[:top_n]]
        print(f"\n🔹 Cluster {cluster_id} ({len(subject_weights)} materias):")
        print(", ".join(top_subjects))

# === Descripción para k = 9 ===
print("\n📊 Clusters para k = 9:")
describe_clusters(labels_k9, X)

# === Descripción para k = 12 ===
print("\n📊 Clusters para k = 12:")
describe_clusters(labels_k12, X)


📊 Clusters para k = 9:

🔹 Cluster 0 (263 materias):
medicine, immunology, biochemistry molecular biology, cell biology, molecular biology, biochemistry, genetics, rheumatology, immunology allergy, parasitology

🔹 Cluster 2 (120 materias):
material, chemistry, crystallography, condensed matter physic, inorganic chemistry, physical theoretical chemistry, biochemistry, structural biology, material chemistry, chemistry inorganic nuclear

🔹 Cluster 3 (40 materias):
geosciences, oceanography, paleontology, geology, geography physical, ecology evolution behavior systematics, global planetary change, archeology art humanity, geochemistry petrology, archeology

🔹 Cluster 1 (79 materias):
rheumatology, immunology, immunology allergy, medicine, genetics, biochemistry genetics molecular biology, genetics heredity, , cell biology, pathology

📊 Clusters para k = 12:

🔹 Cluster 0 (242 materias):
immunology, medicine, cell biology, biochemistry molecular biology, molecular biology, genetics, rheumato

Número de autores por cluster

In [104]:
from collections import Counter

# labels_k9 es el array de etiquetas asignado por el modelo
cluster_counts = Counter(labels_k9)

print("📊 Número de autores por cluster (k=9):")
for cluster_id, count in sorted(cluster_counts.items()):
    print(f"🔹 Cluster {cluster_id}: {count} autores")

📊 Número de autores por cluster (k=9):
🔹 Cluster 0: 603 autores
🔹 Cluster 1: 1 autores
🔹 Cluster 2: 3 autores
🔹 Cluster 3: 1 autores


### Nombramos los clusters

In [96]:
from collections import defaultdict, Counter

def get_dominant_subjects(labels, df_data, min_authors=1, min_weight=1, top_n=5):
    cluster_subjects = defaultdict(list)
    subject_counts = defaultdict(lambda: defaultdict(int))  # subject_counts[cluster][subj]
    subject_weights = defaultdict(lambda: defaultdict(float))

    for idx, cluster_id in enumerate(labels):
        row = df_data.iloc[idx]
        for subj, weight in row.items():
            if weight > 0:
                subject_counts[cluster_id][subj] += 1
                subject_weights[cluster_id][subj] += weight

    cluster_labels = {}
    for cluster_id in sorted(set(labels)):
        # Filtrar materias dominantes
        counts = subject_counts[cluster_id]
        weights = subject_weights[cluster_id]
        dominant = {
            subj: weights[subj]
            for subj in counts
            if counts[subj] >= min_authors and weights[subj] >= min_weight
        }

        # Ordenar y seleccionar top N
        sorted_dominant = sorted(dominant.items(), key=lambda x: x[1], reverse=True)
        top_subjects = [s for s, _ in sorted_dominant[:top_n]]
        cluster_labels[cluster_id] = top_subjects

        # Mostrar por pantalla
        print(f"\n🔹 Cluster {cluster_id}:")
        print(f"Materias dominantes: {', '.join(top_subjects)}")

    return cluster_labels


In [97]:
dominant_per_cluster = get_dominant_subjects(labels_k9, X)


🔹 Cluster 0:
Materias dominantes: immunology, medicine, biochemistry molecular biology, molecular biology, cell biology

🔹 Cluster 1:
Materias dominantes: rheumatology, immunology, immunology allergy, medicine, genetics

🔹 Cluster 2:
Materias dominantes: material, crystallography, chemistry, condensed matter physic, structural biology

🔹 Cluster 3:
Materias dominantes: geosciences, oceanography, paleontology, geology, geography physical

🔹 Cluster 4:
Materias dominantes: chemistry, inorganic chemistry, chemistry inorganic nuclear, physical theoretical chemistry, material

🔹 Cluster 5:
Materias dominantes: geology, geochemistry petrology, mineralogy, geochemistry geophysics, material

🔹 Cluster 6:
Materias dominantes: plant, agronomy crop, zoology, ecology evolution behavior systematics, horticulture

🔹 Cluster 7:
Materias dominantes: medicine, immunology, cell biology, biochemistry, biochemistry molecular biology

🔹 Cluster 8:
Materias dominantes: material, chemistry, condensed matter