## Rentrenamiento a una version 9 del modelo de multiclasificacion mediante el algoritmo xgboost

In [1]:
import zipfile as zip

In [2]:
# CELDA 1: Imports y configuración inicial
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import glob
import pickle
import os
import zipfile as zip

print("🔧 CONFIGURACIÓN INICIAL COMPLETADA")
print("✅ Todas las librerías importadas correctamente")

🔧 CONFIGURACIÓN INICIAL COMPLETADA
✅ Todas las librerías importadas correctamente


In [3]:
# CELDA 2: Extraer dataset del zip
def extract_zip(zip_path, extract_to):
    with zip.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
        
# Extraer dataset
extract_zip('datos/archive.zip', 'datasets')
print("✅ Dataset extraído correctamente")

✅ Dataset extraído correctamente


In [4]:
# CELDA 3: Cargar y explorar datos
def load_and_explore_data():
    """Carga y explora el dataset"""
    print("📊 CARGANDO Y EXPLORANDO DATASET")
    print("="*50)
    
    # Cargar datos
    train_df = pd.read_csv('datasets/archive/Training.csv')
    test_df = pd.read_csv('datasets/archive/Testing.csv')
    
    print(f"📋 Training shape: {train_df.shape}")
    print(f"📋 Testing shape: {test_df.shape}")
    print(f"🏥 Enfermedades únicas: {train_df['prognosis'].nunique()}")
    print(f"🔢 Características: {len(train_df.columns) - 1}")
    
    # Mostrar primeras enfermedades
    diseases = train_df['prognosis'].unique()
    print(f"\n🏥 Primeras 10 enfermedades:")
    for i, disease in enumerate(diseases[:10], 1):
        print(f"  {i:2d}. {disease}")
    
    # Verificar balance de clases
    disease_counts = train_df['prognosis'].value_counts()
    print(f"\n📊 Balance de clases:")
    print(f"   Min samples: {disease_counts.min()}")
    print(f"   Max samples: {disease_counts.max()}")
    print(f"   Promedio: {disease_counts.mean():.1f}")
    
    return train_df, test_df

# Ejecutar exploración
train_df, test_df = load_and_explore_data()

📊 CARGANDO Y EXPLORANDO DATASET
📋 Training shape: (4920, 134)
📋 Testing shape: (42, 133)
🏥 Enfermedades únicas: 41
🔢 Características: 133

🏥 Primeras 10 enfermedades:
   1. Fungal infection
   2. Allergy
   3. GERD
   4. Chronic cholestasis
   5. Drug Reaction
   6. Peptic ulcer diseae
   7. AIDS
   8. Diabetes 
   9. Gastroenteritis
  10. Bronchial Asthma

📊 Balance de clases:
   Min samples: 120
   Max samples: 120
   Promedio: 120.0


In [5]:
# CELDA 4: Preparar datos para XGBoost v9
def prepare_data_v9(train_df, test_df):
    """Prepara datos específicamente para modelo v9"""
    print("\n🔧 PREPARANDO DATOS PARA MODELO V9")
    print("="*50)
    
    # Verificar columnas
    print("🔍 Verificando columnas...")
    print(f"📋 Training columns: {len(train_df.columns)}")
    print(f"📋 Testing columns: {len(test_df.columns)}")
    
    # Limpiar datos
    train_df_clean = train_df.copy()
    test_df_clean = test_df.copy()
    
    # Eliminar columnas 'Unnamed' si existen
    cols_to_drop = [col for col in train_df_clean.columns if 'Unnamed' in str(col)]
    if cols_to_drop:
        print(f"🗑️ Eliminando columnas problemáticas: {cols_to_drop}")
        train_df_clean = train_df_clean.drop(columns=cols_to_drop)
    
    cols_to_drop_test = [col for col in test_df_clean.columns if 'Unnamed' in str(col)]
    if cols_to_drop_test:
        test_df_clean = test_df_clean.drop(columns=cols_to_drop_test)
    
    # Obtener columnas comunes (características)
    common_cols = list(set(train_df_clean.columns) & set(test_df_clean.columns))
    
    # Separar características de target
    if 'prognosis' in common_cols:
        common_cols.remove('prognosis')
    
    feature_cols = sorted(common_cols)  # Ordenar para consistencia
    
    print(f"✅ Características comunes: {len(feature_cols)}")
    
    # Separar datos
    X_train = train_df_clean[feature_cols]
    y_train = train_df_clean['prognosis']
    X_test = test_df_clean[feature_cols]
    y_test = test_df_clean['prognosis']
    
    # Verificar que no hay valores NaN
    print(f"🔍 NaN en X_train: {X_train.isna().sum().sum()}")
    print(f"🔍 NaN en X_test: {X_test.isna().sum().sum()}")
    
    # Encoder para las enfermedades
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)
    
    print(f"✅ Features finales: {len(feature_cols)}")
    print(f"✅ Clases: {len(label_encoder.classes_)}")
    print(f"✅ Training samples: {len(X_train)}")
    print(f"✅ Testing samples: {len(X_test)}")
    
    return X_train, X_test, y_train_encoded, y_test_encoded, label_encoder, feature_cols

# Preparar datos
X_train, X_test, y_train_encoded, y_test_encoded, label_encoder, feature_cols = prepare_data_v9(train_df, test_df)


🔧 PREPARANDO DATOS PARA MODELO V9
🔍 Verificando columnas...
📋 Training columns: 134
📋 Testing columns: 133
🗑️ Eliminando columnas problemáticas: ['Unnamed: 133']
✅ Características comunes: 132
🔍 NaN en X_train: 0
🔍 NaN en X_test: 0
✅ Features finales: 132
✅ Clases: 41
✅ Training samples: 4920
✅ Testing samples: 42


In [6]:
# CELDA 5: Entrenar modelo XGBoost v9
def train_xgboost_v9(X_train, y_train):
    """Entrena modelo XGBoost optimizado para v9"""
    print("\n🚀 ENTRENANDO XGBOOST V9")
    print("="*50)
    
    # Configuración optimizada para diagnóstico médico
    xgb_model = XGBClassifier(
        objective='multi:softprob',  # Multiclase con probabilidades
        n_estimators=300,            # Más árboles para mejor precisión
        max_depth=8,                 # Profundidad moderada
        learning_rate=0.1,           # Tasa de aprendizaje estándar
        subsample=0.8,               # Submuestreo para evitar overfitting
        colsample_bytree=0.8,        # Submuestreo de características
        random_state=42,             # Reproducibilidad
        n_jobs=-1,                   # Usar todos los cores
        eval_metric='mlogloss'       # Métrica para multiclase
    )
    
    print("🔄 Entrenando modelo...")
    xgb_model.fit(X_train, y_train)
    print("✅ Entrenamiento completado!")
    
    return xgb_model

# Entrenar modelo
xgb_model_v9 = train_xgboost_v9(X_train, y_train_encoded)


🚀 ENTRENANDO XGBOOST V9
🔄 Entrenando modelo...
✅ Entrenamiento completado!


In [7]:
# CELDA 6: Evaluar modelo v9
def evaluate_model_v9(model, X_test, y_test, label_encoder):
    """Evalúa el modelo v9"""
    print("\n📊 EVALUANDO MODELO V9")
    print("="*50)
    
    # Predicciones
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"🎯 Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    
    # Convertir a nombres de enfermedades
    y_test_names = label_encoder.inverse_transform(y_test)
    y_pred_names = label_encoder.inverse_transform(y_pred)
    
    # Reporte detallado (solo las primeras líneas para no saturar)
    print("\n📋 Estadísticas por clase:")
    report = classification_report(y_test_names, y_pred_names, output_dict=True)
    print(f"   Precision promedio: {report['macro avg']['precision']:.4f}")
    print(f"   Recall promedio: {report['macro avg']['recall']:.4f}")
    print(f"   F1-score promedio: {report['macro avg']['f1-score']:.4f}")
    
    # Ejemplos de predicciones con confianza
    print("\n🔝 Ejemplos de predicciones con confianza:")
    for i in range(min(5, len(X_test))):
        top_3_idx = np.argsort(y_pred_proba[i])[-3:][::-1]
        print(f"\nMuestra {i+1}:")
        print(f"  Real: {y_test_names[i]}")
        print(f"  Predicho: {y_pred_names[i]} ({y_pred_proba[i][y_pred[i]]:.3f})")
        print(f"  Top 3:")
        for j, idx in enumerate(top_3_idx, 1):
            disease = label_encoder.inverse_transform([idx])[0]
            prob = y_pred_proba[i][idx]
            print(f"    {j}. {disease} ({prob:.3f})")
    
    return accuracy, y_pred, y_pred_proba

# Evaluar modelo
accuracy_v9, predictions_v9, probabilities_v9 = evaluate_model_v9(xgb_model_v9, X_test, y_test_encoded, label_encoder)


📊 EVALUANDO MODELO V9
🎯 Accuracy: 0.9762 (97.62%)

📋 Estadísticas por clase:
   Precision promedio: 0.9878
   Recall promedio: 0.9878
   F1-score promedio: 0.9837

🔝 Ejemplos de predicciones con confianza:

Muestra 1:
  Real: Fungal infection
  Predicho: Fungal infection (0.997)
  Top 3:
    1. Fungal infection (0.997)
    2. Drug Reaction (0.000)
    3. Chronic cholestasis (0.000)

Muestra 2:
  Real: Allergy
  Predicho: Allergy (0.997)
  Top 3:
    1. Allergy (0.997)
    2. Typhoid (0.000)
    3. Malaria (0.000)

Muestra 3:
  Real: GERD
  Predicho: GERD (0.997)
  Top 3:
    1. GERD (0.997)
    2. Drug Reaction (0.000)
    3. Migraine (0.000)

Muestra 4:
  Real: Chronic cholestasis
  Predicho: Chronic cholestasis (0.999)
  Top 3:
    1. Chronic cholestasis (0.999)
    2. Hepatitis C (0.000)
    3. Jaundice (0.000)

Muestra 5:
  Real: Drug Reaction
  Predicho: Drug Reaction (0.998)
  Top 3:
    1. Drug Reaction (0.998)
    2. Fungal infection (0.000)
    3. Urinary tract infection (0.0

In [8]:
# CELDA 7: Función de prueba rápida
def test_prediction_v9(model, label_encoder, feature_cols):
    """Prueba rápida de predicción"""
    print("\n🧪 PRUEBA DE PREDICCIÓN V9")
    print("="*50)
    
    # Crear síntomas de ejemplo para infección por hongos
    example_symptoms = {
        'itching': 1,
        'skin_rash': 1, 
        'nodal_skin_eruptions': 1,
        'dischromic_patches': 1
    }
    
    # Crear vector de características
    feature_vector = np.zeros(len(feature_cols))
    active_symptoms = []
    
    for symptom, value in example_symptoms.items():
        if symptom in feature_cols:
            idx = feature_cols.index(symptom)
            feature_vector[idx] = value
            active_symptoms.append(symptom)
    
    # Predicción
    feature_vector = feature_vector.reshape(1, -1)
    prediction = model.predict(feature_vector)[0]
    probabilities = model.predict_proba(feature_vector)[0]
    
    # Resultados
    predicted_disease = label_encoder.inverse_transform([prediction])[0]
    confidence = probabilities[prediction]
    
    print(f"🩺 Síntomas activos: {', '.join(active_symptoms)}")
    print(f"🎯 Predicción: {predicted_disease}")
    print(f"📊 Confianza: {confidence:.4f} ({confidence*100:.2f}%)")
    
    # Top 5 predicciones
    top_5_idx = np.argsort(probabilities)[-5:][::-1]
    print(f"\n🏆 Top 5 predicciones:")
    for i, idx in enumerate(top_5_idx, 1):
        disease = label_encoder.inverse_transform([idx])[0]
        prob = probabilities[idx]
        print(f"  {i}. {disease}: {prob:.4f} ({prob*100:.2f}%)")

# Ejecutar prueba
test_prediction_v9(xgb_model_v9, label_encoder, feature_cols)


🧪 PRUEBA DE PREDICCIÓN V9
🩺 Síntomas activos: itching, skin_rash, nodal_skin_eruptions
🎯 Predicción: Fungal infection
📊 Confianza: 0.9842 (98.42%)

🏆 Top 5 predicciones:
  1. Fungal infection: 0.9842 (98.42%)
  2. Drug Reaction: 0.0008 (0.08%)
  3. Chronic cholestasis: 0.0008 (0.08%)
  4. Acne: 0.0007 (0.07%)
  5. Impetigo: 0.0006 (0.06%)


In [10]:
# CELDA 8: Guardar modelo v9 para backend
def save_model_v9_final():
    """Guardar modelo v9 final para backend"""
    print("\n💾 GUARDANDO MODELO V9 FINAL PARA BACKEND")
    print("="*50)
    
    # Crear directorios necesarios
    backend_dir = '../../../Backend/models'
    local_dir = 'models'
    
    os.makedirs(backend_dir, exist_ok=True)
    os.makedirs(local_dir, exist_ok=True)
    
    # Guardar modelo v9
    print("🔄 Guardando modelo XGBoost v9...")
    model_filename = 'modelo_diagnostico_v9_final.pkl'
    model_path = os.path.join(backend_dir, model_filename)
    joblib.dump(xgb_model_v9, model_path)
    
    # También guardar copia local
    local_model_path = os.path.join(local_dir, model_filename)
    joblib.dump(xgb_model_v9, local_model_path)
    
    # Guardar preprocesadores v9
    print("🔄 Guardando preprocesadores v9...")
    preprocessors_v9 = {
        'diagnosis_encoder': label_encoder,
        'feature_columns': feature_cols,
        'model_info': {
            'version': 'v9_final',
            'algorithm': 'XGBoost',
            'accuracy': accuracy_v9,
            'features_count': len(feature_cols),
            'classes_count': len(label_encoder.classes_),
            'trained_date': datetime.now().isoformat(),
            'input_type': 'symptom_binary_features',
            'diseases': list(label_encoder.classes_)
        }
    }
    
    prep_filename = 'preprocesadores_v9_final.pkl'
    prep_path = os.path.join(backend_dir, prep_filename)
    joblib.dump(preprocessors_v9, prep_path)
    
    # También guardar copia local
    local_prep_path = os.path.join(local_dir, prep_filename)
    joblib.dump(preprocessors_v9, local_prep_path)
    
    print(f"✅ MODELO V9 GUARDADO EXITOSAMENTE:")
    print(f"   📁 Backend: {backend_dir}")
    print(f"   📁 Local: {local_dir}")
    print(f"   🎯 Accuracy: {accuracy_v9:.4f} ({accuracy_v9*100:.2f}%)")
    print(f"   🏥 Enfermedades: {len(label_encoder.classes_)}")
    print(f"   🔢 Síntomas: {len(feature_cols)}")
    
    # Mostrar enfermedades detectables
    print(f"\n🏥 ENFERMEDADES DETECTABLES:")
    diseases = list(label_encoder.classes_)
    for i, disease in enumerate(diseases[:15], 1):
        print(f"   {i:2d}. {disease}")
    if len(diseases) > 15:
        print(f"   ... y {len(diseases)-15} más")
    
    # Mostrar síntomas detectables
    print(f"\n🩺 SÍNTOMAS DETECTABLES (primeros 15):")
    for i, symptom in enumerate(feature_cols[:15], 1):
        print(f"   {i:2d}. {symptom}")
    if len(feature_cols) > 15:
        print(f"   ... y {len(feature_cols)-15} más")
    
    return {
        'model_path': model_path,
        'preprocessors_path': prep_path,
        'accuracy': accuracy_v9,
        'diseases_count': len(label_encoder.classes_),
        'symptoms_count': len(feature_cols)
    }

# Ejecutar guardado final
print("🚀 GUARDANDO MODELO V9 FINAL...")
model_info = save_model_v9_final()

🚀 GUARDANDO MODELO V9 FINAL...

💾 GUARDANDO MODELO V9 FINAL PARA BACKEND
🔄 Guardando modelo XGBoost v9...
🔄 Guardando preprocesadores v9...
✅ MODELO V9 GUARDADO EXITOSAMENTE:
   📁 Backend: ../../../Backend/models
   📁 Local: models
   🎯 Accuracy: 0.9762 (97.62%)
   🏥 Enfermedades: 41
   🔢 Síntomas: 132

🏥 ENFERMEDADES DETECTABLES:
    1. (vertigo) Paroymsal  Positional Vertigo
    2. AIDS
    3. Acne
    4. Alcoholic hepatitis
    5. Allergy
    6. Arthritis
    7. Bronchial Asthma
    8. Cervical spondylosis
    9. Chicken pox
   10. Chronic cholestasis
   11. Common Cold
   12. Dengue
   13. Diabetes 
   14. Dimorphic hemmorhoids(piles)
   15. Drug Reaction
   ... y 26 más

🩺 SÍNTOMAS DETECTABLES (primeros 15):
    1. abdominal_pain
    2. abnormal_menstruation
    3. acidity
    4. acute_liver_failure
    5. altered_sensorium
    6. anxiety
    7. back_pain
    8. belly_pain
    9. blackheads
   10. bladder_discomfort
   11. blister
   12. blood_in_sputum
   13. bloody_stool
   14. 

In [11]:
# CELDA 9: Prueba final con casos específicos
def test_final_v9_model():
    """Probar el modelo v9 final con casos realistas"""
    print("\n🧪 PROBANDO MODELO V9 FINAL CON CASOS ESPECÍFICOS")
    print("="*60)
    
    # Casos de prueba con síntomas binarios
    test_cases = [
        {
            "name": "Infección por hongos",
            "symptoms": {
                'itching': 1,
                'skin_rash': 1,
                'nodal_skin_eruptions': 1,
                'dischromic_patches': 1
            },
            "expected": "Fungal infection"
        },
        {
            "name": "Diabetes",
            "symptoms": {
                'excessive_hunger': 1,
                'polyuria': 1,
                'weight_loss': 1,
                'fatigue': 1
            },
            "expected": "Diabetes"
        },
        {
            "name": "Hipertensión",
            "symptoms": {
                'headache': 1,
                'dizziness': 1,
                'chest_pain': 1,
                'palpitations': 1
            },
            "expected": "Hypertension"
        },
        {
            "name": "Problemas respiratorios",
            "symptoms": {
                'cough': 1,
                'chest_pain': 1,
                'breathlessness': 1,
                'phlegm': 1
            },
            "expected": "Bronchial Asthma"
        },
        {
            "name": "Problemas digestivos",
            "symptoms": {
                'abdominal_pain': 1,
                'nausea': 1,
                'vomiting': 1,
                'loss_of_appetite': 1
            },
            "expected": "Gastroenteritis"
        }
    ]
    
    print(f"🔍 EVALUANDO {len(test_cases)} CASOS CON SÍNTOMAS BINARIOS:")
    
    correct_predictions = 0
    total_cases = len(test_cases)
    
    for i, case in enumerate(test_cases, 1):
        print(f"\n--- CASO {i}: {case['name']} ---")
        
        try:
            # Crear vector de características
            feature_vector = np.zeros(len(feature_cols))
            active_symptoms = []
            
            # Llenar vector con síntomas del caso
            for symptom, value in case['symptoms'].items():
                if symptom in feature_cols and value == 1:
                    idx = feature_cols.index(symptom)
                    feature_vector[idx] = 1
                    active_symptoms.append(symptom)
            
            # Hacer predicción
            feature_vector = feature_vector.reshape(1, -1)
            probabilities = xgb_model_v9.predict_proba(feature_vector)[0]
            prediction_idx = np.argmax(probabilities)
            
            predicted_disease = label_encoder.inverse_transform([prediction_idx])[0]
            confidence = probabilities[prediction_idx] * 100
            
            # Verificar si es correcta
            is_correct = predicted_disease == case['expected']
            if is_correct:
                correct_predictions += 1
                status = "✅ CORRECTO"
            else:
                status = "❌ INCORRECTO"
            
            print(f"🩺 Síntomas activos: {', '.join(active_symptoms)}")
            print(f"🎯 Esperado: {case['expected']}")
            print(f"🤖 Predicho: {predicted_disease} ({confidence:.2f}%) {status}")
            
            # Top 3 predicciones
            top_3_indices = np.argsort(probabilities)[-3:][::-1]
            print(f"🏆 Top 3 predicciones:")
            
            for j, idx in enumerate(top_3_indices, 1):
                disease = label_encoder.inverse_transform([idx])[0]
                conf = probabilities[idx] * 100
                marker = "👑" if disease == case['expected'] else "  "
                print(f"   {marker}{j}. {disease}: {conf:.2f}%")
                
        except Exception as e:
            print(f"❌ Error en predicción: {e}")
    
    # Estadísticas finales
    accuracy_test = correct_predictions / total_cases
    print(f"\n📊 ESTADÍSTICAS FINALES DEL MODELO V9:")
    print(f"   🎯 Precisión en casos específicos: {correct_predictions}/{total_cases} ({accuracy_test*100:.1f}%)")
    print(f"   🤖 Precisión general del modelo: {accuracy_v9*100:.2f}%")
    print(f"   🏥 Enfermedades detectables: {len(label_encoder.classes_)}")
    print(f"   🩺 Síntomas analizables: {len(feature_cols)}")
    
    return accuracy_test

# Ejecutar prueba final
final_accuracy = test_final_v9_model()


🧪 PROBANDO MODELO V9 FINAL CON CASOS ESPECÍFICOS
🔍 EVALUANDO 5 CASOS CON SÍNTOMAS BINARIOS:

--- CASO 1: Infección por hongos ---
🩺 Síntomas activos: itching, skin_rash, nodal_skin_eruptions
🎯 Esperado: Fungal infection
🤖 Predicho: Fungal infection (98.42%) ✅ CORRECTO
🏆 Top 3 predicciones:
   👑1. Fungal infection: 98.42%
     2. Drug Reaction: 0.08%
     3. Chronic cholestasis: 0.08%

--- CASO 2: Diabetes ---
🩺 Síntomas activos: excessive_hunger, polyuria, weight_loss, fatigue
🎯 Esperado: Diabetes
🤖 Predicho: Diabetes  (11.90%) ❌ INCORRECTO
🏆 Top 3 predicciones:
     1. Diabetes : 11.90%
     2. Jaundice: 3.65%
     3. AIDS: 2.70%

--- CASO 3: Hipertensión ---
🩺 Síntomas activos: headache, dizziness, chest_pain, palpitations
🎯 Esperado: Hypertension
🤖 Predicho: Hypoglycemia (94.70%) ❌ INCORRECTO
🏆 Top 3 predicciones:
     1. Hypoglycemia: 94.70%
     2. Hypertension : 3.89%
     3. Heart attack: 0.17%

--- CASO 4: Problemas respiratorios ---
🩺 Síntomas activos: cough, chest_pain, brea

In [12]:
# CELDA 10: Resumen final
print(f"\n🎉 MODELO V9 COMPLETADO EXITOSAMENTE!")
print("="*60)
print(f"✅ Modelo entrenado y guardado")
print(f"✅ Precisión general: {accuracy_v9*100:.2f}%")
print(f"✅ Enfermedades detectables: {len(label_encoder.classes_)}")
print(f"✅ Síntomas analizables: {len(feature_cols)}")
print(f"✅ Archivos guardados en Backend/models/")
print(f"\n📁 ARCHIVOS GENERADOS:")
print(f"   • modelo_diagnostico_v9_final.pkl")
print(f"   • preprocesadores_v9_final.pkl")
print(f"\n🔧 CARACTERÍSTICAS DEL MODELO:")
print(f"   • Algoritmo: XGBoost")
print(f"   • Entrada: Síntomas binarios (0/1)")
print(f"   • Salida: Diagnóstico + confianza")
print(f"   • Optimizado para backend")
print(f"\n🚀 LISTO PARA INTEGRAR EN EL BACKEND!")


🎉 MODELO V9 COMPLETADO EXITOSAMENTE!
✅ Modelo entrenado y guardado
✅ Precisión general: 97.62%
✅ Enfermedades detectables: 41
✅ Síntomas analizables: 132
✅ Archivos guardados en Backend/models/

📁 ARCHIVOS GENERADOS:
   • modelo_diagnostico_v9_final.pkl
   • preprocesadores_v9_final.pkl

🔧 CARACTERÍSTICAS DEL MODELO:
   • Algoritmo: XGBoost
   • Entrada: Síntomas binarios (0/1)
   • Salida: Diagnóstico + confianza
   • Optimizado para backend

🚀 LISTO PARA INTEGRAR EN EL BACKEND!


In [13]:
# CELDA 11: CONVERTIR DATOS V9 A FORMATO V8
def convert_v9_to_v8_format():
    """Convertir datos binarios v9 a texto descriptivo para v8"""
    print("\n🔄 CONVIRTIENDO DATOS V9 A FORMATO V8")
    print("="*50)
    
    # Cargar datos v9
    train_df_v9 = pd.read_csv('datasets/archive/Training.csv')
    test_df_v9 = pd.read_csv('datasets/archive/Testing.csv')
    
    # Función para convertir síntomas binarios a texto
    def symptoms_to_text(row):
        symptoms_text = []
        
        # Mapeo de características binarias a texto descriptivo
        symptom_descriptions = {
            'itching': 'patient experiences itching and skin irritation',
            'skin_rash': 'patient has skin rash and eruptions',
            'nodal_skin_eruptions': 'patient presents nodular skin eruptions',
            'continuous_sneezing': 'patient has continuous sneezing episodes',
            'shivering': 'patient experiences shivering and chills',
            'chills': 'patient reports chills and cold sensations',
            'joint_pain': 'patient complains of joint pain and stiffness',
            'stomach_pain': 'patient has abdominal and stomach pain',
            'acidity': 'patient experiences acidity and heartburn',
            'ulcers_on_tongue': 'patient has ulcers on tongue',
            'muscle_wasting': 'patient shows muscle wasting and weakness',
            'vomiting': 'patient experiences vomiting and nausea',
            'burning_micturition': 'patient has burning sensation during urination',
            'spotting_urination': 'patient has blood spots in urine',
            'fatigue': 'patient reports severe fatigue and tiredness',
            'weight_gain': 'patient has unexplained weight gain',
            'anxiety': 'patient experiences anxiety and nervousness',
            'cold_hands_and_feets': 'patient has cold hands and feet',
            'mood_swings': 'patient experiences mood swings',
            'weight_loss': 'patient has unexplained weight loss',
            'restlessness': 'patient feels restless and agitated',
            'lethargy': 'patient experiences lethargy and drowsiness',
            'patches_in_throat': 'patient has patches in throat',
            'irregular_sugar_level': 'patient has irregular blood sugar levels',
            'cough': 'patient has persistent cough',
            'high_fever': 'patient experiences high fever',
            'sunken_eyes': 'patient has sunken eyes appearance',
            'breathlessness': 'patient experiences shortness of breath',
            'sweating': 'patient has excessive sweating',
            'dehydration': 'patient shows signs of dehydration',
            'indigestion': 'patient has indigestion problems',
            'headache': 'patient complains of severe headache',
            'yellowish_skin': 'patient has yellowish skin discoloration',
            'dark_urine': 'patient has dark colored urine',
            'nausea': 'patient experiences nausea',
            'loss_of_appetite': 'patient has loss of appetite',
            'pain_behind_the_eyes': 'patient has pain behind the eyes',
            'back_pain': 'patient experiences back pain',
            'constipation': 'patient has constipation issues',
            'abdominal_pain': 'patient has abdominal pain',
            'diarrhoea': 'patient experiences diarrhea',
            'mild_fever': 'patient has mild fever',
            'yellow_urine': 'patient has yellow colored urine',
            'yellowing_of_eyes': 'patient shows yellowing of eyes',
            'acute_liver_failure': 'patient has acute liver failure symptoms',
            'fluid_overload': 'patient experiences fluid overload',
            'swelling_of_stomach': 'patient has stomach swelling',
            'swelled_lymph_nodes': 'patient has swollen lymph nodes',
            'malaise': 'patient feels general malaise',
            'blurred_and_distorted_vision': 'patient has blurred vision',
            'phlegm': 'patient produces phlegm',
            'throat_irritation': 'patient has throat irritation',
            'redness_of_eyes': 'patient has red eyes',
            'sinus_pressure': 'patient experiences sinus pressure',
            'runny_nose': 'patient has runny nose',
            'congestion': 'patient has nasal congestion',
            'chest_pain': 'patient experiences chest pain',
            'weakness_in_limbs': 'patient has weakness in limbs',
            'fast_heart_rate': 'patient has rapid heart rate',
            'pain_during_bowel_movements': 'patient has pain during bowel movements',
            'pain_in_anal_region': 'patient has anal pain',
            'bloody_stool': 'patient has blood in stool',
            'irritation_in_anus': 'patient has anal irritation',
            'neck_pain': 'patient experiences neck pain',
            'dizziness': 'patient feels dizzy',
            'cramps': 'patient has muscle cramps',
            'bruising': 'patient shows bruising',
            'obesity': 'patient is obese',
            'swollen_legs': 'patient has swollen legs',
            'swollen_blood_vessels': 'patient has swollen blood vessels',
            'puffy_face_and_eyes': 'patient has puffy face and eyes',
            'enlarged_thyroid': 'patient has enlarged thyroid',
            'brittle_nails': 'patient has brittle nails',
            'swollen_extremeties': 'patient has swollen extremities',
            'excessive_hunger': 'patient experiences excessive hunger',
            'extra_marital_contacts': 'patient has history of risky contacts',
            'drying_and_tingling_lips': 'patient has dry and tingling lips',
            'slurred_speech': 'patient has slurred speech',
            'knee_pain': 'patient has knee pain',
            'hip_joint_pain': 'patient has hip joint pain',
            'muscle_weakness': 'patient experiences muscle weakness',
            'stiff_neck': 'patient has stiff neck',
            'swelling_joints': 'patient has swollen joints',
            'movement_stiffness': 'patient has movement stiffness',
            'spinning_movements': 'patient experiences spinning sensations',
            'loss_of_balance': 'patient has loss of balance',
            'unsteadiness': 'patient feels unsteady',
            'weakness_of_one_body_side': 'patient has weakness on one body side',
            'loss_of_smell': 'patient has loss of smell',
            'bladder_discomfort': 'patient has bladder discomfort',
            'foul_smell_of_urine': 'patient has foul smelling urine',
            'continuous_feel_of_urine': 'patient feels continuous urge to urinate',
            'passage_of_gases': 'patient has excessive gas',
            'internal_itching': 'patient has internal itching sensation',
            'toxic_look_typhos': 'patient has toxic appearance',
            'depression': 'patient shows signs of depression',
            'irritability': 'patient is irritable',
            'muscle_pain': 'patient has muscle pain',
            'altered_sensorium': 'patient has altered consciousness',
            'red_spots_over_body': 'patient has red spots on body',
            'belly_pain': 'patient has belly pain',
            'abnormal_menstruation': 'patient has abnormal menstruation',
            'dischromic_patches': 'patient has skin discoloration patches',
            'watering_from_eyes': 'patient has watery eyes',
            'increased_appetite': 'patient has increased appetite',
            'polyuria': 'patient has frequent urination',
            'family_history': 'patient has relevant family history',
            'mucoid_sputum': 'patient produces mucoid sputum',
            'rusty_sputum': 'patient has rusty colored sputum',
            'lack_of_concentration': 'patient has difficulty concentrating',
            'visual_disturbances': 'patient has visual disturbances',
            'receiving_blood_transfusion': 'patient received blood transfusion',
            'receiving_unsterile_injections': 'patient received unsterile injections',
            'coma': 'patient is in comatose state',
            'stomach_bleeding': 'patient has stomach bleeding',
            'distention_of_abdomen': 'patient has abdominal distention',
            'history_of_alcohol_consumption': 'patient has history of alcohol consumption',
            'blood_in_sputum': 'patient has blood in sputum',
            'prominent_veins_on_calf': 'patient has prominent veins on calf',
            'palpitations': 'patient experiences palpitations',
            'painful_walking': 'patient has painful walking',
            'pus_filled_pimples': 'patient has pus filled pimples',
            'blackheads': 'patient has blackheads',
            'scurring': 'patient has scarring',
            'skin_peeling': 'patient has skin peeling',
            'silver_like_dusting': 'patient has silver-like skin dusting',
            'small_dents_in_nails': 'patient has small dents in nails',
            'inflammatory_nails': 'patient has inflammatory nails',
            'blister': 'patient has blisters',
            'red_sore_around_nose': 'patient has red sore around nose',
            'yellow_crust_ooze': 'patient has yellow crust ooze'
        }
        
        # Convertir síntomas binarios a texto
        for symptom, description in symptom_descriptions.items():
            if symptom in row.index and row[symptom] == 1:
                symptoms_text.append(description)
        
        return '. '.join(symptoms_text) if symptoms_text else 'patient presents with general symptoms'
    
    # Convertir datos
    print("🔄 Convirtiendo datos de entrenamiento...")
    train_df_converted = train_df_v9.copy()
    train_df_converted['findings_text'] = train_df_v9.apply(symptoms_to_text, axis=1)
    train_df_converted['age_range'] = '21-40'  # Rango por defecto
    train_df_converted['gender'] = 'Unknown'   # Género por defecto
    
    print("🔄 Convirtiendo datos de prueba...")
    test_df_converted = test_df_v9.copy()
    test_df_converted['findings_text'] = test_df_v9.apply(symptoms_to_text, axis=1)
    test_df_converted['age_range'] = '21-40'
    test_df_converted['gender'] = 'Unknown'
    
    print(f"✅ Convertidos {len(train_df_converted)} casos de entrenamiento")
    print(f"✅ Convertidos {len(test_df_converted)} casos de prueba")
    
    return train_df_converted, test_df_converted

# Convertir datos
train_converted, test_converted = convert_v9_to_v8_format()

# Mostrar ejemplo
print(f"\n📋 EJEMPLO DE CONVERSIÓN:")
print(f"Enfermedad: {train_converted.iloc[0]['prognosis']}")
print(f"Texto generado: {train_converted.iloc[0]['findings_text'][:200]}...")


🔄 CONVIRTIENDO DATOS V9 A FORMATO V8
🔄 Convirtiendo datos de entrenamiento...
🔄 Convirtiendo datos de prueba...
✅ Convertidos 4920 casos de entrenamiento
✅ Convertidos 42 casos de prueba

📋 EJEMPLO DE CONVERSIÓN:
Enfermedad: Fungal infection
Texto generado: patient experiences itching and skin irritation. patient has skin rash and eruptions. patient presents nodular skin eruptions...


In [None]:
# CELDA 12: REENTRENAR MODELO V8 CON DATOS V9 (VERSIÓN MEJORADA)
def train_enhanced_v8_model_improved():
    """Reentrenar modelo v8 con datos mejorados de v9 - Versión corregida"""
    print("\n🚀 ENTRENANDO MODELO V8 MEJORADO (VERSIÓN CORREGIDA)")
    print("="*60)
    
    # Preparar datos combinados
    X_text = train_converted['findings_text']
    X_age = train_converted['age_range'] 
    X_gender = train_converted['gender']
    y = train_converted['prognosis']
    
    # Función de preprocesamiento mejorada
    def preprocess_text_improved(text):
        """Preprocesamiento mejorado de texto médico"""
        import re
        from nltk.corpus import stopwords
        
        if pd.isna(text) or text == '':
            return 'patient presents with general symptoms'
        
        text = str(text).lower()
        
        # Preservar términos médicos importantes
        medical_terms = ['patient', 'experiences', 'has', 'shows', 'reports', 'complains', 
                        'presents', 'symptoms', 'pain', 'fever', 'headache', 'nausea']
        
        # Remover puntuación pero preservar espacios
        text = re.sub(r'[^\w\s]', ' ', text)
        
        # Remover números
        text = re.sub(r'\d+', '', text)
        
        # Remover palabras muy cortas (menos de 3 caracteres) excepto términos médicos
        words = text.split()
        words = [word for word in words if len(word) >= 3 or word in medical_terms]
        
        text = ' '.join(words)
        
        # Si el texto queda muy corto, usar texto por defecto
        if len(text.split()) < 3:
            text = 'patient presents with general symptoms'
            
        return text
    
    # Preprocesar texto
    print("🔄 Preprocesando texto con método mejorado...")
    X_text_processed = X_text.apply(preprocess_text_improved)
    
    # Verificar textos procesados
    print(f"📝 Ejemplo de texto procesado:")
    print(f"Original: {X_text.iloc[0][:100]}...")
    print(f"Procesado: {X_text_processed.iloc[0][:100]}...")
    
    # Configurar TF-IDF más conservador
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction.text import TfidfVectorizer
    from scipy.sparse import hstack
    
    print("🔄 Configurando TF-IDF conservador...")
    tfidf_improved = TfidfVectorizer(
        max_features=3000,      # Menos características para evitar overfitting
        ngram_range=(1, 2),     # Solo unigramas y bigramas
        min_df=3,               # Mínimo 3 documentos
        max_df=0.8,             # Máximo 80% de documentos
        sublinear_tf=True,
        stop_words='english',
        lowercase=True,
        strip_accents='ascii'
    )
    
    # Encoders
    age_encoder_improved = LabelEncoder()
    gender_encoder_improved = LabelEncoder()
    diagnosis_encoder_improved = LabelEncoder()
    
    # Ajustar encoders
    print("🔄 Entrenando encoders...")
    X_tfidf = tfidf_improved.fit_transform(X_text_processed)
    X_age_encoded = age_encoder_improved.fit_transform(X_age)
    X_gender_encoded = gender_encoder_improved.fit_transform(X_gender)
    y_encoded = diagnosis_encoder_improved.fit_transform(y)
    
    print(f"✅ TF-IDF: {X_tfidf.shape[1]} características")
    print(f"✅ Vocabulario: {len(tfidf_improved.vocabulary_):,} términos")
    
    # Combinar características
    X_categorical = np.column_stack([X_age_encoded, X_gender_encoded])
    X_combined = hstack([X_tfidf, X_categorical])
    
    # División de datos con validación cruzada
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y_encoded, 
        test_size=0.3,          # Más datos para test
        random_state=42, 
        stratify=y_encoded
    )
    
    print(f"✅ División de datos:")
    print(f"   Entrenamiento: {X_train.shape[0]} muestras")
    print(f"   Prueba: {X_test.shape[0]} muestras")
    print(f"   Características: {X_train.shape[1]}")
    
    # Modelo XGBoost más conservador para evitar overfitting
    print("\n🚀 Entrenando XGBoost conservador...")
    
    model_improved = XGBClassifier(
        objective='multi:softprob',
        n_estimators=100,       # Menos árboles
        max_depth=4,            # Menor profundidad
        learning_rate=0.05,     # Menor tasa de aprendizaje
        subsample=0.7,          # Más submuestreo
        colsample_bytree=0.7,   # Más submuestreo de características
        reg_alpha=1.0,          # Más regularización L1
        reg_lambda=2.0,         # Más regularización L2
        min_child_weight=3,     # Mínimo peso por hoja
        random_state=42,
        n_jobs=-1,
        eval_metric='mlogloss'
    )
    
    # Entrenar con validación temprana
    from sklearn.model_selection import cross_val_score
    
    print("🔄 Entrenamiento con validación cruzada...")
    cv_scores = cross_val_score(model_improved, X_train, y_train, cv=3, scoring='accuracy')
    print(f"📊 CV Scores: {cv_scores}")
    print(f"📊 CV Mean: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    
    # Entrenar modelo final
    model_improved.fit(X_train, y_train)
    
    # Evaluar en conjunto de prueba
    y_pred = model_improved.predict(X_test)
    y_pred_proba = model_improved.predict_proba(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"\n📊 RESULTADOS DEL MODELO V8 MEJORADO:")
    print(f"   🎯 Accuracy (Test): {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"   🎯 Accuracy (CV): {cv_scores.mean():.4f}")
    print(f"   🏥 Enfermedades: {len(diagnosis_encoder_improved.classes_)}")
    print(f"   🔤 Vocabulario: {len(tfidf_improved.vocabulary_):,} términos")
    
    # Análisis de confianza de predicciones
    max_probas = np.max(y_pred_proba, axis=1)
    print(f"\n📊 ANÁLISIS DE CONFIANZA:")
    print(f"   Confianza promedio: {max_probas.mean():.4f}")
    print(f"   Confianza mínima: {max_probas.min():.4f}")
    print(f"   Confianza máxima: {max_probas.max():.4f}")
    print(f"   Predicciones > 90%: {(max_probas > 0.9).sum()}/{len(max_probas)}")
    print(f"   Predicciones > 50%: {(max_probas > 0.5).sum()}/{len(max_probas)}")
    
    # Mostrar enfermedades más comunes
    print(f"\n🏥 Enfermedades más frecuentes en predicciones:")
    pred_diseases = diagnosis_encoder_improved.inverse_transform(y_pred)
    from collections import Counter
    disease_counts = Counter(pred_diseases)
    for disease, count in disease_counts.most_common(10):
        print(f"   {disease}: {count} casos")
    
    return (model_improved, tfidf_improved, age_encoder_improved, 
            gender_encoder_improved, diagnosis_encoder_improved, accuracy)

# Entrenar modelo mejorado
print("🚀 Iniciando entrenamiento del modelo v8 mejorado (versión corregida)...")
(model_v8_improved, tfidf_improved, age_improved, 
 gender_improved, diagnosis_improved, accuracy_improved) = train_enhanced_v8_model_improved()

🚀 Iniciando entrenamiento del modelo v8 mejorado (versión corregida)...

🚀 ENTRENANDO MODELO V8 MEJORADO (VERSIÓN CORREGIDA)
🔄 Preprocesando texto con método mejorado...
📝 Ejemplo de texto procesado:
Original: patient experiences itching and skin irritation. patient has skin rash and eruptions. patient presen...
Procesado: patient experiences itching and skin irritation patient has skin rash and eruptions patient presents...
🔄 Configurando TF-IDF conservador...
🔄 Entrenando encoders...
✅ TF-IDF: 549 características
✅ Vocabulario: 549 términos
✅ División de datos:
   Entrenamiento: 3444 muestras
   Prueba: 1476 muestras
   Características: 551

🚀 Entrenando XGBoost conservador...
🔄 Entrenamiento con validación cruzada...
📊 CV Scores: [0.9956446  1.         0.99738676]
📊 CV Mean: 0.9977 (+/- 0.0036)

📊 RESULTADOS DEL MODELO V8 MEJORADO:
   🎯 Accuracy (Test): 1.0000 (100.00%)
   🎯 Accuracy (CV): 0.9977
   🏥 Enfermedades: 41
   🔤 Vocabulario: 549 términos

📊 ANÁLISIS DE CONFIANZA:
   Confi

In [15]:
# Paso 13: Guardar modelo v8 mejorado completo
def save_enhanced_v8_model():
    """Guardar modelo v8 mejorado"""
    print("\n💾 GUARDANDO MODELO V8 MEJORADO")
    print("="*50)
    
    # Crear directorios necesarios
    local_dir = 'models'
    os.makedirs(local_dir, exist_ok=True)
    
    # Guardar modelo mejorado
    print("🔄 Guardando modelo XGBoost mejorado...")
    model_filename = 'modelo_diagnostico_v8_mejorado.pkl'
    
    # También guardar copia local
    local_model_path = os.path.join(local_dir, model_filename)
    joblib.dump(model_v8_improved, local_model_path)
    
    # Guardar preprocesadores mejorados
    print("🔄 Guardando preprocesadores mejorados...")
    preprocessors_v8_improved = {
        'tfidf_vectorizer': tfidf_improved,
        'age_encoder': age_improved,
        'gender_encoder': gender_improved,
        'diagnosis_encoder': diagnosis_improved,
        'model_info': {
            'version': 'v8_mejorado',
            'algorithm': 'XGBoost_Enhanced',
            'accuracy': accuracy_improved,
            'features_count': tfidf_improved.get_feature_names_out().shape[0] + 2,
            'vocabulary_size': len(tfidf_improved.vocabulary_),
            'classes_count': len(diagnosis_improved.classes_),
            'trained_date': datetime.now().isoformat(),
            'input_type': 'text_findings_with_demographics',
            'preprocessing': 'improved_text_processing',
            'tfidf_params': {
                'max_features': 3000,
                'ngram_range': '(1,2)',
                'min_df': 3,
                'max_df': 0.8
            },
            'xgboost_params': {
                'n_estimators': 100,
                'max_depth': 4,
                'learning_rate': 0.05,
                'reg_alpha': 1.0,
                'reg_lambda': 2.0
            },
            'diseases': list(diagnosis_improved.classes_)
        }
    }
    
    prep_filename = 'preprocesadores_v8_mejorado.pkl'
    
    # También guardar copia local
    local_prep_path = os.path.join(local_dir, prep_filename)
    joblib.dump(preprocessors_v8_improved, local_prep_path)
    
    print(f"✅ MODELO V8 MEJORADO GUARDADO EXITOSAMENTE:")
    print(f"   📁 Local: {local_dir}")
    print(f"   🎯 Accuracy: {accuracy_improved:.4f} ({accuracy_improved*100:.2f}%)")
    print(f"   🏥 Enfermedades: {len(diagnosis_improved.classes_)}")
    print(f"   🔤 Vocabulario: {len(tfidf_improved.vocabulary_):,} términos")
    print(f"   🔢 Características totales: {tfidf_improved.get_feature_names_out().shape[0] + 2}")
    
    # Mostrar enfermedades detectables
    print(f"\n🏥 ENFERMEDADES DETECTABLES:")
    diseases = list(diagnosis_improved.classes_)
    for i, disease in enumerate(diseases[:15], 1):
        print(f"   {i:2d}. {disease}")
    if len(diseases) > 15:
        print(f"   ... y {len(diseases)-15} más")
    
    # Mostrar características del vocabulario más importantes
    print(f"\n🔤 VOCABULARIO TF-IDF (primeros 15 términos):")
    vocab_items = list(tfidf_improved.vocabulary_.items())
    vocab_sorted = sorted(vocab_items, key=lambda x: x[1])[:15]
    for i, (term, idx) in enumerate(vocab_sorted, 1):
        print(f"   {i:2d}. {term}")
    
    return {
        'model_path': local_model_path,
        'preprocessors_path': local_prep_path,
        'accuracy': accuracy_improved,
        'diseases_count': len(diagnosis_improved.classes_),
        'vocabulary_size': len(tfidf_improved.vocabulary_),
        'total_features': tfidf_improved.get_feature_names_out().shape[0] + 2
    }

# Ejecutar guardado del modelo v8 mejorado
print("🚀 GUARDANDO MODELO V8 MEJORADO...")
model_v8_info = save_enhanced_v8_model()

🚀 GUARDANDO MODELO V8 MEJORADO...

💾 GUARDANDO MODELO V8 MEJORADO
🔄 Guardando modelo XGBoost mejorado...
🔄 Guardando preprocesadores mejorados...
✅ MODELO V8 MEJORADO GUARDADO EXITOSAMENTE:
   📁 Local: models
   🎯 Accuracy: 1.0000 (100.00%)
   🏥 Enfermedades: 41
   🔤 Vocabulario: 549 términos
   🔢 Características totales: 551

🏥 ENFERMEDADES DETECTABLES:
    1. (vertigo) Paroymsal  Positional Vertigo
    2. AIDS
    3. Acne
    4. Alcoholic hepatitis
    5. Allergy
    6. Arthritis
    7. Bronchial Asthma
    8. Cervical spondylosis
    9. Chicken pox
   10. Chronic cholestasis
   11. Common Cold
   12. Dengue
   13. Diabetes 
   14. Dimorphic hemmorhoids(piles)
   15. Drug Reaction
   ... y 26 más

🔤 VOCABULARIO TF-IDF (primeros 15 términos):
    1. abdominal
    2. abdominal distention
    3. abdominal pain
    4. abdominal stomach
    5. abnormal
    6. abnormal menstruation
    7. acidity
    8. acidity heartburn
    9. acute
   10. acute liver
   11. agitated
   12. agitated patie