In [1]:
# CARREGAR DADOS PRÉ-PROCESSADOS

import joblib
import numpy as np

# Banner for the data-loading step.
print("=" * 80)
print("CARREGANDO DADOS PR√â-PROCESSADOS")
print("=" * 80)

# Load the preprocessed data bundle from disk.
# NOTE(review): joblib.load unpickles the file; only load bundles from a trusted source.
data_package = joblib.load("diabetes_model/preprocessed_data.joblib")

# Unpack the four splits into notebook-level names; the GA fitness function
# and the evaluation cells further down read X_train / y_train / X_test / y_test
# as globals.
X_train = data_package['X_train']
X_test = data_package['X_test']
y_train = data_package['y_train']
y_test = data_package['y_test']

# Report the shape of each split as a quick sanity check.
print(f"‚úÖ Dados carregados:")
print(f"   X_train: {X_train.shape}")
print(f"   X_test: {X_test.shape}")
print(f"   y_train: {y_train.shape}")
print(f"   y_test: {y_test.shape}")

print("\n‚úÖ PRONTO PARA ALGORITMO GEN√âTICO!")

CARREGANDO DADOS PR√â-PROCESSADOS
‚úÖ Dados carregados:
   X_train: (95996, 18)
   X_test: (24000, 18)
   y_train: (95996,)
   y_test: (24000,)

‚úÖ PRONTO PARA ALGORITMO GEN√âTICO!


In [6]:
# CONFIGURAÇÃO COMPLETA - ALGORITMO GENÉTICO COM THRESHOLD

import random
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LogisticRegression
from deap import base, creator, tools, algorithms

print("=" * 80)
print("CONFIGURA√á√ÉO COMPLETA - ALGORITMO GEN√âTICO COM THRESHOLD")
print("=" * 80)

def create_individual():
    """Sample a random six-gene chromosome for the genetic algorithm.

    Returns:
        list: ``[C, penalty_type, solver_type, class_weight_type, max_iter,
        threshold]`` where ``C`` and ``threshold`` are floats and the other
        four genes are integers.
    """
    # The draws happen in gene order so a seeded run yields the same individuals.
    regularisation_c = random.uniform(0.01, 100.0)   # gene 0: C
    penalty_code = random.randint(0, 2)              # gene 1: penalty_type
    solver_code = random.randint(0, 3)               # gene 2: solver_type
    class_weight_code = random.randint(0, 1)         # gene 3: class_weight_type
    iteration_cap = random.randint(500, 3000)        # gene 4: max_iter
    decision_threshold = random.uniform(0.3, 0.7)    # gene 5: threshold

    return [
        regularisation_c,
        penalty_code,
        solver_code,
        class_weight_code,
        iteration_cap,
        decision_threshold,
    ]


def evaluate_individual(individual):
    """Score one GA individual as the mean F1 of a thresholded LogisticRegression.

    The genes are decoded (integer genes rounded, then clamped to their valid
    ranges), the categorical codes are mapped to scikit-learn option names,
    and the model is trained and scored on a 2-fold shuffled split of the
    notebook-level ``X_train`` / ``y_train``.

    Args:
        individual: Sequence ``[C, penalty_type, solver_type,
            class_weight_type, max_iter, threshold]``.

    Returns:
        tuple: One-element fitness tuple. The mean F1 over the two folds, or
        ``(-1000,)`` when the penalty/solver pair is rejected up front or when
        training/scoring raises (a ``RuntimeWarning`` describing the error is
        emitted in the latter case).
    """
    import warnings  # local import so the cell does not depend on another cell's imports

    invalid_fitness = (-1000,)

    def _take_rows(data, row_idx):
        # pandas objects need positional .iloc; NumPy arrays can be indexed directly.
        return data.iloc[row_idx] if hasattr(data, 'iloc') else data[row_idx]

    C, penalty_type, solver_type, class_weight_type, max_iter, threshold = individual

    # Integer genes may arrive as floats; round them back to integers.
    penalty_type = int(round(penalty_type))
    solver_type = int(round(solver_type))
    class_weight_type = int(round(class_weight_type))
    max_iter = int(round(max_iter))

    # Keep every decoded gene inside its allowed range.
    penalty_type = max(0, min(2, penalty_type))
    solver_type = max(0, min(3, solver_type))
    class_weight_type = max(0, min(1, class_weight_type))
    max_iter = max(500, min(3000, max_iter))
    threshold = max(0.3, min(0.7, threshold))

    # Translate the integer codes into scikit-learn option values.
    penalty_map = {0: 'l2', 1: 'l1', 2: 'elasticnet'}
    solver_map = {0: 'lbfgs', 1: 'liblinear', 2: 'saga', 3: 'newton-cg'}
    class_weight_map = {0: None, 1: 'balanced'}

    penalty = penalty_map[penalty_type]
    solver = solver_map[solver_type]
    class_weight = class_weight_map[class_weight_type]

    # Reject penalty/solver pairs this search treats as invalid without training.
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        return invalid_fitness
    if penalty == 'elasticnet' and solver != 'saga':
        return invalid_fitness
    # NOTE(review): no l1_ratio is passed below, so 'elasticnet' + 'saga' is
    # expected to fail inside fit() on scikit-learn versions that require it
    # and end up in the except branch -- confirm against the installed version.

    try:
        # Fixed seed: every individual is scored on the same two folds.
        kf = KFold(n_splits=2, shuffle=True, random_state=42)
        scores = []

        for train_idx, val_idx in kf.split(X_train):
            X_fold_train = _take_rows(X_train, train_idx)
            y_fold_train = _take_rows(y_train, train_idx)
            X_fold_val = _take_rows(X_train, val_idx)
            y_fold_val = _take_rows(y_train, val_idx)

            model = LogisticRegression(
                C=C,
                penalty=penalty,
                solver=solver,
                class_weight=class_weight,
                max_iter=max_iter,
                random_state=42,
                n_jobs=-1
            )
            model.fit(X_fold_train, y_fold_train)

            # Positive-class probability, cut at the evolved threshold rather
            # than the default used by predict().
            y_proba = model.predict_proba(X_fold_val)[:, 1]
            y_pred = (y_proba >= threshold).astype(int)

            scores.append(f1_score(y_fold_val, y_pred))

        return (np.mean(scores),)

    except Exception as exc:
        # Best-effort: a failing configuration must not abort the evolution,
        # but the failure is reported instead of being silently swallowed.
        warnings.warn(
            f"evaluate_individual: {type(exc).__name__}: {exc} -- assigning penalty fitness",
            RuntimeWarning,
            stacklevel=2,
        )
        return invalid_fitness

def hybrid_crossover(ind1, ind2):
    """Recombine two parents gene-by-gene according to each gene's type.

    Float genes (0 = C, 5 = threshold) use a blend crossover with alpha 0.5
    and are clamped to their ranges; genes 1-4 are each swapped between the
    children with probability 0.5. The parents are copied, not modified.

    Args:
        ind1: First parent (six-gene sequence).
        ind2: Second parent (six-gene sequence).

    Returns:
        tuple: Two new ``creator.Individual`` children.
    """
    blend_alpha = 0.5

    def _clamp(value, low, high):
        return max(low, min(high, value))

    def _blend(first, second):
        # One random draw per blended gene; both children share the same weight.
        weight = (1 + 2 * blend_alpha) * random.random() - blend_alpha
        mixed_first = (1 - weight) * first + weight * second
        mixed_second = (1 - weight) * second + weight * first
        return mixed_first, mixed_second

    offspring_a = creator.Individual(ind1[:])
    offspring_b = creator.Individual(ind2[:])

    # Gene 0 (C): blend, then clamp to [0.01, 100.0].
    c_a, c_b = _blend(ind1[0], ind2[0])
    offspring_a[0] = _clamp(c_a, 0.01, 100.0)
    offspring_b[0] = _clamp(c_b, 0.01, 100.0)

    # Genes 1-4: uniform crossover, swapping whole values between children.
    for gene in (1, 2, 3, 4):
        if random.random() < 0.5:
            offspring_a[gene] = ind2[gene]
            offspring_b[gene] = ind1[gene]

    # Gene 5 (threshold): blend, then clamp to [0.3, 0.7].
    thr_a, thr_b = _blend(ind1[5], ind2[5])
    offspring_a[5] = _clamp(thr_a, 0.3, 0.7)
    offspring_b[5] = _clamp(thr_b, 0.3, 0.7)

    return offspring_a, offspring_b


def hybrid_mutate(individual):
    """Mutate an individual in place, treating each gene according to its type.

    Every gene is considered independently with probability 0.3:
    C is multiplied by a random factor in [0.5, 2.0], the four integer genes
    are resampled from their full range, and the threshold receives Gaussian
    noise (sigma 0.05). Float genes are clamped back into their ranges.

    Args:
        individual: Six-gene mutable sequence; modified in place.

    Returns:
        tuple: One-element tuple holding the same (mutated) individual.
    """
    gene_rate = 0.3

    # Gene 0 (C): multiplicative perturbation.
    if random.random() < gene_rate:
        scaled_c = individual[0] * random.uniform(0.5, 2.0)
        individual[0] = max(0.01, min(100.0, scaled_c))

    # Genes 1-4: (position, lowest, highest) for a fresh uniform integer draw.
    integer_genes = (
        (1, 0, 2),        # penalty_type
        (2, 0, 3),        # solver_type
        (3, 0, 1),        # class_weight_type
        (4, 500, 3000),   # max_iter
    )
    for position, lowest, highest in integer_genes:
        if random.random() < gene_rate:
            individual[position] = random.randint(lowest, highest)

    # Gene 5 (threshold): small Gaussian nudge.
    if random.random() < gene_rate:
        nudged = individual[5] + random.gauss(0, 0.05)
        individual[5] = max(0.3, min(0.7, nudged))

    return (individual,)


# Remove DEAP classes left over from a previous run of this cell so that
# creator.create() below starts from a clean state when the cell is re-executed.
if hasattr(creator, "FitnessMax"):
    del creator.FitnessMax
if hasattr(creator, "Individual"):
    del creator.Individual

# Single-objective fitness with weight +1.0 (maximisation); individuals are
# plain lists carrying that fitness.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Register the building blocks the evolutionary loop looks up by name.
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate_individual)
toolbox.register("mate", hybrid_crossover)      # type-aware crossover
toolbox.register("mutate", hybrid_mutate)       # type-aware mutation
toolbox.register("select", tools.selTournament, tournsize=3)

# Summary of what was configured and of the chromosome layout.
print("\n‚úÖ Configura√ß√£o completa!")
print("   ‚Ä¢ create_individual: 6 genes (incluindo threshold) ‚úì")
print("   ‚Ä¢ evaluate_individual: CV + threshold customizado ‚úì")
print("   ‚Ä¢ hybrid_crossover: Preserva tipos (float + int) ‚úì")
print("   ‚Ä¢ hybrid_mutate: Otimizado para todos os genes ‚úì")
print("\nüìä Genes do indiv√≠duo:")
print("   [0] C (float: 0.01-100.0)")
print("   [1] penalty_type (int: 0-2)")
print("   [2] solver_type (int: 0-3)")
print("   [3] class_weight_type (int: 0-1)")
print("   [4] max_iter (int: 500-3000)")
print("   [5] threshold (float: 0.3-0.7)")

CONFIGURA√á√ÉO COMPLETA - ALGORITMO GEN√âTICO COM THRESHOLD

‚úÖ Configura√ß√£o completa!
   ‚Ä¢ create_individual: 6 genes (incluindo threshold) ‚úì
   ‚Ä¢ evaluate_individual: CV + threshold customizado ‚úì
   ‚Ä¢ hybrid_crossover: Preserva tipos (float + int) ‚úì
   ‚Ä¢ hybrid_mutate: Otimizado para todos os genes ‚úì

üìä Genes do indiv√≠duo:
   [0] C (float: 0.01-100.0)
   [1] penalty_type (int: 0-2)
   [2] solver_type (int: 0-3)
   [3] class_weight_type (int: 0-1)
   [4] max_iter (int: 500-3000)
   [5] threshold (float: 0.3-0.7)


In [7]:
# Seed both random sources before the initial population is sampled.
random.seed(42)
np.random.seed(42)

# GA run parameters; the export cell further down also records them.
POPULATION_SIZE = 30   
NGEN = 30              
CXPB = 0.7
MUTPB = 0.3

print(f"\nüìä Par√¢metros do teste")
print(f"   ‚Ä¢ Popula√ß√£o: {POPULATION_SIZE}")
print(f"   ‚Ä¢ Gera√ß√µes: {NGEN}")
# The fold count is a literal here; it mirrors n_splits=2 inside evaluate_individual.
print(f"   ‚Ä¢ Cross-validation: 2 folds")
print(f"   ‚Ä¢ Crossover: {CXPB}")
print(f"   ‚Ä¢ Muta√ß√£o: {MUTPB}")

population = toolbox.population(n=POPULATION_SIZE)

# Per-generation statistics over the fitness values. Individuals scored with
# the -1000 penalty are included, which is why avg/std/min in the log are
# dominated by that value whenever a penalised individual is present.
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

# Hall of fame of size 5; the evaluation cell reads hof[0] as the best individual.
hof = tools.HallOfFame(5)

print("\nüöÄ Iniciando evolu√ß√£o...")
print("-" * 80)

# Run the evolution; returns the final population and the per-generation logbook.
population, logbook = algorithms.eaSimple(
    population, 
    toolbox, 
    cxpb=CXPB, 
    mutpb=MUTPB, 
    ngen=NGEN,
    stats=stats, 
    halloffame=hof, 
    verbose=True
)




üìä Par√¢metros do teste
   ‚Ä¢ Popula√ß√£o: 30
   ‚Ä¢ Gera√ß√µes: 30
   ‚Ä¢ Cross-validation: 2 folds
   ‚Ä¢ Crossover: 0.7
   ‚Ä¢ Muta√ß√£o: 0.3

üöÄ Iniciando evolu√ß√£o...
--------------------------------------------------------------------------------




gen	nevals	avg     	std   	min  	max     
0  	30    	-399.471	490.33	-1000	0.905857




1  	23    	-232.65 	423.33	-1000	0.909692




2  	27    	-199.286	400.357	-1000	0.906195




3  	24    	-65.83  	249.667	-1000	0.906333




4  	24    	-32.4592	179.668	-1000	0.906325




5  	28    	-32.4578	179.668	-1000	0.906354




6  	24    	-32.4582	179.668	-1000	0.90641 




7  	26    	0.906026	0.00140216	0.898598	0.90649 




8  	23    	-65.8207	249.67    	-1000   	0.90649 




9  	24    	-132.548	340.243   	-1000   	0.90649 




10 	24    	-99.1843	300.272   	-1000   	0.90649 




11 	23    	-165.912	373.016   	-1000   	0.906424




12 	23    	-99.185 	300.272   	-1000   	0.906436




13 	23    	-32.4577	179.668   	-1000   	0.906434




14 	23    	-32.4583	179.668   	-1000   	0.906434




15 	23    	-32.4578	179.668   	-1000   	0.906439




16 	26    	-65.8209	249.67    	-1000   	0.906439




17 	22    	-32.4571	179.668   	-1000   	0.906434




18 	22    	-65.821 	249.67    	-1000   	0.906434




19 	24    	-32.4572	179.668   	-1000   	0.906434




20 	19    	-32.4571	179.668   	-1000   	0.906434




21 	20    	-65.8219	249.67    	-1000   	0.906434




22 	23    	-99.1844	300.272   	-1000   	0.906434




23 	26    	-99.1846	300.272   	-1000   	0.906434




24 	28    	-165.913	373.015   	-1000   	0.906434




25 	29    	-99.1858	300.271   	-1000   	0.906434




26 	24    	0.906064	0.0010012 	0.901618	0.906434




27 	27    	-32.4572	179.668   	-1000   	0.906434




28 	25    	-32.4583	179.668   	-1000   	0.906434




29 	25    	-99.1845	300.272   	-1000   	0.906434




30 	25    	0.906324	0.000592426	0.903134	0.906434


In [14]:
# AVALIAÇÃO FINAL COM THRESHOLD OTIMIZADO (CORRIGIDO)

# Decode the best individual found by the GA, retrain on the full training
# set and score it on the held-out test set with the evolved threshold.
print("\n" + "=" * 80)
print("MELHOR CONFIGURA√á√ÉO ENCONTRADA (COM THRESHOLD)")
print("=" * 80)

best_individual = hof[0]
# All six genes are unpacked, including the decision threshold (last gene).
C_best, penalty_type, solver_type, class_weight_type, max_iter_best, threshold_best = best_individual

# Coerce the genes to the types the model expects.
# NOTE(review): unlike evaluate_individual, the values are not clamped here.
penalty_type = int(round(penalty_type))
solver_type = int(round(solver_type))
class_weight_type = int(round(class_weight_type))
max_iter_best = int(round(max_iter_best))
threshold_best = float(threshold_best)  # evolved decision threshold

# Same code -> option mappings as in evaluate_individual.
penalty_map = {0: 'l2', 1: 'l1', 2: 'elasticnet'}
solver_map = {0: 'lbfgs', 1: 'liblinear', 2: 'saga', 3: 'newton-cg'}
class_weight_map = {0: None, 1: 'balanced'}

penalty_best = penalty_map[penalty_type]
solver_best = solver_map[solver_type]
class_weight_best = class_weight_map[class_weight_type]

print(f"\nüéØ Melhores hiperpar√¢metros encontrados:")
print(f"   ‚Ä¢ C: {C_best:.4f}")
print(f"   ‚Ä¢ Penalty: {penalty_best}")
print(f"   ‚Ä¢ Solver: {solver_best}")
print(f"   ‚Ä¢ Class Weight: {class_weight_best}")
print(f"   ‚Ä¢ Max Iter: {max_iter_best}")
print(f"   ‚Ä¢ Threshold: {threshold_best:.4f}")
print(f"   ‚Ä¢ F1 Score (CV): {best_individual.fitness.values[0]:.4f}")

# Train the final model on the whole training set.
print(f"\n{'='*80}")
print("TREINANDO MODELO FINAL COM THRESHOLD OTIMIZADO")
print(f"{'='*80}")

best_model = LogisticRegression(
    C=C_best,
    penalty=penalty_best,
    solver=solver_best,
    class_weight=class_weight_best,
    max_iter=max_iter_best,
    random_state=42
)

best_model.fit(X_train, y_train)

# Predict with the evolved threshold instead of predict()'s default cut-off.
y_proba_test = best_model.predict_proba(X_test)[:, 1]
y_pred_ga = (y_proba_test >= threshold_best).astype(int)  # apply the evolved threshold

# Final metrics, expressed as percentages.
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy_ga = accuracy_score(y_test, y_pred_ga) * 100
precision_ga = precision_score(y_test, y_pred_ga) * 100
recall_ga = recall_score(y_test, y_pred_ga) * 100
f1_ga = f1_score(y_test, y_pred_ga) * 100

print(f"\nüìä RESULTADOS NO CONJUNTO DE TESTE:")
print(f"   ‚Ä¢ Accuracy: {accuracy_ga:.2f}%")
print(f"   ‚Ä¢ Precision: {precision_ga:.2f}%")
print(f"   ‚Ä¢ Recall: {recall_ga:.2f}%")
print(f"   ‚Ä¢ F1 Score: {f1_ga:.2f}%")
print(f"   ‚Ä¢ Threshold usado: {threshold_best:.4f}")


MELHOR CONFIGURA√á√ÉO ENCONTRADA (COM THRESHOLD)

üéØ Melhores hiperpar√¢metros encontrados:
   ‚Ä¢ C: 0.0100
   ‚Ä¢ Penalty: l1
   ‚Ä¢ Solver: liblinear
   ‚Ä¢ Class Weight: None
   ‚Ä¢ Max Iter: 2076
   ‚Ä¢ Threshold: 0.5908
   ‚Ä¢ F1 Score (CV): 0.9097

TREINANDO MODELO FINAL COM THRESHOLD OTIMIZADO





üìä RESULTADOS NO CONJUNTO DE TESTE:
   ‚Ä¢ Accuracy: 91.53%
   ‚Ä¢ Precision: 97.94%
   ‚Ä¢ Recall: 84.83%
   ‚Ä¢ F1 Score: 90.92%
   ‚Ä¢ Threshold usado: 0.5908


In [12]:
# COMPARAÇÃO: ORIGINAL vs OTIMIZADO

import pandas as pd

# Compare the previously exported baseline model with the GA-optimised one on
# the same test set.
# NOTE(review): this cell reads accuracy_ga / precision_ga / recall_ga / f1_ga
# and the sklearn metric functions from the final-evaluation cell, so it must
# run after that cell on a fresh kernel.
print("=" * 80)
print("COMPARA√á√ÉO: MODELO ORIGINAL vs OTIMIZADO")
print("=" * 80)

# Load the baseline model bundle.
# NOTE(review): joblib.load unpickles the file; only load bundles from a trusted source.
model_package = joblib.load("diabetes_model/diabetes_model.joblib")
original_model = model_package['model']
# The baseline is scored with predict(), i.e. without a custom threshold.
y_pred_original = original_model.predict(X_test)

# Baseline metrics, as percentages.
accuracy_original = accuracy_score(y_test, y_pred_original) * 100
precision_original = precision_score(y_test, y_pred_original) * 100
recall_original = recall_score(y_test, y_pred_original) * 100
f1_original = f1_score(y_test, y_pred_original) * 100

# Side-by-side table; the last column is optimised minus baseline, in
# percentage points.
comparison_df = pd.DataFrame({
    'M√©trica': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Original': [accuracy_original, precision_original, recall_original, f1_original],
    'Otimizado (AG)': [accuracy_ga, precision_ga, recall_ga, f1_ga],
    'Melhoria': [
        accuracy_ga - accuracy_original,
        precision_ga - precision_original,
        recall_ga - recall_original,
        f1_ga - f1_original
    ]
})

print("\n")
print(comparison_df.round(2).to_string(index=False))

# F1 improvement in percentage points.
improvement = f1_ga - f1_original

print(f"\n{'='*80}")
print("AN√ÅLISE DA MELHORIA")
print(f"{'='*80}")

print(f"\nüìä F1 Score:")
print(f"   Original: {f1_original:.2f}%")
print(f"   Otimizado: {f1_ga:.2f}%")
print(f"   Melhoria: {improvement:+.2f} pontos percentuais")
# Relative change; divides by f1_original, so a baseline F1 of 0 would raise here.
print(f"   Melhoria relativa: {(improvement/f1_original*100):+.2f}%")


COMPARA√á√ÉO: MODELO ORIGINAL vs OTIMIZADO


  M√©trica  Original  Otimizado (AG)  Melhoria
 Accuracy     89.46           91.52      2.06
Precision     90.85           97.94      7.09
   Recall     87.76           84.83     -2.92
 F1 Score     89.28           90.92      1.64

AN√ÅLISE DA MELHORIA

üìä F1 Score:
   Original: 89.28%
   Otimizado: 90.92%
   Melhoria: +1.64 pontos percentuais
   Melhoria relativa: +1.83%


In [25]:
# EXPORTAR MODELO OTIMIZADO COM PRÉ-PROCESSADORES INCLUÍDOS
# (Execute no GA_train.ipynb após o treinamento)

import joblib
import json
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

print("=" * 80)
print("EXPORTANDO MODELO OTIMIZADO COM PR√â-PROCESSADORES")
print("=" * 80)

# 1. Load the raw dataset in order to build the preprocessors.
print("\nüì¶ Carregando dataset para criar pr√©-processadores...")
df = pd.read_csv("diabetes_dataset.csv")

# Drop unused variables (the original comment says this mirrors Diabetes.ipynb).
# Columns that are absent from the CSV are skipped rather than raising.
variables_to_remove = [
    'sleep_hours_per_day', 'alcohol_consumption_per_week',
    'screen_time_hours_per_day', 'heart_rate', 'hypertension_history',
    'cardiovascular_history', 'diastolic_bp', 'smoking_status',
    'employment_status', 'ethnicity', 'gender'
]
df_cleaned = df.drop(columns=[v for v in variables_to_remove if v in df.columns])

# Split features from the target; diabetes_stage is dropped from the features too.
X = df_cleaned.drop(columns=["diagnosed_diabetes", "diabetes_stage"])
y = df_cleaned["diagnosed_diabetes"]

# Identify categorical (object dtype) and numeric columns.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"‚úÖ Dataset carregado!")
print(f"   ‚Ä¢ Features: {X.shape[1]}")
print(f"   ‚Ä¢ Categ√≥ricas: {categorical_cols}")

# 2. Fit the preprocessors.
# NOTE(review): the encoders and the scaler are fitted here on the full CSV
# rather than loaded from the pipeline that produced preprocessed_data.joblib;
# confirm they reproduce the transformation best_model was trained on.
print("\nüì¶ Criando pr√©-processadores...")
label_encoders = {}
X_LE = X.copy()

# One LabelEncoder per categorical column, fitted on the values cast to str.
for col in categorical_cols:
    le = LabelEncoder()
    X_LE[col] = le.fit_transform(X_LE[col].astype(str))
    label_encoders[col] = le
    print(f"   ‚úÖ {col}: {len(le.classes_)} valores")

# MinMaxScaler fitted on all label-encoded features.
scaler_minmax = MinMaxScaler()
X_LE_minmax = scaler_minmax.fit_transform(X_LE)
print("   ‚úÖ MinMaxScaler treinado")

# 3. Create the output folder (no error if it already exists).
output_dir = "model_optimized"
os.makedirs(output_dir, exist_ok=True)

# 4. Build the complete export bundle, INCLUDING the fitted preprocessors, so
#    a consumer needs a single file to go from raw inputs to a prediction.
print("\nüì¶ Criando pacote completo do modelo...")
model_package_optimized = {
    'model': best_model,
    'threshold': threshold_best,
    'preprocessors': {
        'label_encoders': label_encoders,
        'scaler': scaler_minmax,
        'categorical_cols': categorical_cols,
        'numerical_cols': numerical_cols,
        'feature_names': list(X.columns),
        # Derived from the fitted encoders instead of hard-coded column names,
        # so it stays correct (and cannot raise KeyError) if the set of
        # categorical columns changes.
        'accepted_values': {
            col_name: list(encoder.classes_)
            for col_name, encoder in label_encoders.items()
        }
    },
    'hyperparameters': {
        'C': float(C_best),
        'penalty': penalty_best,
        'solver': solver_best,
        'class_weight': class_weight_best,
        'max_iter': int(max_iter_best),
        'random_state': 42
    },
    # Test-set metrics computed in the final-evaluation cell (percentages).
    'performance_metrics': {
        'accuracy': float(accuracy_ga),
        'precision': float(precision_ga),
        'recall': float(recall_ga),
        'f1_score': float(f1_ga),
        'threshold_used': float(threshold_best)
    },
    'optimization_info': {
        'method': 'Genetic Algorithm (DEAP)',
        'population_size': POPULATION_SIZE,
        'generations': NGEN,
        'crossover_rate': CXPB,
        'mutation_rate': MUTPB,
        # Literal value; mirrors n_splits=2 inside evaluate_individual.
        'cv_folds': 2,
        'fitness_metric': 'F1 Score',
        'best_cv_score': float(best_individual.fitness.values[0])
    },
    # Descriptive strings only; nothing in this cell applies them.
    'preprocessing_info': {
        'encoder': 'Label Encoder',
        'scaler': 'MinMaxScaler',
        'oversampling': 'SMOTE (aplicado apenas no treino)',
        'random_state': 42,
        'preprocessors_included': True  # the preprocessors now ship inside the bundle
    },
    'export_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model_type': 'Logistic Regression (Optimized with GA + Threshold)'
}

# 5. Persist the bundle with joblib.
joblib.dump(model_package_optimized, f"{output_dir}/diabetes_model_optimized.joblib")

# Also write the same bundle as a plain pickle.
# NOTE(review): both files are pickles; consumers must only load them from a trusted source.
import pickle
with open(f"{output_dir}/diabetes_model_optimized.pkl", "wb") as f:
    pickle.dump(model_package_optimized, f)

# Metadata subset written as JSON; the fitted model and preprocessor objects
# are left out of this dictionary.
metadata = {
    'model_type': model_package_optimized['model_type'],
    'export_date': model_package_optimized['export_date'],
    'hyperparameters': model_package_optimized['hyperparameters'],
    'performance_metrics': model_package_optimized['performance_metrics'],
    'optimization_info': model_package_optimized['optimization_info'],
    'preprocessing_info': model_package_optimized['preprocessing_info']
}

# ensure_ascii=False keeps accented characters readable in the JSON file.
with open(f"{output_dir}/model_metadata.json", "w", encoding='utf-8') as f:
    json.dump(metadata, f, indent=4, ensure_ascii=False)

# Report what was written.
print("\n‚úÖ Modelo exportado COM pr√©-processadores!")
print(f"\nüìÅ Arquivos criados em '{output_dir}/':")
print(f"   ‚Ä¢ diabetes_model_optimized.joblib")
print(f"   ‚Ä¢ diabetes_model_optimized.pkl")
print(f"   ‚Ä¢ model_metadata.json")

print(f"\nüìä Informa√ß√µes:")
print(f"   ‚Ä¢ Modelo: ‚úÖ")
print(f"   ‚Ä¢ Threshold: {threshold_best:.4f} ‚úÖ")
print(f"   ‚Ä¢ Pr√©-processadores: ‚úÖ INCLU√çDOS!")
print(f"     - LabelEncoders: {len(label_encoders)}")
print(f"     - Scaler: {type(scaler_minmax).__name__}")
print(f"     - Features: {len(X.columns)}")

print(f"\nüí° Agora voc√™ pode usar o modelo sem precisar de arquivos separados!")

EXPORTANDO MODELO OTIMIZADO COM PR√â-PROCESSADORES

üì¶ Carregando dataset para criar pr√©-processadores...
‚úÖ Dataset carregado!
   ‚Ä¢ Features: 18
   ‚Ä¢ Categ√≥ricas: ['education_level', 'income_level']

üì¶ Criando pr√©-processadores...
   ‚úÖ education_level: 4 valores
   ‚úÖ income_level: 5 valores
   ‚úÖ MinMaxScaler treinado

üì¶ Criando pacote completo do modelo...

‚úÖ Modelo exportado COM pr√©-processadores!

üìÅ Arquivos criados em 'model_optimized/':
   ‚Ä¢ diabetes_model_optimized.joblib
   ‚Ä¢ diabetes_model_optimized.pkl
   ‚Ä¢ model_metadata.json

üìä Informa√ß√µes:
   ‚Ä¢ Modelo: ‚úÖ
   ‚Ä¢ Threshold: 0.5908 ‚úÖ
   ‚Ä¢ Pr√©-processadores: ‚úÖ INCLU√çDOS!
     - LabelEncoders: 2
     - Scaler: MinMaxScaler
     - Features: 18

üí° Agora voc√™ pode usar o modelo sem precisar de arquivos separados!


In [26]:
# EXEMPLO DE X_NEW PARA TESTE

import pandas as pd
import numpy as np

# Build a one-row example patient with raw (not yet encoded or scaled) values.
X_new = pd.DataFrame({
    'age': [45],
    'education_level': ['Graduate'],  # accepted values: 'Graduate', 'Highschool', 'No formal', 'Postgraduate'
    'income_level': ['Middle'],        # accepted values: 'High', 'Low', 'Lower-Middle', 'Middle', 'Upper-Middle'
    'physical_activity_minutes_per_week': [150],
    'diet_score': [7.5],
    'family_history_diabetes': [1],    # 0 or 1
    'bmi': [28.5],
    'waist_to_hip_ratio': [0.92],
    'systolic_bp': [130],
    'cholesterol_total': [200],
    'hdl_cholesterol': [45],
    'ldl_cholesterol': [130],
    'triglycerides': [150],
    'glucose_fasting': [95],
    'glucose_postprandial': [140],
    'insulin_level': [12],
    'hba1c': [5.8],
    'diabetes_risk_score': [35]
})

# Show the example together with its shape and column list.
print("=" * 80)
print("EXEMPLO DE X_NEW (Dados brutos do paciente)")
print("=" * 80)
print(X_new)
print(f"\nShape: {X_new.shape}")
print(f"Features: {list(X_new.columns)}")

EXEMPLO DE X_NEW (Dados brutos do paciente)
   age education_level income_level  physical_activity_minutes_per_week  \
0   45        Graduate       Middle                                 150   

   diet_score  family_history_diabetes   bmi  waist_to_hip_ratio  systolic_bp  \
0         7.5                        1  28.5                0.92          130   

   cholesterol_total  hdl_cholesterol  ldl_cholesterol  triglycerides  \
0                200               45              130            150   

   glucose_fasting  glucose_postprandial  insulin_level  hba1c  \
0               95                   140             12    5.8   

   diabetes_risk_score  
0                   35  

Shape: (1, 18)
Features: ['age', 'education_level', 'income_level', 'physical_activity_minutes_per_week', 'diet_score', 'family_history_diabetes', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'glucose_fasting', 'glucose_postprandial', '