Importando e carregando Arquivo

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import xgboost as xgb
from google.colab import drive
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Mount Google Drive so files stored there are reachable under /content/drive.
drive.mount('/content/drive')

# 2. Load the obesity dataset from Drive into a DataFrame.
# NOTE(review): this is an absolute, account-specific path; the folder name also
# contains what looks like a mis-encoded emoji — confirm it matches the real
# Drive folder before running under another account.
path = '/content/drive/MyDrive/POSTECH -TECH_CHALLENGEüìöDATA_ANALYTCS/Fase4/Obesity.csv'
df_obesity = pd.read_csv(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 3. Renomear colunas (Padronizando para o novo dicionário)

In [5]:


# Mapping from the original CSV headers to the descriptive snake_case names
# that every later cell relies on.
novos_nomes = {
    'Gender': 'genero',
    'Age': 'idade',
    'Height': 'altura_m',
    'Weight': 'peso_kg',
    'family_history': 'historia_familiar_sobrepeso',
    'FAVC': 'come_comida_calorica_freq',
    'FCVC': 'freq_consumo_vegetais',
    'NCP': 'num_refeicoes_principais',
    'CAEC': 'come_entre_refeicoes',
    'SMOKE': 'fumante',
    'CH2O': 'consumo_agua_litros',
    'SCC': 'monitora_calorias',
    'FAF': 'freq_atividade_fisica',
    'TUE': 'tempo_uso_dispositivos',
    'CALC': 'freq_consumo_alcool',
    'MTRANS': 'meio_transporte',
    'Obesity': 'nivel_obesidade'
}
df_obesity = df_obesity.rename(columns=novos_nomes)

# DataFrame.rename silently skips keys that are not present, so a header
# mismatch in the CSV would only surface in a later cell (or be masked by the
# "if col in df_obesity.columns" guard). Fail fast here instead. The check looks
# at the *renamed* columns, so re-running this cell on an already-renamed frame
# still passes.
colunas_ausentes = [c for c in novos_nomes.values() if c not in df_obesity.columns]
if colunas_ausentes:
    raise KeyError(f"Colunas esperadas ausentes no DataFrame: {colunas_ausentes}")

# 4. Engenharia de Features e Tipagem

In [4]:
# Body-mass index: weight (kg) divided by squared height (m), kept to 2 decimals.
df_obesity['imc'] = (df_obesity['peso_kg'] / (df_obesity['altura_m'] ** 2)).round(2)

# Columns that are rounded to the nearest whole number and stored as integers.
colunas_para_int = [
    'idade', 'freq_consumo_vegetais', 'num_refeicoes_principais',
    'consumo_agua_litros', 'freq_atividade_fisica', 'tempo_uso_dispositivos'
]
for col in colunas_para_int:
    # Columns missing from the frame are skipped rather than raising.
    if col in df_obesity.columns:
        # Cast to an explicit 64-bit integer: plain `int` can resolve to int32
        # on some platforms, and the preprocessing cell selects numeric columns
        # by the names 'int64'/'float64', which would then miss these columns.
        df_obesity[col] = df_obesity[col].round().astype('int64')

In [6]:
# NOTE(review): this cell repeats the transformation of the cell labelled
# In [4]. Both steps are idempotent (the BMI is recomputed from unchanged
# inputs and already-integer columns round to themselves), so running both is
# harmless, but one of the two cells can be deleted.

# Body-mass index: weight (kg) divided by squared height (m), kept to 2 decimals.
df_obesity['imc'] = (df_obesity['peso_kg'] / (df_obesity['altura_m'] ** 2)).round(2)

# Columns that are rounded to the nearest whole number and stored as integers.
colunas_para_int = [
    'idade', 'freq_consumo_vegetais', 'num_refeicoes_principais',
    'consumo_agua_litros', 'freq_atividade_fisica', 'tempo_uso_dispositivos'
]
for col in colunas_para_int:
    # Columns missing from the frame are skipped rather than raising.
    if col in df_obesity.columns:
        # Cast to an explicit 64-bit integer: plain `int` can resolve to int32
        # on some platforms, and the preprocessing cell selects numeric columns
        # by the names 'int64'/'float64', which would then miss these columns.
        df_obesity[col] = df_obesity[col].round().astype('int64')

In [7]:
# 5. Separate the feature matrix (X) from the target column (y).
X = df_obesity.drop(columns='nivel_obesidade')
y = df_obesity['nivel_obesidade']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; consider `stratify=y` so every
# class keeps the same proportion in train and test (this changes the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Encode the target labels as integers (the original notebook marks this as
# required for XGBoost).
# The encoder is fitted on the full target rather than on y_train only: the
# label -> integer mapping is not learned information, and fitting on the train
# part alone makes `le.transform(y_test)` raise if a class happens to be absent
# from the training split. When y_train already contains every class the
# resulting codes are identical.
le = LabelEncoder().fit(y)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# 7. Preprocessor definition.
# Columns are split by "numeric" vs. "everything else" instead of by the exact
# dtype names 'int64'/'float64'/'object'. ColumnTransformer drops columns that
# are not listed in any transformer (default remainder='drop'), so with the
# hard-coded names an int32/float32 column, or a text column stored as
# 'category' or a string dtype, would silently vanish from the model input.
# Note that boolean columns are not part of 'number' and therefore land in the
# categorical group.
colunas_numericas = X.select_dtypes(include='number').columns.tolist()
colunas_categoricas = X.select_dtypes(exclude='number').columns.tolist()

# Numeric columns are standardised; the remaining columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), colunas_numericas),
        ('cat', OneHotEncoder(handle_unknown='ignore'), colunas_categoricas)
    ])

# 8. Train each candidate on the same split and compare them.
# Candidate classifiers, keyed by the display name used in the reports.
modelos = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'Regress√£o Log√≠stica': LogisticRegression(max_iter=1000, random_state=42)
}

# Test-set accuracy per model name, filled in by the loop below.
resultados = {}

for nome, modelo in modelos.items():
    # Preprocessing and classifier are chained so both are fitted on the
    # training data only; Pipeline.fit returns the pipeline itself.
    pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', modelo)])
    y_pred = pipeline.fit(X_train, y_train_enc).predict(X_test)

    acc = accuracy_score(y_test_enc, y_pred)
    resultados[nome] = acc

    # Banner (blank line, rule, title, rule) followed by the per-class report,
    # labelled with the original class names held by the encoder.
    print("\n" + "=" * 30, f" RELAT√ìRIO: {nome}", "=" * 30, sep="\n")
    print(classification_report(y_test_enc, y_pred, target_names=le.classes_))


 RELAT√ìRIO: Random Forest
                     precision    recall  f1-score   support

Insufficient_Weight       0.98      1.00      0.99        56
      Normal_Weight       0.92      0.97      0.94        62
     Obesity_Type_I       1.00      1.00      1.00        78
    Obesity_Type_II       1.00      1.00      1.00        58
   Obesity_Type_III       1.00      1.00      1.00        63
 Overweight_Level_I       0.96      0.89      0.93        56
Overweight_Level_II       0.98      0.98      0.98        50

           accuracy                           0.98       423
          macro avg       0.98      0.98      0.98       423
       weighted avg       0.98      0.98      0.98       423


 RELAT√ìRIO: XGBoost
                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.96      0.98        56
      Normal_Weight       0.94      1.00      0.97        62
     Obesity_Type_I       1.00      0.99      0.99        78
    Obesity_Type_II       0.98 

In [8]:
# 9. Final ranking: model names ordered from highest to lowest test accuracy.
print("\n--- Ranking de Modelos (Acur√°cia) ---")
for nome in sorted(resultados, key=resultados.get, reverse=True):
    acc = resultados[nome]
    print(f"{nome}: {acc:.2%}")


--- Ranking de Modelos (Acur√°cia) ---
XGBoost: 98.82%
Random Forest: 97.87%
Regress√£o Log√≠stica: 90.07%


In [9]:
# 10. Fit the final model (XGBoost) on the training split and persist it
# together with the LabelEncoder.
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=42)),
]).fit(X_train, y_train_enc)

# Both artifacts are written to the current working directory.
# The encoder is saved next to the model, presumably so predicted integer
# codes can be mapped back to class names at inference time.
joblib.dump(final_pipeline, 'modelo_obesidade.pkl')
joblib.dump(le, 'label_encoder.pkl')

print("\nüíæ Arquivos 'modelo_obesidade.pkl' e 'label_encoder.pkl' salvos com sucesso!")


üíæ Arquivos 'modelo_obesidade.pkl' e 'label_encoder.pkl' salvos com sucesso!
