In [1]:
import zipfile
from io import BytesIO

import numpy as np
import pandas as pd
import requests
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

pd.set_option("display.max_columns", None)

## QXD0178 - Mineração de Dados
# Classificação de dados

**Professor:** Paulo de Tarso Guerra Oliveira ([paulodetarso@ufc.br](mailto:paulodetarso@ufc.br))


# Lista de Exercícios: Classificação de dados

Nesta lista de exercícios, você explorará a aplicação de métodos de aprendizado de máquina para realizar tarefas de classificação de dados. Você usará a base de dados [Food choices: College students' food and cooking preferences](https://www.kaggle.com/datasets/borapajo/food-choices?select=food_coded.csv) e avaliará vários algoritmos de classificação para determinar sua eficácia. O objetivo é entender como diferentes métodos de aprendizado de máquina se comportam em relação à acurácia na classificação de dados.

O exercício será dividido em várias etapas:

1. **Pré-processamento dos dados:**
   - Descreva brevemente o conjunto de dados   
   - Limpe o conjunto de dados, tratando valores ausentes, removendo duplicatas e realizando transformações necessárias. 
   - Caso você use os dados pré-processados na lista anterior, faça um breve descritivo dos principais ajustes.
   - Codifique variáveis categóricas, se necessário, para que possam ser utilizadas em algoritmos de aprendizado de máquina.
   - Cria a coluna `self_perception_overweight` com valor: `True` se a coluna `self_perception_weight` tem valor 4 ou 5; e `False`, caso contrário.
   - Remova a coluna `self_perception_weight` do conjunto de dados.
2. **Divisão do conjunto de dados:**
   - Divida o conjunto de dados em um conjunto de treinamento e um conjunto de teste para avaliar o desempenho dos algoritmos. 
   - O mesmo conjunto de teste deve ser usado por todos os algoritmos analizados e nenhum dado deste pode ser usado na fase de treinamento.
   - O atributo alvo (*rótulo*) da classificação será o campo `self_perception_overweight`.   
3. **Seleção de algoritmos de classificação:**
   - Selecione uma variedade de algoritmos de aprendizado de máquina para testar na tarefa de classificação.   
   - Sua seleção deve conter, no mínimo, os seguintes métodos: Naive Bayes, k-Nearest Neighbors, Support Vector Machine (Linear/RBF), Decision Trees, Random Forest, Multilayer Perceptron.
   - Descreva brevemente como funciona cada algoritmo selecionado.
4. **Treinamento e avaliação:**
   - Treine os algoritmos de classificação usando todo o conjunto de treinamento. 
   - Avalie o desempenho de cada algoritmo no conjunto de teste usando métricas como acurácia, precisão, recall e F1-score.
   - Repita a análise treinando os algoritmos com validação cruzada.
   - Repita a análise realizando ajuste de hiperparâmetros.
5. **Análise dos resultados:**
   - Prepare um texto que descreva os resultados obtidos e faça uma análise crítica destes resultados.
   - Compare o desempenho dos diferentes algoritmos e explique por que alguns apresentaram resultados mais adequados que outros.
   
Documente todas as etapas em um arquivo Jupyter Notebook (`.ipynb`) que inclua as análises, o código e as justificativas. Lembre-se de que é fundamental justificar todas as decisões tomadas ao longo do processo e documentar as análises de forma clara e concisa. Este trabalho tem como objetivo proporcionar uma compreensão prática da seleção e avaliação de algoritmos de classificação em cenários de aprendizado supervisionado.

Envie seu Jupyter Notebook até a data de entrega especificada nesta tarefa.

## Solução


### **Coleta da base de dados experimental**

In [2]:
def extract_database() -> pd.DataFrame:
    try:
        url = "https://www.kaggle.com/api/v1/datasets/download/borapajo/food-choices"
        response = requests.get(url, stream=True)
        response.raise_for_status()
    except requests.HTTPError as exc:
        raise exc

    if response.status_code == 200:
        with zipfile.ZipFile(BytesIO(response.content), "r") as zip_ref:
            # Assuming there's only one file in the zip, we'll read it directly
            # If there are multiple files, adjust the handling accordingly
            with zip_ref.open(zip_ref.namelist()[1]) as file:
                return pd.read_csv(file)
    else:
        raise Exception("Failed to download the file")

In [3]:
df = extract_database()
df.to_csv("../data/raw/conf_foods.csv", index=False)
df.describe()

Unnamed: 0,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food_reasons_coded,cook,comfort_food_reasons_coded.1,cuisine,diet_current_coded,drink,eating_changes_coded,eating_changes_coded1,eating_out,employment,ethnic_food,exercise,father_education,fav_cuisine_coded,fav_food,fries,fruit_day,grade_level,greek_food,healthy_feeling,ideal_diet_coded,income,indian_food,italian_food,life_rewarding,marital_status,mother_education,nutritional_check,on_off_campus,parents_cook,pay_meal_out,persian_food,self_perception_weight,soup,sports,thai_food,tortilla_calories,turkey_calories,veggies_day,vitamins,waffle_calories
count,125.0,125.0,125.0,106.0,124.0,125.0,106.0,122.0,125.0,108.0,125.0,123.0,125.0,125.0,125.0,116.0,125.0,112.0,124.0,125.0,123.0,125.0,125.0,125.0,125.0,125.0,125.0,124.0,125.0,125.0,124.0,124.0,122.0,125.0,124.0,125.0,125.0,124.0,124.0,124.0,123.0,125.0,124.0,125.0,125.0,125.0,125.0
mean,1.392,1.112,577.32,3.028302,505.241935,1.752,2.698113,2.786885,2.688,1.388889,1.76,1.560976,1.536,4.552,2.56,2.448276,3.744,1.589286,3.483871,2.424,1.715447,1.088,4.224,2.376,3.488,5.456,3.704,4.532258,3.152,4.728,5.104839,1.5,3.42623,3.152,1.322581,1.528,3.408,2.806452,3.120968,1.217742,1.390244,3.336,947.580645,555.04,4.008,1.512,1073.4
std,0.490161,0.316636,131.214156,0.639308,230.840506,0.43359,1.972042,1.038351,1.910987,0.974759,0.766222,0.498298,0.757159,2.547788,1.13876,0.533158,1.177093,0.665137,1.206243,1.947968,0.910056,0.284435,0.923388,1.133536,1.365567,2.585643,2.086918,1.456332,1.486802,0.587175,3.120399,0.548464,1.170703,1.205204,0.681323,0.746778,1.040285,1.423824,1.11598,0.414385,0.4898,1.436528,202.090179,152.370379,1.081337,0.501867,248.667092
min,1.0,1.0,265.0,2.0,315.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,580.0,345.0,1.0,1.0,575.0
25%,1.0,1.0,430.0,3.0,420.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,3.0,2.0,2.0,3.0,1.0,2.0,1.0,1.0,1.0,4.0,1.0,3.0,3.0,2.0,4.0,2.0,5.0,2.0,1.0,2.0,2.0,1.0,1.0,3.0,2.0,2.0,1.0,1.0,2.0,725.0,500.0,3.0,1.0,900.0
50%,1.0,1.0,610.0,3.0,420.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,1.0,4.0,2.0,2.0,4.0,1.0,4.0,1.0,1.0,1.0,5.0,2.0,4.0,5.0,3.0,5.0,3.0,5.0,5.0,1.0,4.0,3.0,1.0,1.0,3.0,3.0,3.0,1.0,1.0,3.0,940.0,500.0,4.0,2.0,900.0
75%,2.0,1.0,720.0,3.0,420.0,2.0,3.0,3.0,3.0,1.0,2.0,2.0,2.0,5.0,3.0,3.0,5.0,2.0,4.0,4.0,3.0,1.0,5.0,3.0,5.0,8.0,6.0,6.0,5.0,5.0,8.0,2.0,4.0,4.0,1.0,2.0,4.0,4.0,4.0,1.0,2.0,5.0,1165.0,690.0,5.0,2.0,1315.0
max,2.0,2.0,720.0,4.0,980.0,2.0,9.0,5.0,9.0,6.0,4.0,2.0,4.0,13.0,5.0,3.0,5.0,3.0,5.0,8.0,3.0,2.0,5.0,4.0,5.0,10.0,8.0,6.0,5.0,5.0,10.0,4.0,5.0,5.0,4.0,5.0,6.0,5.0,6.0,2.0,2.0,5.0,1165.0,850.0,5.0,2.0,1315.0


In [4]:
df.head()

Unnamed: 0,GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,cook,comfort_food_reasons_coded.1,cuisine,diet_current,diet_current_coded,drink,eating_changes,eating_changes_coded,eating_changes_coded1,eating_out,employment,ethnic_food,exercise,father_education,father_profession,fav_cuisine,fav_cuisine_coded,fav_food,food_childhood,fries,fruit_day,grade_level,greek_food,healthy_feeling,healthy_meal,ideal_diet,ideal_diet_coded,income,indian_food,italian_food,life_rewarding,marital_status,meals_dinner_friend,mother_education,mother_profession,nutritional_check,on_off_campus,parents_cook,pay_meal_out,persian_food,self_perception_weight,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight
0,2.4,2,1,430,,315.0,1,none,we dont have comfort,9.0,2.0,9,,eat good and exercise,1,1.0,eat faster,1,1,3,3.0,1,1.0,5.0,profesor,Arabic cuisine,3,1.0,rice and chicken,2,5,2,5,2,looks not oily,being healthy,8,5.0,5,5,1.0,1.0,"rice, chicken, soup",1.0,unemployed,5,1.0,1,2,5.0,3.0,1.0,1.0,1,1165.0,345,car racing,5,1,1315,187
1,3.654,1,1,610,3.0,420.0,2,"chocolate, chips, ice cream","Stress, bored, anger",1.0,3.0,1,1.0,I eat about three times a day with some snacks...,2,2.0,I eat out more than usual.,1,2,2,2.0,4,1.0,2.0,Self employed,Italian,1,1.0,"chicken and biscuits, beef soup, baked beans",1,4,4,4,5,"Grains, Veggies, (more of grains and veggies),...",Try to eat 5-6 small meals a day. While trying...,3,4.0,4,4,1.0,2.0,"Pasta, steak, chicken",4.0,Nurse RN,4,1.0,1,4,4.0,3.0,1.0,1.0,2,725.0,690,Basketball,4,2,900,155
2,3.3,1,1,720,4.0,420.0,2,"frozen yogurt, pizza, fast food","stress, sadness",1.0,1.0,1,3.0,"toast and fruit for breakfast, salad for lunch...",3,1.0,sometimes choosing to eat fast food instead of...,1,3,2,3.0,5,2.0,2.0,owns business,italian,1,3.0,"mac and cheese, pizza, tacos",1,5,3,5,6,usually includes natural ingredients; nonproce...,i would say my ideal diet is my current diet,6,6.0,5,5,7.0,2.0,"chicken and rice with veggies, pasta, some kin...",2.0,owns business,4,2.0,1,3,5.0,6.0,1.0,2.0,5,1165.0,500,none,5,1,900,I'm not answering this.
3,3.2,1,1,430,3.0,420.0,2,"Pizza, Mac and cheese, ice cream",Boredom,2.0,2.0,2,2.0,"College diet, cheap and easy foods most nights...",2,2.0,Accepting cheap and premade/store bought foods,1,3,2,3.0,5,3.0,2.0,Mechanic,Turkish,3,1.0,"Beef stroganoff, tacos, pizza",2,4,4,5,7,"Fresh fruits& vegetables, organic meats","Healthy, fresh veggies/fruits & organic foods",2,6.0,5,5,2.0,2.0,Grilled chicken \rStuffed Shells\rHomemade Chili,4.0,Special Education Teacher,2,1.0,1,2,5.0,5.0,1.0,2.0,5,725.0,690,,3,1,1315,"Not sure, 240"
4,3.5,1,1,720,2.0,420.0,2,"Ice cream, chocolate, chips","Stress, boredom, cravings",1.0,1.0,1,2.0,I try to eat healthy but often struggle becaus...,2,2.0,I have eaten generally the same foods but I do...,3,4,2,2.0,4,1.0,4.0,IT,Italian,1,3.0,"Pasta, chicken tender, pizza",1,4,4,4,6,"A lean protein such as grilled chicken, green ...",Ideally I would like to be able to eat healthi...,2,6.0,2,5,1.0,1.0,"Chicken Parmesan, Pulled Pork, Spaghetti and m...",5.0,Substance Abuse Conselor,3,1.0,1,4,2.0,4.0,1.0,1.0,4,940.0,500,Softball,4,2,760,190


### **Pré-processamento dos dados**

In [5]:
{col: df[col].unique() for col in df.columns}

{'GPA': array(['2.4', '3.654', '3.3', '3.2', '3.5', '2.25', '3.8', '3.904', '3.4',
        '3.6', '3.1', nan, '4', '2.2', '3.87', '3.7', '3.9', '2.8', '3',
        '3.65', '3.89', '2.9', '3.605', '3.83', '3.292', '3.35',
        'Personal ', '2.6', '3.67', '3.73', '3.79 bitch', '2.71', '3.68',
        '3.75', '3.92', 'Unknown', '3.77', '3.63', '3.882'], dtype=object),
 'Gender': array([2, 1], dtype=int64),
 'breakfast': array([1, 2], dtype=int64),
 'calories_chicken': array([430, 610, 720, 265], dtype=int64),
 'calories_day': array([nan,  3.,  4.,  2.]),
 'calories_scone': array([315., 420., 980.,  nan]),
 'coffee': array([1, 2], dtype=int64),
 'comfort_food': array(['none', 'chocolate, chips, ice cream',
        'frozen yogurt, pizza, fast food',
        'Pizza, Mac and cheese, ice cream', 'Ice cream, chocolate, chips ',
        'Candy, brownies and soda.',
        'Chocolate, ice cream, french fries, pretzels',
        'Ice cream, cheeseburgers, chips.', 'Donuts, ice cream, chips',
 

In [6]:
for col in df.select_dtypes(include=np.number).columns:
    df[col].fillna(df[col].median(), inplace=True)

for col in df.select_dtypes(include=object).columns:
    df[col].fillna("Desconhecido", inplace=True)


def to_numeric(x):
    try:
        return pd.to_numeric(x)
    except Exception:
        return np.nan


for col in ["GPA", "weight"]:
    df[col] = (
        df[col].astype(str).str.replace(r"[^\d.]", "", regex=True).apply(to_numeric)
    )
    df[col].fillna(df[col].median(), inplace=True)

df.head()

Unnamed: 0,GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,cook,comfort_food_reasons_coded.1,cuisine,diet_current,diet_current_coded,drink,eating_changes,eating_changes_coded,eating_changes_coded1,eating_out,employment,ethnic_food,exercise,father_education,father_profession,fav_cuisine,fav_cuisine_coded,fav_food,food_childhood,fries,fruit_day,grade_level,greek_food,healthy_feeling,healthy_meal,ideal_diet,ideal_diet_coded,income,indian_food,italian_food,life_rewarding,marital_status,meals_dinner_friend,mother_education,mother_profession,nutritional_check,on_off_campus,parents_cook,pay_meal_out,persian_food,self_perception_weight,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight
0,2.4,2,1,430,3.0,315.0,1,none,we dont have comfort,9.0,2.0,9,1.0,eat good and exercise,1,1.0,eat faster,1,1,3,3.0,1,1.0,5.0,profesor,Arabic cuisine,3,1.0,rice and chicken,2,5,2,5,2,looks not oily,being healthy,8,5.0,5,5,1.0,1.0,"rice, chicken, soup",1.0,unemployed,5,1.0,1,2,5.0,3.0,1.0,1.0,1,1165.0,345,car racing,5,1,1315,187.0
1,3.654,1,1,610,3.0,420.0,2,"chocolate, chips, ice cream","Stress, bored, anger",1.0,3.0,1,1.0,I eat about three times a day with some snacks...,2,2.0,I eat out more than usual.,1,2,2,2.0,4,1.0,2.0,Self employed,Italian,1,1.0,"chicken and biscuits, beef soup, baked beans",1,4,4,4,5,"Grains, Veggies, (more of grains and veggies),...",Try to eat 5-6 small meals a day. While trying...,3,4.0,4,4,1.0,2.0,"Pasta, steak, chicken",4.0,Nurse RN,4,1.0,1,4,4.0,3.0,1.0,1.0,2,725.0,690,Basketball,4,2,900,155.0
2,3.3,1,1,720,4.0,420.0,2,"frozen yogurt, pizza, fast food","stress, sadness",1.0,1.0,1,3.0,"toast and fruit for breakfast, salad for lunch...",3,1.0,sometimes choosing to eat fast food instead of...,1,3,2,3.0,5,2.0,2.0,owns business,italian,1,3.0,"mac and cheese, pizza, tacos",1,5,3,5,6,usually includes natural ingredients; nonproce...,i would say my ideal diet is my current diet,6,6.0,5,5,7.0,2.0,"chicken and rice with veggies, pasta, some kin...",2.0,owns business,4,2.0,1,3,5.0,6.0,1.0,2.0,5,1165.0,500,none,5,1,900,155.0
3,3.2,1,1,430,3.0,420.0,2,"Pizza, Mac and cheese, ice cream",Boredom,2.0,2.0,2,2.0,"College diet, cheap and easy foods most nights...",2,2.0,Accepting cheap and premade/store bought foods,1,3,2,3.0,5,3.0,2.0,Mechanic,Turkish,3,1.0,"Beef stroganoff, tacos, pizza",2,4,4,5,7,"Fresh fruits& vegetables, organic meats","Healthy, fresh veggies/fruits & organic foods",2,6.0,5,5,2.0,2.0,Grilled chicken \rStuffed Shells\rHomemade Chili,4.0,Special Education Teacher,2,1.0,1,2,5.0,5.0,1.0,2.0,5,725.0,690,Desconhecido,3,1,1315,240.0
4,3.5,1,1,720,2.0,420.0,2,"Ice cream, chocolate, chips","Stress, boredom, cravings",1.0,1.0,1,2.0,I try to eat healthy but often struggle becaus...,2,2.0,I have eaten generally the same foods but I do...,3,4,2,2.0,4,1.0,4.0,IT,Italian,1,3.0,"Pasta, chicken tender, pizza",1,4,4,4,6,"A lean protein such as grilled chicken, green ...",Ideally I would like to be able to eat healthi...,2,6.0,2,5,1.0,1.0,"Chicken Parmesan, Pulled Pork, Spaghetti and m...",5.0,Substance Abuse Conselor,3,1.0,1,4,2.0,4.0,1.0,1.0,4,940.0,500,Softball,4,2,760,190.0


In [7]:
# df["Gender"] = df["Gender"].apply(lambda x: "male" if x != 1 else "female")
# df["breakfast"] = df["breakfast"].apply(lambda x: "good" if x == 1 else "bad")
df.drop(columns=["calories_chicken", "calories_day", "calories_scone"], inplace=True)
# df["coffee"] = df["coffee"].apply(lambda x: "good" if x != 1 else "bad")
df.head()

Unnamed: 0,GPA,Gender,breakfast,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,cook,comfort_food_reasons_coded.1,cuisine,diet_current,diet_current_coded,drink,eating_changes,eating_changes_coded,eating_changes_coded1,eating_out,employment,ethnic_food,exercise,father_education,father_profession,fav_cuisine,fav_cuisine_coded,fav_food,food_childhood,fries,fruit_day,grade_level,greek_food,healthy_feeling,healthy_meal,ideal_diet,ideal_diet_coded,income,indian_food,italian_food,life_rewarding,marital_status,meals_dinner_friend,mother_education,mother_profession,nutritional_check,on_off_campus,parents_cook,pay_meal_out,persian_food,self_perception_weight,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight
0,2.4,2,1,1,none,we dont have comfort,9.0,2.0,9,1.0,eat good and exercise,1,1.0,eat faster,1,1,3,3.0,1,1.0,5.0,profesor,Arabic cuisine,3,1.0,rice and chicken,2,5,2,5,2,looks not oily,being healthy,8,5.0,5,5,1.0,1.0,"rice, chicken, soup",1.0,unemployed,5,1.0,1,2,5.0,3.0,1.0,1.0,1,1165.0,345,car racing,5,1,1315,187.0
1,3.654,1,1,2,"chocolate, chips, ice cream","Stress, bored, anger",1.0,3.0,1,1.0,I eat about three times a day with some snacks...,2,2.0,I eat out more than usual.,1,2,2,2.0,4,1.0,2.0,Self employed,Italian,1,1.0,"chicken and biscuits, beef soup, baked beans",1,4,4,4,5,"Grains, Veggies, (more of grains and veggies),...",Try to eat 5-6 small meals a day. While trying...,3,4.0,4,4,1.0,2.0,"Pasta, steak, chicken",4.0,Nurse RN,4,1.0,1,4,4.0,3.0,1.0,1.0,2,725.0,690,Basketball,4,2,900,155.0
2,3.3,1,1,2,"frozen yogurt, pizza, fast food","stress, sadness",1.0,1.0,1,3.0,"toast and fruit for breakfast, salad for lunch...",3,1.0,sometimes choosing to eat fast food instead of...,1,3,2,3.0,5,2.0,2.0,owns business,italian,1,3.0,"mac and cheese, pizza, tacos",1,5,3,5,6,usually includes natural ingredients; nonproce...,i would say my ideal diet is my current diet,6,6.0,5,5,7.0,2.0,"chicken and rice with veggies, pasta, some kin...",2.0,owns business,4,2.0,1,3,5.0,6.0,1.0,2.0,5,1165.0,500,none,5,1,900,155.0
3,3.2,1,1,2,"Pizza, Mac and cheese, ice cream",Boredom,2.0,2.0,2,2.0,"College diet, cheap and easy foods most nights...",2,2.0,Accepting cheap and premade/store bought foods,1,3,2,3.0,5,3.0,2.0,Mechanic,Turkish,3,1.0,"Beef stroganoff, tacos, pizza",2,4,4,5,7,"Fresh fruits& vegetables, organic meats","Healthy, fresh veggies/fruits & organic foods",2,6.0,5,5,2.0,2.0,Grilled chicken \rStuffed Shells\rHomemade Chili,4.0,Special Education Teacher,2,1.0,1,2,5.0,5.0,1.0,2.0,5,725.0,690,Desconhecido,3,1,1315,240.0
4,3.5,1,1,2,"Ice cream, chocolate, chips","Stress, boredom, cravings",1.0,1.0,1,2.0,I try to eat healthy but often struggle becaus...,2,2.0,I have eaten generally the same foods but I do...,3,4,2,2.0,4,1.0,4.0,IT,Italian,1,3.0,"Pasta, chicken tender, pizza",1,4,4,4,6,"A lean protein such as grilled chicken, green ...",Ideally I would like to be able to eat healthi...,2,6.0,2,5,1.0,1.0,"Chicken Parmesan, Pulled Pork, Spaghetti and m...",5.0,Substance Abuse Conselor,3,1.0,1,4,2.0,4.0,1.0,1.0,4,940.0,500,Softball,4,2,760,190.0


In [8]:
util = df[
    [
        "GPA",
        "Gender",
        "grade_level",
        "employment",
        "eating_out",
        "nutritional_check",
        "on_off_campus",
        # "exercice",
        "breakfast",
        "coffee",
        "self_perception_weight",
        "type_sports",
        "sports",
        "weight",
    ]
].copy()
util.loc[:, "self_perception_overweight"] = util["self_perception_weight"].apply(
    lambda x: True if x > 3 else False
)
util.drop(columns=["self_perception_weight"], inplace=True)
util.to_csv("../data/interim/filtered.csv", index=False)
# 4. Remoção de caracteres irregulares:
util.head()

Unnamed: 0,GPA,Gender,grade_level,employment,eating_out,nutritional_check,on_off_campus,breakfast,coffee,type_sports,sports,weight,self_perception_overweight
0,2.4,2,2,3.0,3,5,1.0,1,1,car racing,1.0,187.0,False
1,3.654,1,4,2.0,2,4,1.0,1,2,Basketball,1.0,155.0,False
2,3.3,1,3,3.0,2,4,2.0,1,2,none,2.0,155.0,True
3,3.2,1,4,3.0,2,2,1.0,1,2,Desconhecido,2.0,240.0,True
4,3.5,1,4,2.0,2,3,1.0,1,2,Softball,1.0,190.0,True


In [9]:
fisicos = [
    "basketball",
    "softball",
    "soccer",
    "field hockey",
    "running",
    "hockey",
    "dancing",
    "volleyball",
    "gym",
]
util["type_sports"] = util["type_sports"].apply(
    lambda x: 1 if x.lower() in fisicos else 0
)

### Divisão do conjunto de dados

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    util.drop(columns=["self_perception_overweight"]),
    util["self_perception_overweight"],
    test_size=0.2,
    random_state=42,
)
# X_train["target"] = y_train
# X_test["target"] = y_test
# X_train.to_csv("../data/processed/train.csv", index=False)
# X_test.to_csv("../data/processed/test.csv", index=False)

### Avaliação dos modelos

In [11]:
def create_models():
    gnb = GaussianNB()
    knn = KNeighborsClassifier(n_neighbors=5)
    svm_linear = make_pipeline(StandardScaler(), SVC(kernel="linear", probability=True))
    svm_rbf = make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True))
    dt = DecisionTreeClassifier(random_state=42)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    mlp = make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    )
    return gnb, knn, svm_linear, svm_rbf, dt, rf, mlp

In [12]:
gnb, knn, svm_linear, svm_rbf, dt, rf, mlp = create_models()

models = [
    ("Naive Bayes", gnb),
    ("k-NN", knn),
    ("SVM Linear", svm_linear),
    ("SVM RBF", svm_rbf),
    ("Decision Tree", dt),
    ("Random Forest", rf),
    ("MLP", mlp),
]

# Avaliar cada modelo
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Modelo: {name}")
    print(f"Acurácia: {accuracy_score(y_test, y_pred):.2f}")
    print(f"Precisão: {precision_score(y_test, y_pred):.2f}")
    print(f"Recall: {recall_score(y_test, y_pred):.2f}")
    print(f"F1-score: {f1_score(y_test, y_pred):.2f}")
    print("=" * 30)


Modelo: Naive Bayes
Acurácia: 0.72
Precisão: 0.67
Recall: 0.44
F1-score: 0.53
Modelo: k-NN
Acurácia: 0.72
Precisão: 1.00
Recall: 0.22
F1-score: 0.36
Modelo: SVM Linear
Acurácia: 0.76
Precisão: 0.80
Recall: 0.44
F1-score: 0.57
Modelo: SVM RBF
Acurácia: 0.68
Precisão: 0.67
Recall: 0.22
F1-score: 0.33
Modelo: Decision Tree
Acurácia: 0.48
Precisão: 0.33
Recall: 0.44
F1-score: 0.38
Modelo: Random Forest
Acurácia: 0.64
Precisão: 0.50
Recall: 0.22
F1-score: 0.31


Modelo: MLP
Acurácia: 0.52
Precisão: 0.38
Recall: 0.56
F1-score: 0.45


In [13]:
for name, model in models:
    scores = cross_val_score(model, X_test, y_test, cv=5, scoring="accuracy")
    print(f"Modelo: {name}")
    print(f"Acurácia média (Val. Cruzada): {scores.mean():.2f}")
    print("=" * 30)

Modelo: Naive Bayes
Acurácia média (Val. Cruzada): 0.48
Modelo: k-NN
Acurácia média (Val. Cruzada): 0.64
Modelo: SVM Linear
Acurácia média (Val. Cruzada): 0.60
Modelo: SVM RBF
Acurácia média (Val. Cruzada): 0.56
Modelo: Decision Tree
Acurácia média (Val. Cruzada): 0.48
Modelo: Random Forest
Acurácia média (Val. Cruzada): 0.52
Modelo: MLP
Acurácia média (Val. Cruzada): 0.56


In [14]:
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10],
    },
    cv=5,
    scoring="accuracy",
).fit(X_train, y_train)

print("Melhores hiperparâmetros para Random Forest:")
print(grid_search_rf.best_params_)
print(f"Melhor acurácia (Val. Cruzada): {grid_search_rf.best_score_:.2f}")

Melhores hiperparâmetros para Random Forest:
{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Melhor acurácia (Val. Cruzada): 0.72


In [15]:
grid_search_svm = GridSearchCV(
    svm_rbf,
    {
        "svc__C": [0.1, 1, 10],
        "svc__gamma": ["scale", 0.1, 1],
        "svc__kernel": ["rbf", "linear"],
    },
    cv=5,
    scoring="accuracy",
).fit(X_train, y_train)

print("Melhores hiperparâmetros para SVM:")
print(grid_search_svm.best_params_)
print(f"Melhor acurácia (Val. Cruzada): {grid_search_svm.best_score_:.2f}")

Melhores hiperparâmetros para SVM:
{'svc__C': 1, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
Melhor acurácia (Val. Cruzada): 0.75


### Treinamento e avaliação

In [16]:
gnb, knn, svm_linear, svm_rbf, dt, rf, mlp = create_models()

voting_clf = VotingClassifier(
    estimators=[
        ("gnb", gnb),
        ("knn", knn),
        ("svm_linear", svm_linear),
        ("svm_rbf", svm_rbf),
        ("dt", dt),
        ("rf", rf),
        ("mlp", mlp),
    ],
    voting="soft",
).fit(X_train, y_train)

print(f"Acurácia: {accuracy_score(y_test, voting_clf.predict(X_test)):.2f}")

Acurácia: 0.68


### Análise dos resultados