Preparació de les dades

In [10]:
import pandas as pd
import seaborn as sns

# Load seaborn's "penguins" example dataset and discard every row that has at
# least one missing value. The result of dropna() is assigned instead of
# mutating the frame with inplace=True, which keeps the cell idempotent on
# re-run and follows the pandas guidance to avoid in-place operations.
# The original row labels are kept (no reset_index), so the index has gaps.
df = sns.load_dataset("penguins").dropna()

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


Separació en conjunts d'entrenament (80%) i test (20%)

In [11]:
from sklearn.model_selection import train_test_split

# Label vector: the "species" column. Feature matrix: every other column.
y = df["species"]
X = df.drop(columns="species")

# Hold out 20% of the rows for testing, stratified on the label and with a
# fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Report the size of each partition.
print("Dades d'entrenament", len(X_train))
print("Dades de prova", len(X_test))

Dades d'entrenament 266
Dades de prova 67


Codificació de les variables categòriques i estandardització

In [12]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# Turn each row into a {column: value} dict, the input format DictVectorizer
# expects.
X_train_dict = X_train.to_dict(orient='records')
X_test_dict = X_test.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then encode both partitions
# with it (dense output, since sparse=False).
dv = DictVectorizer(sparse=False).fit(X_train_dict)
X_train_encoded = dv.transform(X_train_dict)
X_test_encoded = dv.transform(X_test_dict)

# Standardize the encoded features; the scaler's statistics also come from
# the training partition only and are reused for the test partition.
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


Entrenament dels models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Candidate classifiers, keyed by the display name used in the printed report.
# All estimators use their default hyper-parameters. The decision tree is the
# exception: it gets a fixed random_state because scikit-learn permutes the
# features at each split, so without a seed the fitted tree (and its accuracy)
# may differ between runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=1),
    "KNN": KNeighborsClassifier(),
}

# Train each model on the scaled training data and report its accuracy on the
# scaled test data.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.2f}")


Logistic Regression Accuracy: 1.00
SVM Accuracy: 1.00
Decision Tree Accuracy: 1.00
KNN Accuracy: 1.00


Serialització dels models

In [22]:
import pickle

# Write one pickle file per model into the current working directory.
# Each file holds the tuple (dv, scaler, model) -- presumably so the fitted
# vectorizer and scaler can be re-applied to new data before predicting.
# File name: the display name lower-cased, spaces replaced by underscores,
# followed by "_model.pkl".
# NOTE: pickle files must only ever be loaded from trusted sources.
for name, model in models.items():
    with open(f'{name.replace(" ", "_").lower()}_model.pkl', 'wb') as f:
        pickle.dump((dv, scaler, model), f)
    # Report success only after the with-block has closed the file, so the
    # message is never shown for a file that failed to flush or close.
    print(f'Model {name} serialitzat correctament.')


Model Logistic Regression serialitzat correctament.
Model SVM serialitzat correctament.
Model Decision Tree serialitzat correctament.
Model KNN serialitzat correctament.
