In [96]:
import pandas as pd

# Load the training set from disk.
df = pd.read_csv("datasets/datos05_train.csv")

# Split into the class labels (y) and the remaining columns (X).
y_train = df.loc[:, "ActividadFisica"]
X_train = df.drop("ActividadFisica", axis="columns")  # everything except the class column

# Show the feature frame as the cell output.
X_train

Unnamed: 0,Peso,Estatura,Edad,PctGrasaCorporal,EstadoCivil
0,80.1,172.5,74.0,17.8,casado
1,82.3,164.4,29.0,19.5,soltero
2,52.6,162.8,36.0,13.2,soltero
3,92.5,174.2,72.0,18.7,soltero
4,84.9,174.4,21.0,17.6,casado
...,...,...,...,...,...
1495,64.4,188.6,60.0,11.0,soltero
1496,84.1,178.5,18.0,10.6,casado
1497,71.5,182.0,46.0,12.0,casado
1498,72.5,169.0,57.0,14.1,casado


In [97]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

# Column-wise preprocessing:
#   - "num": impute missing values with the column mean, then standardize.
#   - "cat": impute with the most frequent value, then one-hot encode
#            (handle_unknown="ignore" so unseen categories do not raise).
procesador = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(steps=[
                ("imputador", SimpleImputer(strategy="mean")),
                ("std", StandardScaler()),
            ]),
            ["Peso", "Estatura", "Edad", "PctGrasaCorporal"],
        ),
        (
            "cat",
            Pipeline(steps=[
                ("imputador", SimpleImputer(strategy="most_frequent")),
                ("std", OneHotEncoder(handle_unknown="ignore")),
            ]),
            ["EstadoCivil"],
        ),
    ]
)

# k-nearest-neighbours classifier with scikit-learn's default settings.
modelo = KNeighborsClassifier()

# Full model: preprocessing followed by the classifier.
modeloFinal = Pipeline(steps=[
    ("Preprocesado", procesador),
    ("Clasificador", modelo),
])

# Fit on the training data and show the predictions for those same rows.
modeloFinal.fit(X_train, y_train)
modeloFinal.predict(X_train)

array([0, 1, 1, ..., 0, 0, 1])

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Full dataset
df = pd.read_csv("datasets/datos05_train.csv")
X_orig = df.drop(columns=["ActividadFisica"])  # drop the class column
y_orig = df["ActividadFisica"]

# Plain (non-stratified) hold-out split: 75% train / 25% test.
# A fixed random_state makes the split, and therefore the score, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_orig, y_orig, test_size=0.25, random_state=42
)

modeloFinal.fit(X_train, y_train)

y_pred = modeloFinal.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred), in that order.
accuracy_score(y_test, y_pred)


0.952

In [99]:
# Full dataset
df = pd.read_csv("datasets/datos05_train.csv")
X_orig = df.drop(columns=["ActividadFisica"])  # drop the class column
y_orig = df["ActividadFisica"]

# Always stratify: keeps the class proportions equal in train and test.
# A fixed random_state makes the split, and therefore the score, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_orig, y_orig, test_size=0.25, stratify=y_orig, random_state=42
)

modeloFinal.fit(X_train, y_train)

y_pred = modeloFinal.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred), in that order.
accuracy_score(y_test, y_pred)

0.944

In [100]:
from sklearn.model_selection import cross_val_score

import numpy as np

# Full dataset
df = pd.read_csv("datasets/datos05_train.csv")
y_orig = df["ActividadFisica"]
X_orig = df.drop(columns=["ActividadFisica"])  # drop the class column

# 10-fold cross-validation of the whole pipeline; one score per fold.
accuracies = cross_val_score(modeloFinal, X_orig, y_orig, cv=10)

# Report the mean and the standard deviation of the fold scores, one per line.
print(np.mean(accuracies), np.std(accuracies), sep="\n")

0.9453333333333334
0.009797958971132701


In [101]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
# Full dataset
df = pd.read_csv("datasets/datos05_train.csv")
X_orig = df.drop(columns=["ActividadFisica"])  # drop the class column
y_orig = df["ActividadFisica"]

# Shuffled, stratified 10-fold CV. With shuffle=True the fold assignment is
# random, so random_state is fixed to make the scores reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# cross_validate returns a dict; the output below shows fit_time, score_time and test_score.
resultados = cross_validate(modeloFinal, X_orig, y_orig, cv=cv)
resultados

{'fit_time': array([0.01501703, 0.01197243, 0.01199698, 0.0109756 , 0.01299667,
        0.0109942 , 0.01100659, 0.01299262, 0.01199317, 0.01197505]),
 'score_time': array([0.01396322, 0.01500535, 0.01399231, 0.01400971, 0.01399231,
        0.01397872, 0.01399183, 0.01397943, 0.01301003, 0.01399183]),
 'test_score': array([0.97333333, 0.94      , 0.94      , 0.96      , 0.93333333,
        0.92666667, 0.94666667, 0.96      , 0.92666667, 0.96666667])}

In [102]:
# Exercise 1, class 2
import numpy as np

df = pd.read_csv("datasets/MathLearningDataset.csv")

X_orig = df.drop(columns=["Type of Answer"])  # drop the class column
y_orig = df["Type of Answer"]

# Column-wise preprocessing:
#   - "num": impute missing values (SimpleImputer default strategy), then standardize.
#   - "cat": impute with the most frequent value, then one-hot encode
#            (handle_unknown="ignore" so unseen categories do not raise).
procesador = ColumnTransformer(transformers=[
    ("num",
     Pipeline([
         ("imputador", SimpleImputer()),
         ("std", StandardScaler())
     ]),
     ['Student Age', 'Response Time (s)']
    ),
    ("cat",
     Pipeline([
         ("imputador", SimpleImputer(strategy="most_frequent")),
         ("std", OneHotEncoder(handle_unknown="ignore"))
     ]),
     ['Student Country', 'Question Level', 'Topic', 'Subtopic']
    )
])

# Pass the number of neighbours by keyword so the meaning of "6" is explicit.
modelo = KNeighborsClassifier(n_neighbors=6)

modeloFinal1 = Pipeline([
    ("Preprocesado", procesador),
    ("Clasificador", modelo)
])

# --- Hold-out ---
# Stratified 75/25 split; a fixed random_state keeps the result reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_orig, y_orig, test_size=0.25, stratify=y_orig, random_state=42
)

modeloFinal1.fit(X_train, y_train)

y_pred = modeloFinal1.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred), in that order.
score = accuracy_score(y_test, y_pred)
print("Hold-Out=", score)

# --- 10-fold cross-validation ---
# Shuffled stratified folds, seeded so the mean accuracy is reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracies = cross_val_score(modeloFinal1, X_orig, y_orig, cv=cv)

print("Validación cruzada =", np.mean(accuracies))

Hold-Out= 0.6003351487222455
Validación cruzada = 0.5924002546456365


In [107]:
from sklearn.model_selection import GridSearchCV
# Full dataset
df = pd.read_csv("datasets/datos05_train.csv")
X_orig = df.drop(columns=["ActividadFisica"])  # drop the class column
y_orig = df["ActividadFisica"]

# Column-wise preprocessing:
#   - "num": impute missing values (SimpleImputer default strategy), then standardize.
#   - "cat": impute with the most frequent value, then one-hot encode
#            (handle_unknown="ignore" so unseen categories do not raise).
procesador = ColumnTransformer(transformers=[
    ("num",
     Pipeline([
         ("imputador", SimpleImputer()),
         ("std", StandardScaler())
     ]),
     ['Peso', 'Estatura', 'Edad', 'PctGrasaCorporal']
    ),
    ("cat",
     Pipeline([
         ("imputador", SimpleImputer(strategy="most_frequent")),
         ("std", OneHotEncoder(handle_unknown="ignore"))
     ]),
     ['EstadoCivil']
    )
])
modelo = KNeighborsClassifier()

modeloFinal = Pipeline([
    ("Preprocesado", procesador),
    ("Clasificador", modelo)
])

# Always stratify; a fixed random_state keeps the held-out test set stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_orig, y_orig, test_size=0.25, stratify=y_orig, random_state=42
)

# Hyper-parameter grid; the "Clasificador__" prefix routes the parameter to
# the pipeline step named "Clasificador".
parametros = {
    "Clasificador__n_neighbors": [1, 3, 5, 7, 9, 11]
}

# Shuffled stratified 5-fold CV used for model selection (seeded).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The splitter must be passed explicitly: previously `cv` was created but never
# handed to GridSearchCV, so the search silently used its default CV instead.
gridSearch = GridSearchCV(modeloFinal, param_grid=parametros, cv=cv)
gridSearch.fit(X_train, y_train)

# A bare expression in the middle of a cell is not displayed, so print it.
print(gridSearch.best_params_)

# best_estimator_ is the pipeline refit on all of X_train with the best parameters.
mejorModelo = gridSearch.best_estimator_

# Final estimate on the held-out test set, which the search never saw.
y_pred = mejorModelo.predict(X_test)
accuracy_score(y_test, y_pred)

{'Clasificador__n_neighbors': 1}