In [None]:
import pandas as pd

# Read the training set from disk.
df = pd.read_csv("datasets/datos05_train.csv")

# The "ActividadFisica" column is the class label; every other column is a feature.
y_train = df["ActividadFisica"]
X_train = df.drop("ActividadFisica", axis="columns")

# Last expression of the cell, so the notebook renders the feature table.
X_train

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

# Numeric branch: impute with SimpleImputer's default settings, then standardise.
rama_numerica = Pipeline(steps=[
    ("imputador", SimpleImputer()),
    ("std", StandardScaler()),
])

# Categorical branch: impute with the most frequent value, then one-hot encode
# (handle_unknown="ignore" is passed so unseen categories do not raise).
rama_categorica = Pipeline(steps=[
    ("imputador", SimpleImputer(strategy="most_frequent")),
    ("std", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own branch.
procesador = ColumnTransformer(transformers=[
    ("num", rama_numerica, ['Peso', 'Estatura', 'Edad', 'PctGrasaCorporal']),
    ("cat", rama_categorica, ['EstadoCivil']),
])

# k-nearest-neighbours classifier with the library defaults.
modelo = KNeighborsClassifier()

# Full model = preprocessing followed by the classifier.
modeloFinal = Pipeline(steps=[
    ("Preprocesado", procesador),
    ("Clasificador", modelo),
])

# Fit on the training data and predict those same rows; the prediction array
# is the cell's output.
modeloFinal.fit(X_train, y_train).predict(X_train)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Full dataset: reload it and separate the class column from the features.
df = pd.read_csv("datasets/datos05_train.csv")
y_orig = df["ActividadFisica"]
X_orig = df.drop(columns="ActividadFisica")

# Plain hold-out: 25% of the rows are kept aside for testing (no stratification here).
X_train, X_test, y_train, y_test = train_test_split(
    X_orig,
    y_orig,
    test_size=0.25,
)

# Train on the 75% portion and predict the held-out rows.
y_pred = modeloFinal.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out rows (cell output).
accuracy_score(y_pred, y_test)


In [None]:
# Full dataset: reload it and separate the class column from the features.
df = pd.read_csv("datasets/datos05_train.csv")
y_orig = df["ActividadFisica"]
X_orig = df.drop(columns="ActividadFisica")

# Always stratify: the split is stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_orig,
    y_orig,
    test_size=0.25,
    stratify=y_orig,
)

# Train on the 75% portion and predict the held-out rows.
y_pred = modeloFinal.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out rows (cell output).
accuracy_score(y_pred, y_test)

In [None]:
from sklearn.model_selection import cross_val_score

import numpy as np

# Full dataset: reload it and separate the class column from the features.
df = pd.read_csv("datasets/datos05_train.csv")
y_orig = df["ActividadFisica"]
X_orig = df.drop(columns="ActividadFisica")

# One score per fold from a 10-fold cross-validation of the whole pipeline.
accuracies = cross_val_score(modeloFinal, X_orig, y_orig, cv=10)

# Summarise the folds: mean score first, then its standard deviation.
for resumen in (np.mean, np.std):
    print(resumen(accuracies))

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
# Full dataset: reload it and separate the class column from the features.
df = pd.read_csv("datasets/datos05_train.csv")
y_orig = df["ActividadFisica"]
X_orig = df.drop(columns="ActividadFisica")

# Explicit splitter: 10 stratified folds, with the rows shuffled before splitting.
cv = StratifiedKFold(n_splits=10, shuffle=True)

# cross_validate returns its results as a whole; keep them and show them as
# the cell output.
resultados = cross_validate(modeloFinal, X_orig, y_orig, cv=cv)
resultados

In [None]:
# Exercise 1, class 2

df = pd.read_csv("datasets/MathLearningDataset.csv")

# "Type of Answer" is the class label; every other column is a feature.
y_orig = df["Type of Answer"]
X_orig = df.drop(columns="Type of Answer")

# --- Model definition -------------------------------------------------------
# Numeric branch: impute with SimpleImputer's default settings, then standardise.
rama_numerica = Pipeline(steps=[
    ("imputador", SimpleImputer()),
    ("std", StandardScaler()),
])

# Categorical branch: impute with the most frequent value, then one-hot encode.
rama_categorica = Pipeline(steps=[
    ("imputador", SimpleImputer(strategy="most_frequent")),
    ("std", OneHotEncoder(handle_unknown="ignore")),
])

procesador = ColumnTransformer(transformers=[
    ("num", rama_numerica, ['Student Age', 'Response Time (s)']),
    ("cat", rama_categorica, ['Student Country', 'Question Level', 'Topic', 'Subtopic']),
])

# k-NN classifier with 6 neighbours.
modelo = KNeighborsClassifier(n_neighbors=6)

modeloFinal1 = Pipeline(steps=[
    ("Preprocesado", procesador),
    ("Clasificador", modelo),
])

# --- Hold-out ---------------------------------------------------------------
# Stratified 75/25 split, train on the larger part, score on the smaller one.
X_train, X_test, y_train, y_test = train_test_split(
    X_orig,
    y_orig,
    test_size=0.25,
    stratify=y_orig,
)

y_pred = modeloFinal1.fit(X_train, y_train).predict(X_test)

score = accuracy_score(y_pred, y_test)
print("Hold-Out=", score)

# --- 10-fold cross-validation -----------------------------------------------
import numpy as np

cv = StratifiedKFold(n_splits=10, shuffle=True)
accuracies = cross_val_score(modeloFinal1, X_orig, y_orig, cv=cv)

print("Validación cruzada =", np.mean(accuracies))
# Spread across folds is left disabled:
# print(np.std(accuracies))

In [None]:
from sklearn.model_selection import GridSearchCV
# Full dataset: reload it and separate the class column from the features.
df = pd.read_csv("datasets/datos05_train.csv")
X_orig = df.drop(columns=["ActividadFisica"])
y_orig = df["ActividadFisica"]

# Preprocessing: numeric columns are imputed and standardised, the categorical
# column is imputed with its most frequent value and one-hot encoded.
procesador = ColumnTransformer(transformers=[
    ("num",
     Pipeline([
         ("imputador", SimpleImputer()),
         ("std", StandardScaler())
     ]),
     ['Peso', 'Estatura', 'Edad', 'PctGrasaCorporal']
    ),
    ("cat",
     Pipeline([
         ("imputador", SimpleImputer(strategy="most_frequent")),
         ("std", OneHotEncoder(handle_unknown="ignore"))
     ]),
     ['EstadoCivil']
    )
])
modelo = KNeighborsClassifier()

# Full model = preprocessing followed by the classifier.
modeloFinal = Pipeline([
    ("Preprocesado", procesador),
    ("Clasificador", modelo)
])

# Always stratify: the test split is only used for the final score below;
# the grid search itself sees the training part exclusively.
X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig, test_size=0.25, stratify=y_orig)

# Candidate neighbour counts. The "Clasificador__" prefix routes the parameter
# to the pipeline step of that name.
parametros = {
    "Clasificador__n_neighbors": [1, 3, 5, 7, 9, 11]
}

gridSearch = GridSearchCV(modeloFinal, param_grid=parametros)
gridSearch.fit(X_train, y_train)

# Only the last expression of a cell is rendered, so a bare
# `gridSearch.best_params_` in the middle of the cell is silently discarded.
# Print it so the selected hyper-parameters are actually reported.
print("Mejores parámetros =", gridSearch.best_params_)

# Pipeline configured with the winning parameters, as exposed by the search.
mejorModelo = gridSearch.best_estimator_

# Final, one-off evaluation on the untouched test split (cell output).
y_pred = mejorModelo.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree version of the classifier, built on the same preprocessing.
df = pd.read_csv("datasets/datos05_train.csv")
X_orig = df.drop(columns=["ActividadFisica"])  # drop the class column
y_orig = df["ActividadFisica"]

# Preprocessing: numeric columns are imputed and standardised, the categorical
# column is imputed with its most frequent value and one-hot encoded.
procesador = ColumnTransformer(transformers=[
    ("num",
     Pipeline([
         ("imputador", SimpleImputer()),
         ("std", StandardScaler())
     ]),
     ['Peso', 'Estatura', 'Edad', 'PctGrasaCorporal']
    ),
    ("cat",
     Pipeline([
         ("imputador", SimpleImputer(strategy="most_frequent")),
         ("std", OneHotEncoder(handle_unknown="ignore"))
     ]),
     ['EstadoCivil']
    )
])

# The nearest-neighbours model is replaced by a decision tree.
modelo = DecisionTreeClassifier()

modeloFinal = Pipeline([
    ("Preprocesado", procesador),
    ("Clasificador", modelo)
])

# Build the training split from the data loaded in this cell. Previously the
# cell fitted on whatever X_train / y_train an earlier cell had left in the
# kernel (and never used X_orig / y_orig), so the result depended on which
# cell ran last and could even pick up a different dataset.
X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig, test_size=0.25, stratify=y_orig)

# Last expression of the cell: fit the pipeline (the fitted estimator is rendered).
modeloFinal.fit(X_train, y_train)

In [None]:
from sklearn.tree import export_text

# Pull the two fitted steps out of the pipeline by name.
arbol_ajustado = modeloFinal.named_steps["Clasificador"]
nombres_columnas = modeloFinal.named_steps["Preprocesado"].get_feature_names_out()

# Text rendering of the tree, labelled with the transformed column names.
# NOTE(review): class_names is hard-coded; confirm it matches the label order
# the classifier learned.
r = export_text(
    arbol_ajustado,
    feature_names=nombres_columnas,
    class_names=["Alta", "Baja"],
)
print(r)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Pull the two fitted steps out of the pipeline by name.
arbol_ajustado = modeloFinal.named_steps["Clasificador"]
nombres_columnas = modeloFinal.named_steps["Preprocesado"].get_feature_names_out()

# Draw the fitted tree on an 8x8 figure.
# NOTE(review): class_names is hard-coded; confirm it matches the label order
# the classifier learned.
plt.figure(figsize=(8, 8))
plot_tree(
    arbol_ajustado,
    feature_names=nombres_columnas,
    class_names=["Alta", "Baja"],
    filled=True,
    rounded=True,
    label="none",
)
plt.show()

In [None]:
# Exercise 2
# (same dataset and preprocessing as exercise 1 of class 2)

df = pd.read_csv("datasets/MathLearningDataset.csv")

X_orig = df.drop(columns=["Type of Answer"])  # drop the class column
y_orig = df["Type of Answer"]

# Preprocessing: numeric columns are imputed and standardised, categorical
# columns are imputed with their most frequent value and one-hot encoded.
procesador = ColumnTransformer(transformers=[
    ("num",
     Pipeline([
         ("imputador", SimpleImputer()),
         ("std", StandardScaler())
     ]),
     ['Student Age', 'Response Time (s)']
    ),
    ("cat",
     Pipeline([
         ("imputador", SimpleImputer(strategy="most_frequent")),
         ("std", OneHotEncoder(handle_unknown="ignore"))
     ]),
     ['Student Country', 'Question Level', 'Topic', 'Subtopic']
    )
])

# Candidate 1: k-nearest neighbours.
modelo = KNeighborsClassifier()

modeloFinal1 = Pipeline([
    ("Preprocesado", procesador),
    ("Clasificador", modelo)
])

# Candidate 2: decision tree.
modelo1 = DecisionTreeClassifier()

modeloFinal2 = Pipeline([
    ("Preprocesado", procesador),
    ("Clasificador", modelo1)
])

# Hold-out: the test part is reserved for the final comparison of both models.
X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig, test_size=0.25, stratify=y_orig)

# Search space for k-NN.
parametros1 = {
    "Clasificador__n_neighbors": [1, 3, 5, 7, 9, 11]
}

# Search space for the tree.
# None is accepted by max_depth and max_leaf_nodes (meaning "no limit"), but it
# is NOT a valid value for min_samples_split / min_samples_leaf: every grid
# point that contained it failed to fit and was scored as NaN. The
# "unconstrained" option for those two is their documented default, 2 and 1.
parametros2 = {
    "Clasificador__max_depth": [3, 5, 10, None],
    "Clasificador__min_samples_split": [5, 10, 2],
    "Clasificador__min_samples_leaf": [5, 10, 1],
    "Clasificador__max_leaf_nodes": [5, 10, None]
}

# Stratified 5-fold cross-validation with shuffling, shared by both searches.
cv = StratifiedKFold(n_splits=5, shuffle=True)

# The searches are only configured here; they are fitted in a later cell.
gridSearch = GridSearchCV(modeloFinal1, param_grid=parametros1, cv=cv)

gridSearch2 = GridSearchCV(modeloFinal2, param_grid=parametros2, cv=cv)



In [None]:
# Run the k-NN search on the training split.
gridSearch.fit(X_train, y=y_train)

# Run the decision-tree search on the same split. This is the last expression
# of the cell, so the fitted search object is what the notebook renders.
gridSearch2.fit(X_train, y=y_train)



In [None]:
# Best pipeline found by each search (k-NN first, decision tree second).
mejorModelo1, mejorModelo2 = gridSearch.best_estimator_, gridSearch2.best_estimator_

# Predictions of both tuned models on the held-out rows.
predicionKnn = mejorModelo1.predict(X_test)
predicionArbol = mejorModelo2.predict(X_test)

# Print the test accuracy of each model, k-NN first.
for prediccion in (predicionKnn, predicionArbol):
    print(accuracy_score(prediccion, y_test))