In [17]:
## lo básico
import pandas as pd
import numpy as np

## visualización
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

## pre procesado
from sklearn.model_selection import train_test_split
#from ydata_profiling import ProfileReport

## modelado
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from sklearn.dummy import DummyClassifier, DummyRegressor




## métricas
from pandas.plotting import scatter_matrix
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve, roc_auc_score
from sklearn.metrics import classification_report, average_precision_score, ConfusionMatrixDisplay,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import label_binarize

## mejora de modelos
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score


## automatización
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# 1. Carga de datos

In [2]:
url = "https://raw.githubusercontent.com/Valeriavinasl/ucm-tfm/main/data/F_E_Encuesta.csv"

df = pd.read_csv(url, sep="\t", engine="python")
df.head()

Unnamed: 0,edad,is_condicion_fisica,is_camino_realizado_prev,dias,distancia,is_calzado_adecuado,is_tr_mochila,is_reserva_aloj,nota,intensidad_km_dia,...,temporada_Primavera,temporada_Verano,grupo_edad_30-39,grupo_edad_40-49,grupo_edad_50-59,grupo_edad_60-70,is_volver_binaria,is_volver_talvez,is_volver_a_hacer_1.0,is_lesion_1
0,32.0,0,0,5.0,100,0,0,1,9,20.0,...,0,1,1,0,0,0,1,0,1,1
1,32.0,0,0,5.0,100,0,0,1,9,20.0,...,0,1,1,0,0,0,1,0,1,0
2,32.0,0,1,6.0,100,1,1,1,8,16.666667,...,1,0,1,0,0,0,1,0,1,1
3,32.0,0,0,6.0,100,1,1,1,9,16.666667,...,1,0,1,0,0,0,1,0,1,0
4,21.0,0,1,6.0,100,1,1,1,10,16.666667,...,0,1,0,0,0,0,1,0,1,1


# 2. Modelado supervisado (clasificación y regresión)
Objetivos

In [4]:
X = df.drop(columns=['nota', 'is_lesion_1', 'is_volver_a_hacer_1.0',
                        'is_tr_mochila', 'is_reserva_aloj', 'intensidad_km_dia',
                        'ruta_Francés', 'ruta_Inglés', 'ruta_Portugués', 'ruta_Primitivo', 'ruta_del Norte',
                        'epoca_Otoño', 'epoca_Primavera', 'epoca_Verano'])


In [5]:
def eval_clf(X, y, model, name):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(Xtr, ytr)
    ypred = model.predict_proba(Xte)[:,1]
    acc = accuracy_score(yte, (ypred > 0.5).astype(int))
    print(f"{name}: ACC = {acc:.3f}")
    return model, model.predict_proba(X)[:,1]  # devuelve proba completa


In [6]:
def eval_reg(X, y, model, name):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(Xtr, ytr)
    ypred = model.predict(Xte)
    mae = mean_absolute_error(yte, ypred)
    rmse = np.sqrt(mean_squared_error(yte, ypred))
    print(f"{name}: MAE={mae:.2f}, RMSE={rmse:.2f}")
    return model, model.predict(X)

In [7]:
# 3. Modelos de clasificación (probabilidades)

clf_mochila, prob_mochila = eval_clf(X, df['is_tr_mochila'], RandomForestClassifier(), "Prob transporte mochila")
clf_reserva, prob_reserva = eval_clf(X, df['is_reserva_aloj'], RandomForestClassifier(), "Prob reserva alojamiento")
clf_lesion, prob_lesion = eval_clf(X, df['is_lesion_1'], RandomForestClassifier(), "Prob lesión")
clf_volver, prob_volver = eval_clf(X, df['is_volver_a_hacer_1.0'], RandomForestClassifier(), "Prob volver")


Prob transporte mochila: ACC = 0.855
Prob reserva alojamiento: ACC = 0.690
Prob lesión: ACC = 0.963
Prob volver: ACC = 1.000


In [25]:
# Ejemplo con prob_lesion
Xtr, Xte, ytr, yte = train_test_split(X, df['is_lesion_1'], test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(Xtr, ytr)
ypred = clf.predict(Xte)
yprob = clf.predict_proba(Xte)[:,1]

acc = accuracy_score(yte, ypred)
print("Accuracy:", acc)


auc = roc_auc_score(yte, yprob)
print("ROC-AUC:", auc)


print(confusion_matrix(yte, ypred))

Accuracy: 0.965
ROC-AUC: 0.8509928385416666
[[381   3]
 [ 11   5]]


In [26]:
# Ejemplo con prob_volver_pred
Xtr, Xte, ytr, yte = train_test_split(X, df['is_volver_a_hacer_1.0'], test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(Xtr, ytr)
ypred = clf.predict(Xte)
yprob = clf.predict_proba(Xte)[:,1]


acc = accuracy_score(yte, ypred)
print("Accuracy:", acc)


auc = roc_auc_score(yte, yprob)
print("ROC-AUC:", auc)

print(confusion_matrix(yte, ypred))

Accuracy: 1.0
ROC-AUC: 1.0
[[ 22   0]
 [  0 378]]


In [14]:
# 4. Modelos de regresión

reg_intensidad, pred_intensidad = eval_reg(X, df['intensidad_km_dia'], RandomForestRegressor(), "Intensidad km/día")
reg_nota, pred_nota = eval_reg(X, df['nota'], RandomForestRegressor(), "Nota esperada")


Intensidad km/día: MAE=0.03, RMSE=0.29
Nota esperada: MAE=0.51, RMSE=0.61


In [27]:
Xtr, Xte, ytr, yte = train_test_split(X, df['nota'], test_size=0.2, random_state=42)

reg = RandomForestRegressor()
reg.fit(Xtr, ytr)
ypred = reg.predict(Xte)

mae = mean_absolute_error(yte, ypred)
rmse = np.sqrt(mean_squared_error(yte, ypred))
r2 = r2_score(yte, ypred)

print(f"MAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.2f}")

NameError: name 'r2_score' is not defined

In [20]:
# 5. Recomendación con vecinos (ruta y época)
# Ruta
y_ruta = df[['ruta_Francés', 'ruta_Inglés', 'ruta_Portugués', 'ruta_Primitivo', 'ruta_del Norte']].idxmax(axis=1)
knn_ruta = KNeighborsClassifier(n_neighbors=5)
knn_ruta.fit(X, y_ruta)
ruta_recom = knn_ruta.predict(X)

# Época
y_epoca = df[['epoca_Otoño', 'epoca_Primavera', 'epoca_Verano']].idxmax(axis=1)
knn_epoca = KNeighborsClassifier(n_neighbors=5)
knn_epoca.fit(X, y_epoca)
epoca_recom = knn_epoca.predict(X)

In [None]:
def saca_metricas_multiclase(y_true, y_pred, y_proba=None, labels=None):
    print('Matriz de Confusión')
    print(confusion_matrix(y_true, y_pred))
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Precision (macro):', precision_score(y_true, y_pred, average='macro'))
    print('Recall (macro):', recall_score(y_true, y_pred, average='macro'))
    print('F1 Score (macro):', f1_score(y_true, y_pred, average='macro'))

    if y_proba is not None and labels is not None:
        y_true_bin = label_binarize(y_true, classes=labels)
        roc_auc = roc_auc_score(y_true_bin, y_proba, average="macro", multi_class="ovr")
        print('AUC (macro, OvR):', roc_auc)

In [None]:
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)
labels = model.classes_

saca_metricas_multiclase(y_val, y_pred, y_proba=y_proba, labels=labels)

In [21]:
df_final = df.copy()
df_final['prob_transporte_mochila_pred'] = prob_mochila
df_final['prob_reserva_aloj_pred'] = prob_reserva
df_final['prob_lesion_pred'] = prob_lesion
df_final['prob_volver_pred'] = prob_volver
df_final['intensidad_km_dia_pred'] = pred_intensidad
df_final['nota_pred'] = pred_nota
df_final['ruta_recom_vecinos'] = ruta_recom
df_final['epoca_recom_vecinos'] = epoca_recom

In [22]:
df_final.to_csv("Salida_Modelo_Mejorado.csv", index=False)