# Imports y configuraciones iniciales

In [None]:
# Standard-library imports used by the notebook.
import warnings

from typing import List

# Ignore every warning category process-wide; this also hides warnings raised
# by the third-party libraries used in later cells.
warnings.filterwarnings("ignore")

In [None]:
# Make the parent directory importable so modules one level up can be found.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again and again.
import sys

if '../' not in sys.path:
    sys.path.append('../')

Seteamos las configuraciones del Logging

In [None]:
import logging

# Notebook-level logger; DEBUG so that every message emitted below is handled.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

# Shared layout: level, timestamp, source file and line, then the message.
formatter = logging.Formatter("%(levelname)s: (%(asctime)s) [%(filename)s: %(lineno)s] %(message)s")

# Configure handlers only once so re-running the cell does not duplicate output.
if not log.hasHandlers():
    # The file handler is prepared but deliberately left detached (see the
    # commented-out line). delay=True postpones opening "logging.log" until a
    # record is actually emitted, so the detached handler no longer creates the
    # file or keeps an open descriptor around.
    file_handler = logging.FileHandler("logging.log", delay=True)
    file_handler.setFormatter(formatter)
    # log.addHandler(file_handler)

    # Console handler: the only one attached to the logger.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    log.addHandler(stream_handler)

Instalar la librería plot-likert y otras librerías útiles

In [None]:
# Librería para hacer gráficos Likert
# !pip install plot-likert

# Para obtener datos de excel
# !pip install openpyxl

# Para tener un transformador de data
# !pip install -U scikit-learn

# Para tener herramientas estadísticas
# !python -m pip install statsmodels

# Para tener Seaborn
# !pip install seaborn

Empezamos importando la librería para verificar que estuvo bien instalada.

In [None]:
import plot_likert

Importamos algunas librerías útiles para el resto del notebook

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# Build the data directory from path components instead of a raw string with a
# Windows separator: on POSIX systems r"..\data" is a single directory name
# containing a backslash, not "data" inside the parent directory.
DATA_PATH = Path("..") / "data"

if not DATA_PATH.exists():
    log.info(f"Creando carpeta {DATA_PATH = }")
    # exist_ok covers the case where the directory appears between the check
    # above and this call.
    DATA_PATH.mkdir(exist_ok=True)

In [None]:
# Location of the survey workbook inside the data directory.
QNA_COMPLETE_PATH = DATA_PATH.joinpath("questions-and-answers-complete.xlsx")
QNA_COMPLETE_PATH

In [None]:
# Read the workbook into a DataFrame and preview its first five rows.
log.debug(f"Importando datos de {QNA_COMPLETE_PATH}")
df = pd.read_excel(io=QNA_COMPLETE_PATH)

df.head(n=5)

In [None]:
# Every column after the first one is treated as a question.
questions = df.columns[1:].tolist()
questions

In [None]:
import copy  # no longer needed by this cell; kept in case other cells rely on it


log.debug("Renombrando columnas")
# Derive the short labels from the frame itself rather than from `questions`,
# which this cell overwrites below. The previous version copied `questions`,
# so running the cell a second time produced one label too many and the column
# assignment failed with a length mismatch.
questions_copy = [f"P{i}" for i in range(1, len(df.columns))]

# The first column becomes "Notas"; the remaining ones become P1..Pn in order.
df.columns = questions = ["Notas"] + questions_copy


df.head()

# Reescalamiento y codificación

In [None]:
# Answer labels of the survey scale, listed from "Nunca" up to "Siempre".
# The position in this list matters: the encoding cell maps each label to
# (its index - 2).
scales = [
    "Nunca",
    "Rara vez",
    "Ocasionalmente",
    "Casi siempre",
    "Siempre, en todas las clases"
]

Codificamos las respuestas para hacer la regresión

In [None]:
from collections import OrderedDict

# Map each answer label to a centred integer score: first label -> -2, last -> 2.
encoder = {name: i - 2 for i, name in enumerate(scales)}

# Collect the encoded columns under their own name so that `df_enc` is only
# ever a DataFrame (it used to be an OrderedDict first and a DataFrame later).
encoded_columns = OrderedDict(
    (col_name, df[col_name].map(encoder)) for col_name in questions_copy
)

notas = df["Notas"]
# Series.min()/max() skip missing values, whereas the builtins min()/max()
# compare element by element and give order-dependent results when a NaN is
# present. Computing each bound once also avoids scanning the column 3 times.
nota_min, nota_max = notas.min(), notas.max()

# 1 when the grade is at least 6, otherwise 0.
encoded_columns["Notas binaria"] = (notas >= 6).astype(int)
# Min-max scaling of the grade to the [0, 1] interval.
encoded_columns["Notas continuas"] = (notas - nota_min) / (nota_max - nota_min)

df_enc = pd.DataFrame(encoded_columns)
df_enc

In [None]:
# Boolean mask: True where the grade is at least 6.
df["Notas"].ge(6)

In [None]:
# Summary statistics including the 0%, 10%, ..., 90% percentiles.
decile_fractions = [step / 10 for step in range(10)]
df_enc.describe(percentiles=decile_fractions)

# Separación entrenamiento-test

In [None]:
import statsmodels.api as sm

Asignamos como variable endógena (la variable dependiente) las notas (recordando que es 1 si la nota es mayor o igual a 6 y 0 en otro caso).

In [None]:
# Move the binary target out of the feature frame and keep it as an (n, 1) array.
# `pop` removes the column, so a second run of this cell used to raise KeyError;
# the guard makes the cell safe to re-run while leaving the stored target intact.
if "Notas binaria" in df_enc.columns:
    df_enc.endog = np.array(df_enc.pop("Notas binaria")).reshape((-1, 1))

In [None]:
# Move the continuous target out of the feature frame and keep it as an (n, 1)
# array. `pop` removes the column, so a second run of this cell used to raise
# KeyError; the guard makes the cell safe to re-run.
if "Notas continuas" in df_enc.columns:
    df_enc.endog_lin = np.array(df_enc.pop("Notas continuas")).reshape((-1, 1))

# df_enc.endog

In [None]:
# Mean of the binary target, i.e. the share of rows labelled 1.
df_enc.endog.mean()

In [None]:
# Mean of the min-max scaled grades.
df_enc.endog_lin.mean()

Y asignamos como variables exógenas las respuestas a las preguntas.

In [None]:
# Exogenous matrix: the columns left in df_enc plus the constant column that
# statsmodels adds (placed first, which is add_constant's default).
df_enc.exog = sm.add_constant(df_enc, prepend=True)
df_enc.exog

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows. Both calls use the same seed and the same exogenous
# matrix, so the binary and the continuous targets share one partition.
split_options = dict(test_size=0.10, random_state=42)

(
    df_enc.exog_train,
    df_enc.exog_test,
    df_enc.endog_train,
    df_enc.endog_test,
) = train_test_split(df_enc.exog, df_enc.endog, **split_options)

(
    df_enc.exog_lin_train,
    df_enc.exog_lin_test,
    df_enc.endog_lin_train,
    df_enc.endog_lin_test,
) = train_test_split(df_enc.exog, df_enc.endog_lin, **split_options)
print(f"{len(df_enc.exog_train) = },\n{len(df_enc.exog_test) = },\n{len(df_enc.endog_train) = },\n{len(df_enc.endog_test) = }")

En el índice estarían los alumnos para el test

In [None]:
# Display the held-out feature rows produced by the split above.
df_enc.exog_test

In [None]:
# Original grades of the rows that ended up in the test partition.
df.loc[df_enc.exog_test.index, "Notas"]

# Primer modelo de regresión logística

Creamos el modelo y lo ajustamos. Luego revisamos los parámetros aprendidos

In [None]:
# Logistic regression of the binary target on the full training matrix.
logit_mod = sm.Logit(endog=df_enc.endog_train, exog=df_enc.exog_train)
logit_res = logit_mod.fit()

logit_res.summary()

A partir de aquí notamos que las preguntas 21, 23 y 30 son significativas al $5\%$. Revisemos cuál es el error cuadrático medio para obtener una comparativa.

In [None]:
from sklearn.metrics import mean_squared_error

# Test-set MSE between the 0/1 labels and the predicted probabilities.
mean_squared_error(
    df_enc.endog_test.ravel(),
    logit_res.predict(df_enc.exog_test),
)

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set MSE between the 0/1 labels and the predicted probabilities.
mean_squared_error(
    df_enc.endog_train.ravel(),
    logit_res.predict(df_enc.exog_train),
)

In [None]:
# Test-set MSE after turning probabilities into 0/1 classes at a 0.5 threshold.
mean_squared_error(
    df_enc.endog_test.ravel(),
    (logit_res.predict(df_enc.exog_test) >= 0.5).astype(int),
)

In [None]:
# Training-set MSE after turning probabilities into 0/1 classes at a 0.5 threshold.
mean_squared_error(
    df_enc.endog_train.ravel(),
    (logit_res.predict(df_enc.exog_train) >= 0.5).astype(int),
)

Notando que 
\begin{align}
0\leq (y_{true} - \hat y_{pred}) ^2  \leq 1
\end{align}
pues el valor de la predicción siempre está en el intervalo $[0, 1]$, notamos que este error resulta ser satisfactorio, pues de no ser así, al menos tendería a $0.5$, cosa que no sucede en este caso.

In [None]:

# Rows (students) inspected one by one in the following cells.
alumnos = [
    20,
    81,
    10,
    36,
    39,
]
alumnos

In [None]:
# Predicted probabilities for the selected rows (taken by position).
logit_res.predict(exog=df_enc.exog.iloc[alumnos])

In [None]:
# Hard 0/1 class for the same rows, using 0.5 as the decision threshold.
sampled_probabilities = logit_res.predict(df_enc.exog.iloc[alumnos])
(sampled_probabilities >= 0.5).astype(int)

In [None]:
# True binary labels of the selected rows (endog has a single column).
df_enc.endog[alumnos, 0]

In [None]:
# `alumnos` is used as row positions in the neighbouring cells (.iloc and plain
# array indexing), so select positionally here too instead of by index label;
# the two only coincide while the frame keeps its default 0..n-1 index.
df["Notas"].iloc[alumnos]

In [None]:
# Regresión con variables

Se determinaron, a través del factor de inflación de la varianza (VIF), las variables con baja multicolinealidad, que son adecuadas para hacer una regresión. Se utilizarán estas variables en el modelo. Empezaremos haciendo una regresión con aquellas que tengan un VIF menor o igual a 5 (como se sugiere).

## $VIF \leq 5$

In [None]:
# Columns used by the VIF <= 5 models. "const" (the intercept column created by
# sm.add_constant) comes first: statsmodels does not add an intercept on its
# own, so subsetting the design matrix to the question columns alone fitted and
# evaluated these models without one.
vif_leq_5 = [
    "const",
    "P14",
    "P15",
    "P16",
    "P17",
    "P20",
    "P25",
    "P27",
    "P30",
    "P31",
]

In [None]:
# Logistic model restricted to the columns listed in vif_leq_5.
logit_mod_VIF_5 = sm.Logit(
    endog=df_enc.endog_train,
    exog=df_enc.exog_train.loc[:, vif_leq_5],
)
logit_res = logit_mod_VIF_5.fit()

logit_res.summary()

In [None]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the predicted probabilities for the vif_leq_5 model.
mean_squared_error(
    df_enc.endog_test.ravel(),
    logit_res.predict(df_enc.exog_test[vif_leq_5]),
)

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set MSE of the predicted probabilities for the vif_leq_5 model.
mean_squared_error(
    df_enc.endog_train.ravel(),
    logit_res.predict(df_enc.exog_train[vif_leq_5]),
)

In [None]:
# Test-set MSE of the 0/1 classes (0.5 threshold) for the vif_leq_5 model.
mean_squared_error(
    df_enc.endog_test.ravel(),
    (logit_res.predict(df_enc.exog_test[vif_leq_5]) >= 0.5).astype(int),
)

## $VIF \leq 12$

In [None]:
# Columns used by the VIF <= 12 models. "const" (the intercept column created
# by sm.add_constant) comes first: statsmodels does not add an intercept on its
# own, so subsetting the design matrix to the question columns alone fitted and
# evaluated these models without one.
vif_leq_12 = [
    "const",
    "P1",
    "P2",
    "P5",
    "P6",
    "P7",
    "P9",
    "P10",
    "P12",
    "P14",
    "P15",
    "P16",
    "P17",
    "P18",
    "P19",
    "P20",
    "P21",
    "P22",
    "P23",
    "P24",
    "P25",
    "P26",
    "P27",
    "P28",
    "P29",
    "P30",
]

In [None]:
# Logistic model restricted to the columns listed in vif_leq_12.
logit_mod_VIF_12 = sm.Logit(
    endog=df_enc.endog_train,
    exog=df_enc.exog_train.loc[:, vif_leq_12],
)
logit_res = logit_mod_VIF_12.fit()

logit_res.summary()

In [None]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the predicted probabilities for the vif_leq_12 model.
mean_squared_error(
    df_enc.endog_test.ravel(),
    logit_res.predict(df_enc.exog_test[vif_leq_12]),
)

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set MSE of the predicted probabilities for the vif_leq_12 model.
mean_squared_error(
    df_enc.endog_train.ravel(),
    logit_res.predict(df_enc.exog_train[vif_leq_12]),
)

In [None]:
# Test-set MSE of the 0/1 classes (0.5 threshold) for the vif_leq_12 model.
mean_squared_error(
    df_enc.endog_test.ravel(),
    (logit_res.predict(df_enc.exog_test[vif_leq_12]) >= 0.5).astype(int),
)

## Regresión lineal

Probaremos ahora una regresión lineal, por si tiene un mejor ajuste o un mejor error cuadrático medio.

## $VIF \leq 5$

In [None]:
# Columns used by the linear VIF <= 5 model (re-declared so this section can be
# run on its own). "const" (the intercept column created by sm.add_constant)
# comes first: statsmodels does not add an intercept on its own, so subsetting
# the design matrix to the question columns alone fitted the regression
# without one.
vif_leq_5 = [
    "const",
    "P14",
    "P15",
    "P16",
    "P17",
    "P20",
    "P25",
    "P27",
    "P30",
    "P31",
]

In [None]:
# Ordinary least squares on the scaled grades, using the vif_leq_5 columns.
ols_mod_VIF_5 = sm.OLS(
    endog=df_enc.endog_lin_train,
    exog=df_enc.exog_lin_train.loc[:, vif_leq_5],
)
ols_res = ols_mod_VIF_5.fit()

ols_res.summary()

In [None]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the linear model fitted on the vif_leq_5 columns.
mean_squared_error(
    df_enc.endog_lin_test.ravel(),
    ols_res.predict(df_enc.exog_lin_test[vif_leq_5]),
)

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set MSE of the linear model fitted on the vif_leq_5 columns.
mean_squared_error(
    df_enc.endog_lin_train.ravel(),
    ols_res.predict(df_enc.exog_lin_train[vif_leq_5]),
)

**ADVERTENCIA**: Esta métrica no es comparable con la anterior, pues los tipos de modelo que se comparan son totalmente distintos.

Comparemos los datos predichos con los reales

In [None]:
# Datos predichos
# Linear-model predictions for the test rows, converted to a plain array.
test_predictions = ols_res.predict(df_enc.exog_lin_test[vif_leq_5])
y_pred = np.asarray(test_predictions)
y_pred

In [None]:
# Reales
# Actual scaled grades of the held-out rows. These are TEST targets, so they
# get an accurate name; `y_train` is kept as an alias because the following
# cell still refers to it.
y_test = df_enc.endog_lin_test.reshape(-1,)
y_train = y_test
y_test

In [None]:
# Element-wise difference: actual value minus predicted value.
np.subtract(y_train, y_pred)

Se nota que hay varias predicciones que se alejan del valor real en torno a 0.5, lo cual es malo si pensamos que los valores están entre 0 (nota mínima) y 1 (nota máxima).

## $VIF \leq 12$

In [None]:
# Columns used by the linear VIF <= 12 model (re-declared so this section can
# be run on its own). "const" (the intercept column created by sm.add_constant)
# comes first: statsmodels does not add an intercept on its own, so subsetting
# the design matrix to the question columns alone fitted the regression
# without one.
vif_leq_12 = [
    "const",
    "P1",
    "P2",
    "P5",
    "P6",
    "P7",
    "P9",
    "P10",
    "P12",
    "P14",
    "P15",
    "P16",
    "P17",
    "P18",
    "P19",
    "P20",
    "P21",
    "P22",
    "P23",
    "P24",
    "P25",
    "P26",
    "P27",
    "P28",
    "P29",
    "P30",
]

In [None]:
# Ordinary least squares on the scaled grades, using the vif_leq_12 columns.
ols_mod_VIF_12 = sm.OLS(
    endog=df_enc.endog_lin_train,
    exog=df_enc.exog_lin_train.loc[:, vif_leq_12],
)
ols_res = ols_mod_VIF_12.fit()

ols_res.summary()

In [None]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the linear model fitted on the vif_leq_12 columns.
mean_squared_error(
    df_enc.endog_lin_test.ravel(),
    ols_res.predict(df_enc.exog_lin_test[vif_leq_12]),
)

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set MSE of the linear model fitted on the vif_leq_12 columns.
mean_squared_error(
    df_enc.endog_lin_train.ravel(),
    ols_res.predict(df_enc.exog_lin_train[vif_leq_12]),
)