In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import random


In [2]:
import matplotlib.pyplot as plt

In [11]:
#Pregunta 2
np.random.seed(1234)


# Generar datos: X y Y (n=1000, sin intercepto)
X = np.random.uniform(0, 1, 1000).reshape(-1, 1)
e = np.random.normal(0, 1, 1000).reshape(-1, 1)
Y = np.exp(4 * X) + e  

# Lista para almacenar resultados
resultados = []
features_list = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]

n_test = 250

# Bucle para diferentes números de características
for n_features,i in enumerate(features_list,1):
    # Crear matriz de características polinómicas
    X_poly = np.hstack([X**i for i in range(1, n_features + 1)])
    
    # Estimación en muestra completa
    model_full = LinearRegression()
    model_full.fit(X_poly, Y)
    Y_pred_full = model_full.predict(X_poly)
    
    # Calcular MSE y R² en muestra completa
    mse_full = np.mean((Y - Y_pred_full) ** 2)
    R_sq_full = 1 - mse_full / np.mean((Y - np.mean(Y)) ** 2)
    
    # Calcular R² ajustado
    adj_mse = 1000 / (1000 -i -1) * mse_full
    adj_R_sq_full = 1 - adj_mse / np.mean((Y - np.mean(Y)) ** 2)

    # Dividir datos en entrenamiento (75%) y prueba (25%)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_poly, Y, train_size=0.75, random_state=1234
    )
    
    # Estimación en conjunto de entrenamiento
    model_train = LinearRegression()
    model_train.fit(X_train, Y_train)
    Y_pred_test = model_train.predict(X_test)
    
    # Calcular MSE y R² fuera de muestra
    mse_test = np.mean((Y_test - Y_pred_test) ** 2)
    R_sq_test = 1 - mse_test / np.mean((Y_test - np.mean(Y_test)) ** 2)
    
    # Calcular R² ajustado fuera de muestra (en test set)
    adj_mse_test = n_test / (n_test - i - 1) * mse_test
    adj_R_sq_test = 1 - adj_mse_test / np.mean((Y_test - np.mean(Y_test)) ** 2)
    
    # Guardar resultados
    resultados.append({
        'features': n_features,
        'R_squared': R_sq_full,
        'adjusted_R_squared': adj_R_sq_full,
        'out_of_sample_R_squared': R_sq_test,
        'out_of_sample_adjusted_R_squared': adj_R_sq_test
    })

# Crear DataFrame con resultados
R_cuadrados = pd.DataFrame(resultados)

# Imprimir resultados
print(R_cuadrados)

# Guardar resultados en CSV
R_cuadrados.to_csv("R_squares.csv", index=False)

# Crear gráficos
# Gráfico 1: R² (full sample)
plt.figure(figsize=(8, 6))
plt.plot(R_cuadrados['features'], R_cuadrados['R_squared'], marker='o', linewidth=2)
plt.xlabel("Number of Features")
plt.ylabel("R-squared (Full Sample)")
plt.title("R-squared vs Number of Features (Full Sample)")
plt.ylim(0.79, 1)
plt.grid(True)
plt.savefig("R-squared.pdf")
plt.close()

# Gráfico 2: R² ajustado (full sample)
plt.figure(figsize=(8, 6))
plt.plot(R_cuadrados['features'], R_cuadrados['adjusted_R_squared'], marker='o', linewidth=2)
plt.xlabel("Number of Features")
plt.ylabel("Adjusted R-squared (Full Sample)")
plt.title("Adjusted R-squared vs Number of Features (Full Sample)")
plt.ylim(0.79, 1)
plt.grid(True)
plt.savefig("Adjusted R-squared.pdf")
plt.close()

# Gráfico 3: R² fuera de muestra (test)
plt.figure(figsize=(8, 6))
plt.plot(R_cuadrados['features'], R_cuadrados['out_of_sample_R_squared'], marker='o', linewidth=2)
plt.xlabel("Number of Features")
plt.ylabel("Out-of-sample R-squared (Test)")
plt.title("Out-of-sample R-squared vs Number of Features (Test)")
plt.ylim(0.79, 1)
plt.grid(True)
plt.savefig("Out-of-sample R-squared.pdf")
plt.close()

# Gráfico adicional: Adjusted R² fuera de muestra (test)
plt.figure(figsize=(8, 6))
plt.plot(R_cuadrados['features'], R_cuadrados['out_of_sample_adjusted_R_squared'], marker='o', linewidth=2)
plt.xlabel("Number of Features")
plt.ylabel("Out-of-sample Adjusted R-squared (Test)")
plt.title("Out-of-sample Adjusted R-squared vs Number of Features (Test)")
plt.ylim(0.79, 1)
plt.grid(True)
plt.savefig("Out-of-sample Adjusted R-squared.pdf")
plt.close()

   features  R_squared  adjusted_R_squared  out_of_sample_R_squared  \
0         1   0.800658            0.800259                 0.792271   
1         2   0.976230            0.976159                 0.977937   
2         3   0.994233            0.994198                 0.995048   
3         4   0.995197            0.995144                 0.995999   
4         5   0.995224            0.995121                 0.996052   
5         6   0.995224            0.994968                 0.996039   
6         7   0.995226            0.994690                 0.996043   
7         8   0.995226            0.994025                 0.996040   
8         9   0.995229            0.990438                 0.996045   
9        10   0.995234            5.766094                 0.996034   

   out_of_sample_adjusted_R_squared  
0                          0.790596  
1                          0.977669  
2                          0.994927  
3                          0.995815  
4                          0