In [36]:
import pandas as pd
import statsmodels.api as sm

df = pd.read_excel("Motor Trend Car Road Tests.xlsx")

X = df["hp"] 
y = df["mpg"]  

X_const = sm.add_constant(X)

model = sm.OLS(y, X_const).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.602
Model:                            OLS   Adj. R-squared:                  0.589
Method:                 Least Squares   F-statistic:                     45.46
Date:                Thu, 10 Apr 2025   Prob (F-statistic):           1.79e-07
Time:                        17:57:45   Log-Likelihood:                -87.619
No. Observations:                  32   AIC:                             179.2
Df Residuals:                      30   BIC:                             182.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.0989      1.634     18.421      0.0

In [38]:
import numpy as np

df = pd.read_excel("Motor Trend Car Road Tests.xlsx")

beta_0_list = []
beta_1_list = []

n_iterations = 1000
n = len(df)

# Bootstrap: generar 1000 regresiones
for _ in range(n_iterations):
    sample = df.sample(n=n, replace=True)
    X_sample = sm.add_constant(sample['hp'])
    y_sample = sample['mpg']
    model = sm.OLS(y_sample, X_sample).fit()

    beta_0_list.append(model.params['const'])
    beta_1_list.append(model.params['hp'])

# Convertir a arrays
beta_0_array = np.array(beta_0_list)
beta_1_array = np.array(beta_1_list)

print("Estimaciones bootstrap (1000 muestras):")
print(f"Beta_0 (intercepto): media = {np.mean(beta_0_array):.4f}, desviación estándar = {np.std(beta_0_array, ddof=1):.4f}")
print(f"Beta_1 (hp)        : media = {np.mean(beta_1_array):.4f}, desviación estándar = {np.std(beta_1_array, ddof=1):.4f}")


Estimaciones bootstrap (1000 muestras):
Beta_0 (intercepto): media = 30.4401, desviación estándar = 2.0873
Beta_1 (hp)        : media = -0.0713, desviación estándar = 0.0141


## BAGGING CON DATASET 'ADVERTISING'

In [40]:
import itertools
df = pd.read_csv("Advertising.csv")
target = 'sales'

# Crear variables
df["TV_radio"] = df["TV"] * df["radio"]
df["TV_newspaper"] = df["TV"] * df["newspaper"]
df["radio_newspaper"] = df["radio"] * df["newspaper"]

# Lista de todas las posibles variables
all_features = ["TV", "radio", "newspaper", "TV_radio", "TV_newspaper", "radio_newspaper"]

# Listas para guardar los modelos y sus variables
models = []
features_list = []

# Entrenamiento de 1000 modelos con bootstrap y 4 variables aleatorias
for _ in range(1000):
    sample = df.sample(n=len(df), replace=True)
    selected_features = np.random.choice(all_features, size=4, replace=False).tolist()
    X_sample = sm.add_constant(sample[selected_features])
    y_sample = sample[target]
    model = sm.OLS(y_sample, X_sample).fit()
    models.append(model)
    features_list.append(selected_features)

# Función de predicción 
def predict_bagging_interactions(x_input):
    # Crear interacciones para el input
    x_input = x_input.copy()
    x_input["TV_radio"] = x_input["TV"] * x_input["radio"]
    x_input["TV_newspaper"] = x_input["TV"] * x_input["newspaper"]
    x_input["radio_newspaper"] = x_input["radio"] * x_input["newspaper"]
    
    predictions = []
    for model, features in zip(models, features_list):
        x_filtered = x_input[features]
        x_with_const = sm.add_constant(x_filtered, has_constant='add')
        pred = model.predict(x_with_const)
        predictions.append(pred)
    return np.mean(predictions, axis=0)
y_pred_final = predict_bagging_interactions(df)

y_pred_final


array([21.72468458, 11.02010843,  9.76168184, 17.69782662, 12.6646516 ,
        9.38215037, 11.20012259, 12.271503  ,  7.38716472, 11.36454356,
        8.60682862, 16.75024143,  9.27591394,  9.85305112, 18.74749819,
       21.82765969, 11.7425369 , 25.06746228, 10.42640861, 14.04587975,
       18.25115651, 12.8928968 ,  7.4507006 , 15.68654146,  9.31936096,
       13.05780142, 14.81388397, 16.03212945, 19.17710498,  9.72859576,
       21.88796919, 11.52551512,  8.74912833, 17.61210805,  8.98235897,
       13.788249  , 24.45121606, 14.04413885,  9.80597843, 21.0653754 ,
       16.04367859, 17.38596355, 20.99551581, 12.90187175,  8.79981805,
       14.94845119,  9.64674149, 22.52024789, 15.52416987,  9.13520456,
       11.47357413, 10.23496345, 21.51704857, 20.70945557, 20.18097253,
       22.56461831,  8.16915732, 12.7822434 , 23.08483921, 17.92060277,
        7.94273615, 24.76339414, 15.7212703 , 13.03570207, 16.58416318,
        9.38382272,  9.49988987, 12.10188613, 18.63527942, 21.91

In [41]:
from sklearn.metrics import r2_score

y_real = df["sales"]
r2 = r2_score(y_real, y_pred_final)

print(f"R² del promedio de modelos (bagging): {r2:.4f}")

R² del promedio de modelos (bagging): 0.9613
