Assignment 3

In [6]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, make_scorer
import numpy as np

# Load data
url = "https://raw.githubusercontent.com/Patrick0481/Intro-to-modeling/refs/heads/main/1654308boston.csv"
BostonData = pd.read_csv(url)



In [7]:
#Linear Regression
# Define features and target
X = BostonData.drop(columns=['MEDV'])
y = BostonData['MEDV']

# Convert categorical (text) columns into numeric dummy variables
X = pd.get_dummies(X, drop_first=True)

# Initialize model
model = LinearRegression()

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define custom scoring function for MAD (MAE)
mad_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Perform 5-fold CV
mad_scores = cross_val_score(model, X, y, scoring=mad_scorer, cv=kf)

# Convert to positive MAD values
mad_scores = -mad_scores

print("MAD for each fold:", mad_scores)
print("Average MAD across 5 folds:", np.mean(mad_scores))

MAD for each fold: [50.17820752 56.61508047 49.90871609 49.48648116 41.44634671]
Average MAD across 5 folds: 49.5269663890078


In [8]:

from sklearn.linear_model import RidgeCV, LassoCV

# Load and prepare data
url = "https://raw.githubusercontent.com/Patrick0481/Intro-to-modeling/refs/heads/main/1654308boston.csv"
BostonData = pd.read_csv(url)

X = BostonData.drop(columns=['MEDV'])
y = BostonData['MEDV']

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mad_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

#Lasso Regression
lasso = LassoCV(alphas=alphas, cv=kf, random_state=42)
lasso.fit(X, y)
lasso_alpha = lasso.alpha_

lasso_mad = -cross_val_score(LassoCV(alphas=[lasso_alpha], cv=kf, random_state=42),
                             X, y, scoring=mad_scorer, cv=kf)
print("\nOptimal Lasso alpha:", lasso_alpha)
print("Lasso MAD for each fold:", lasso_mad)
print("Average Lasso MAD:", np.mean(lasso_mad))


Optimal Lasso alpha: 0.1206792640639329
Lasso MAD for each fold: [50.10602111 56.95258294 49.42822367 49.27285137 41.69215057]
Average Lasso MAD: 49.490365931311686


In [9]:
#Ridge Regression
alphas = np.logspace(-3, 3, 50)  # 50 alpha values from 0.001 to 1000
ridge = RidgeCV(alphas=alphas, cv=kf, scoring=mad_scorer)
ridge.fit(X, y)
ridge_alpha = ridge.alpha_

ridge_mad = -cross_val_score(RidgeCV(alphas=[ridge_alpha]), X, y,
                             scoring=mad_scorer, cv=kf)
print("Optimal Ridge alpha:", ridge_alpha)
print("Ridge MAD for each fold:", ridge_mad)
print("Average Ridge MAD:", np.mean(ridge_mad))


Optimal Ridge alpha: 0.21209508879201905
Ridge MAD for each fold: [50.20095665 57.07464413 49.09781348 49.15122338 41.77457449]
Average Ridge MAD: 49.45984242532262


In [4]:
#%pip install tensorflow
#%pip install pandas 
#%pip install scikit-learn
#%pip install numpy

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load and prepare data
url = "https://raw.githubusercontent.com/Patrick0481/Intro-to-modeling/refs/heads/main/1654308boston.csv"
BostonData = pd.read_csv(url)

X = BostonData.drop(columns=['MEDV'])
y = BostonData['MEDV']

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Standardize numeric features (important for neural networks)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mad_scores = []

# Function to build the model
def build_model(input_dim):
    model = Sequential([
        Dense(512, activation='relu', input_dim=input_dim),
        Dense(512, activation='relu'),
        Dense(512, activation='relu'),
        Dense(512, activation='relu'),
        Dense(1)  # output layer (regression -> no activation)
    ])
    model.compile(optimizer=Adam(),
                  loss='mean_absolute_error')  # MAD loss
    return model

# 5-fold CV training
for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model = build_model(X_train.shape[1])
    model.fit(X_train, y_train,
              epochs=100,
              batch_size=16,
              verbose=0)  # silent training
    
    y_pred = model.predict(X_test).flatten()
    mad = mean_absolute_error(y_test, y_pred)
    mad_scores.append(mad)

print("MAD for each fold:", mad_scores)
print("Average MAD across 5 folds:", np.mean(mad_scores))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
MAD for each fold: [38.70954895019531, 49.25181198120117, 37.30810546875, 43.42402648925781, 46.83283233642578]
Average MAD across 5 folds: 43.105265045166014
