In [None]:
#problem1
import numpy as np
import pandas as pd
from ISLP import load_data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the Hitters data and keep only the players whose Salary is recorded
Hitters = load_data('Hitters').dropna(subset=['Salary'])

# Salary is the response; every other column is a predictor.
# Categorical predictors are dummy-coded, dropping the first level of each.
y = Hitters['Salary']
X = pd.get_dummies(Hitters.drop(columns=['Salary']), drop_first=True)

# Hold out 30% of the rows as a test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Bagging = a random forest that may consider every predictor at each split,
# so max_features is set to the total number of predictor columns.
n_predictors = X.shape[1]
rf_bagging = RandomForestRegressor(
    n_estimators=300,
    max_features=n_predictors,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Score the bagged ensemble on the held-out rows
y_pred = rf_bagging.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Test MSE: {mse:.2f}")


Test MSE: 121169.42


In [2]:
#problem2
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error  # used below; previously only imported by the problem1 cell
import pandas as pd
import numpy as np

# Reuses the Hitters split (X_train, X_test, y_train, y_test) created by the
# problem1 cell -- run that cell first.
# Random forest: only 5 candidate predictors are considered at each split.
rf = RandomForestRegressor(
    n_estimators=300,
    max_features=5,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

#MSE
y_pred = rf.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse_rf:.2f}")

#Feature Importance DataFrame
# Label the importances with the columns of the frame the forest was fitted on
# (X_train) rather than the global X, which other cells in this notebook
# rebind to different datasets; this keeps names and scores aligned.
importance_df = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': rf.feature_importances_
    })
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# Show the ten most important predictors
importance_df.head(10)

Test MSE: 121789.39


Unnamed: 0,Feature,Importance
0,CHits,0.141768
1,CAtBat,0.117607
2,CRBI,0.110572
3,CRuns,0.104219
4,CHmRun,0.07249
5,CWalks,0.064816
6,PutOuts,0.055663
7,RBI,0.051619
8,Hits,0.047647
9,AtBat,0.047065


The variable CHits has the highest importance score (≈ 0.14).

In [None]:
#problem3
from ISLP import load_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Read the OJ data; Purchase is the class label to predict
OJ = load_data('OJ')
y = OJ['Purchase']

# All remaining columns are predictors; categoricals are dummy-coded
X = pd.get_dummies(OJ.drop(columns=['Purchase']), drop_first=True)

# 75% train / 25% test, stratified so both parts keep the label proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.25
)

# Entropy-split classification tree, limited to depth 4 with at least
# 5 training samples in every leaf
tree = DecisionTreeClassifier(
    random_state=42,
    min_samples_leaf=5,
    max_depth=4,
    criterion='entropy',
).fit(X_train, y_train)

# Fraction of held-out rows classified correctly
y_pred = tree.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {acc:.3f}")


Test Accuracy: 0.821


In [5]:
#problem4
from ISLP import load_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Read the Auto data
Auto = load_data('Auto')

# Report whether 'name' is present among the columns.
# NOTE(review): this only inspects Auto.columns; 'name' may be kept as the
# index rather than removed -- confirm against the ISLP docs.
name_is_column = 'name' in Auto.columns
if not name_is_column:
    print("'name' column not found. This is expected because the ISLP version of the Auto dataset "
          "has already removed the 'name' variable compared to the original ISLR version.")
else:
    print("'name' column found in the dataset.")

# Discard incomplete rows before deriving the target
Auto = Auto.dropna()

# Binary target: 1 when a car's mpg is strictly above the median mpg, else 0
median_mpg = Auto['mpg'].median()
Auto = Auto.assign(mpg_high=(Auto['mpg'] > median_mpg).astype(int))

# Predictors are everything except mpg and the target derived from it
y = Auto['mpg_high']
X = pd.get_dummies(Auto.drop(columns=['mpg', 'mpg_high']), drop_first=True)

# 70% train / 30% test, stratified on the binary target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.3
)

# Gradient boosting: 200 depth-3 trees with a 0.1 learning rate
gb = GradientBoostingClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# Fraction of held-out rows classified correctly
y_pred = gb.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\nTest Accuracy: {acc:.3f}")



'name' column not found. This is expected because the ISLP version of the Auto dataset has already removed the 'name' variable compared to the original ISLR version.

Test Accuracy: 0.932


In [None]:
#problem5
from ISLP import load_data
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

#Load dataset and drop rows with missing Salary
Hitters = load_data('Hitters').dropna(subset=['Salary'])

#One-hot encode categorical variables
X = pd.get_dummies(Hitters.drop(columns=['Salary']), drop_first=True)
y = Hitters['Salary']

#Split into training and test sets (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

#Fit initial regression tree (depth capped at 6) whose pruning path is searched
base_tree = DecisionTreeRegressor(max_depth=6, random_state=42)
base_tree.fit(X_train, y_train)

#Get cost complexity pruning path (ccp_alpha values)
path = base_tree.cost_complexity_pruning_path(X_train, y_train)
# Floating-point round-off can leave tiny negative or repeated alphas in the
# path. DecisionTreeRegressor rejects ccp_alpha < 0 and repeated values only
# refit the same tree, so clip at zero and de-duplicate (np.unique also keeps
# the candidates in ascending order).
ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0.0, None))

#Grid search for best ccp_alpha (5-fold CV, scored by negative MSE)
param_grid = {'ccp_alpha': ccp_alphas}
grid = GridSearchCV(
    DecisionTreeRegressor(max_depth=6, random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)
grid.fit(X_train, y_train)

#Evaluate best estimator (refit on the full training set by GridSearchCV)
best_tree = grid.best_estimator_
n_leaves = best_tree.get_n_leaves()
test_mse = mean_squared_error(y_test, best_tree.predict(X_test))

print(f"(a) Number of leaf nodes: {n_leaves}")
print(f"(b) Test MSE: {test_mse:.2f}")


(a) Number of leaf nodes: 4
(b) Test MSE: 161392.55
