In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text) #DTC for classification, DTR for regression, plot_tree to visualize tree, export_text to get rules
from sklearn.metrics import (accuracy_score,
                             log_loss) #accuracy_score for classification accuracy, log_loss for cross-entropy loss
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR) #RF for random forest, GBR for boosting
from ISLP.bart import BART #Bayesian Additive Regression Trees
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier as GBC


Question 1

In [None]:
# Read Hitters and keep only the rows where the response (Salary) is recorded
Hitters = load_data('Hitters').dropna(subset=['Salary'])
Hitters

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


In [9]:
# Build the predictor matrix from every column except the response (Salary).
# The ModelSpec is fitted once and then applied with transform(); calling
# fit_transform() on an already fitted spec would only refit it on the same data.
design = MS(Hitters.columns.drop('Salary')).fit(Hitters)
D = design.transform(Hitters)
D = D.drop('intercept', axis=1)  # drop the constant column added by ModelSpec
X = np.asarray(D)
y = np.array(Hitters['Salary'])
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [14]:
# max_features equals the number of predictor columns, so every split
# may consider all of them
RFmodel_Hitters = RF(
    n_estimators=300,
    max_features=X_train.shape[1],
    random_state=42,
).fit(X_train, y_train)

# Evaluate the fitted ensemble on the held-out rows
y_pred_RF_Hitters = RFmodel_Hitters.predict(X_test)
mse_RF_Hitters = mean_squared_error(y_test, y_pred_RF_Hitters)
print(f"Random Forest Model Test MSE: {mse_RF_Hitters:.2f}")

Random Forest Model Test MSE: 120962.79


Question 2

In [15]:
# Same forest size and seed as before, but each split only considers 5 candidate predictors
RFmodel2_Hitters = RF(
    n_estimators=300,
    max_features=5,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the same held-out rows so the two test MSEs are comparable
y_pred2_RF_Hitters = RFmodel2_Hitters.predict(X_test)
mse2_RF_Hitters = mean_squared_error(y_test, y_pred2_RF_Hitters)
print(f"Random Forest Model with max_features=5 Test MSE: {mse2_RF_Hitters:.2f}")

Random Forest Model with max_features=5 Test MSE: 123904.56


In [17]:
# One-column table of the max_features=5 forest's importances, labelled by predictor name
feature_imp = pd.Series(
    RFmodel2_Hitters.feature_importances_,
    index=D.columns,
    name='importance',
).to_frame()
# Show the predictors ranked from most to least important (feature_imp itself stays unsorted)
feature_imp.sort_values('importance', ascending=False)

Unnamed: 0,importance
CHits,0.141982
CAtBat,0.11709
CRBI,0.113281
CRuns,0.101749
CHmRun,0.074938
CWalks,0.067593
PutOuts,0.057856
RBI,0.051779
AtBat,0.050193
Runs,0.047026


According to the feature importances of the random forest fit with max_features=5, CHits is the most important predictor of a player's salary (importance ≈ 0.142), followed by CAtBat, CRBI, and CRuns.

Question 3

In [None]:
# Load the OJ dataset; its 'Purchase' column is the response for the classification tree
OJ = load_data('OJ')


In [19]:
# Encode every OJ column except the response (Purchase) as model-matrix columns
design2 = MS(OJ.columns.drop('Purchase')).fit(OJ)
D2 = design2.transform(OJ).drop(columns='intercept')  # remove the constant column
X2 = np.asarray(D2)
y2 = np.array(OJ['Purchase'])
# 75/25 train/test split with a fixed seed
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, random_state=42, test_size=0.25
)

In [21]:
# Entropy-criterion tree, limited to depth 4 with at least 5 samples per leaf
DTC_model_OJ = DTC(
    criterion='entropy',
    max_depth=4,
    min_samples_leaf=5,
    random_state=42,
).fit(X2_train, y2_train)

# Share of held-out purchases that the tree classifies correctly
y2_pred_DTC_OJ = DTC_model_OJ.predict(X2_test)
DTC_accuracy = accuracy_score(y2_test, y2_pred_DTC_OJ)
print("Decision Tree Classifier Accuracy:", f"{DTC_accuracy:.3f}")

Decision Tree Classifier Accuracy: 0.817


Question 4

In [34]:
# Load Auto, drop incomplete rows, then flag cars whose mpg is strictly above
# the median mpg of the remaining rows (1 = above median, 0 = otherwise)
Auto = (
    load_data('Auto')
    .dropna()
    .assign(mpg_high=lambda cars: (cars['mpg'] > cars['mpg'].median()).astype(int))
)
Auto

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg_high
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,70,1,0
buick skylark 320,15.0,8,350.0,165,3693,11.5,70,1,0
plymouth satellite,18.0,8,318.0,150,3436,11.0,70,1,0
amc rebel sst,16.0,8,304.0,150,3433,12.0,70,1,0
ford torino,17.0,8,302.0,140,3449,10.5,70,1,0
...,...,...,...,...,...,...,...,...,...
ford mustang gl,27.0,4,140.0,86,2790,15.6,82,1,1
vw pickup,44.0,4,97.0,52,2130,24.6,82,2,1
dodge rampage,32.0,4,135.0,84,2295,11.6,82,1,1
ford ranger,28.0,4,120.0,79,2625,18.6,82,1,1


In [35]:
# Predictors are all columns except mpg and the label derived from it (mpg_high)
design3 = MS(Auto.columns.drop(['mpg', "mpg_high"])).fit(Auto)
D3 = design3.transform(Auto).drop(columns='intercept')  # remove the constant column
X3 = np.asarray(D3)
y3 = np.array(Auto['mpg_high'])
# 70/30 train/test split with a fixed seed
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, random_state=42, test_size=0.3
)

In [None]:
# Boosted classifier: 200 trees of depth 3, learning rate 0.1
GBC_model_Auto = GBC(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X3_train, y3_train)

# Share of held-out cars whose mpg_high label is predicted correctly
y3_pred_GBC_Auto = GBC_model_Auto.predict(X3_test)
GBC_accuracy = accuracy_score(y3_test, y3_pred_GBC_Auto)
print("Gradient Boosting Classifier Accuracy:", f"{GBC_accuracy:.3f}")

Gradient Boosting Classifier Accuracy: 0.890


Question 5

In [38]:
# Reload Hitters from scratch and discard every row that has a missing value
Hitters = load_data('Hitters').dropna()
Hitters

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


In [39]:
# Build the predictor matrix from every column except the response (Salary).
# The ModelSpec is fitted once and then applied with transform(); calling
# fit_transform() on an already fitted spec would only refit it on the same data.
design = MS(Hitters.columns.drop('Salary')).fit(Hitters)
D = design.transform(Hitters)
D = D.drop('intercept', axis=1)  # drop the constant column added by ModelSpec
X = np.asarray(D)
y = np.array(Hitters['Salary'])
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [40]:
# Regression tree capped at depth 6, fitted on the training rows
RT_model_Hitters = DTR(max_depth=6, random_state=42).fit(X_train, y_train)
# Predictions of this tree on the held-out rows
y_pred_RT_Hitters = RT_model_Hitters.predict(X_test)

In [42]:
# Candidate pruning strengths: the cost-complexity pruning path on the training data
ccp_path = RT_model_Hitters.cost_complexity_pruning_path(X_train, y_train)

# Shuffled 5-fold cross-validation with a fixed seed
kfold = skm.KFold(n_splits=5, shuffle=True, random_state=10)

# Search the candidate ccp_alpha values by cross-validated (negated) MSE;
# refit=True retrains the winning setting on the full training set
grid = skm.GridSearchCV(
    estimator=RT_model_Hitters,
    param_grid={'ccp_alpha': ccp_path.ccp_alphas},
    scoring='neg_mean_squared_error',
    cv=kfold,
    refit=True,
)
G = grid.fit(X_train, y_train)

# The refitted tree with the selected ccp_alpha
best_ = grid.best_estimator_
best_

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [44]:
# Size of the tree selected by the grid search, and its error on the held-out rows
n_leaves = best_.get_n_leaves()
y_pred_best = best_.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)

# Report both figures
print(f"Number of leaf nodes in best estimator: {n_leaves}")
print(f"Test MSE with best estimator: {mse_best:.2f}")

Number of leaf nodes in best estimator: 6
Test MSE with best estimator: 208848.95
