In [7]:
!pip install ISLP
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)



In [8]:
# Q1: Read in the College data, preview its first ten rows, and list every column.
import pandas as pd

College = load_data('College')

# Rich (HTML) preview of the leading ten institutions.
first_ten = College.iloc[:10]
display(first_ten)

# Column labels as a plain Python list, printed under a heading.
column_names = list(College.columns)
print("\nColumn names:", column_names, sep="\n")

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15
5,Yes,587,479,158,38,62,678,41,13500,3335,500,675,67,73,9.4,11,9727,55
6,Yes,353,340,103,17,45,416,230,13290,5720,500,1500,90,93,11.5,26,8861,63
7,Yes,1899,1720,489,37,68,1594,32,13868,4826,450,850,89,100,13.7,37,11487,73
8,Yes,1038,839,227,30,63,973,306,15595,4400,300,500,79,84,11.3,23,11644,80
9,Yes,582,498,172,21,44,799,78,10468,3380,660,1800,40,41,11.5,15,8991,52



Column names:
['Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate']


Q1: Brief description

The dataset College contains information on U.S. colleges and universities. Each row represents a single institution, and each column describes one of its characteristics such as tuition, student body, or faculty background.

- Private: A categorical variable indicating whether the institution is private (Yes/No).

- Outstate: The out-of-state tuition charged by the institution (numeric, in U.S. dollars).

- Room.Board: The cost of room and board (numeric, in U.S. dollars).

- PhD: The percentage of faculty with Ph.D. degrees (numeric, range 0–100).

- Top10perc: The percentage of new students who graduated in the top 10% of their high school class (numeric, range 0–100).

In [9]:
# Q2: Simple linear regression of Outstate on Top10perc
import numpy as np
import statsmodels.api as sm

# Design matrix: the single predictor, with an explicit column of ones placed
# first so the fitted model reports an 'Intercept' coefficient.
X = College[['Top10perc']].copy()
X.insert(0, 'Intercept', np.ones(len(X)))

# Response: out-of-state tuition.
y = College['Outstate']

model1 = sm.OLS(endog=y, exog=X)
results1 = model1.fit()

# Coefficient table (estimate, std err, t, p-value) ...
display(summarize(results1))

# ... followed by the unrounded coefficient estimates.
print(results1.params)

Unnamed: 0,coef,std err,t,P>|t|
Intercept,6906.4586,221.614,31.164,0.0
Top10perc,128.2437,6.774,18.931,0.0


Intercept    6906.458580
Top10perc     128.243669
dtype: float64


Q2: Report

Estimated coefficients:
- Intercept: 6906.46
- Top10perc: 128.24

Interpretation:

Each one-percentage-point increase in the share of new students from the top 10% of their high school class is associated with an increase of about &#36;128 in out-of-state tuition. The intercept (approximately &#36;6906) is the predicted out-of-state tuition when Top10perc = 0.

In [10]:
# Q3: Multiple linear regression of Outstate on Top10perc, Room.Board and PhD
# Design matrix: the three predictors in the order they should be reported,
# preceded by an explicit column of ones for the intercept.
X = College[['Top10perc', 'Room.Board', 'PhD']].copy()
X.insert(0, 'Intercept', np.ones(len(X)))

# Response: out-of-state tuition.
y = College['Outstate']

model2 = sm.OLS(endog=y, exog=X)
results2 = model2.fit()

# Coefficient table (estimate, std err, t, p-value) ...
display(summarize(results2))

# ... followed by the unrounded coefficient estimates.
print(results2.params)

Unnamed: 0,coef,std err,t,P>|t|
Intercept,-430.635,538.834,-0.799,0.424
Top10perc,82.0003,6.728,12.189,0.0
Room.Board,1.8825,0.097,19.397,0.0
PhD,5.6226,7.147,0.787,0.432


Intercept    -430.635022
Top10perc      82.000309
Room.Board      1.882480
PhD             5.622552
dtype: float64


Q3: Report

Estimated coefficients:
- Intercept: -430.64
- Top10perc: 82.00
- Room.Board: 1.88
- PhD: 5.62

Interpretation:

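- When all predictors are zero, the predicted Outstate tuition is about –&#36;431 (an extrapolation that is not meaningful in practice; the intercept is also not statistically significant, p = 0.424).
- Each one-percentage-point increase in Top10perc is associated with about &#36;82 higher Outstate tuition, holding the other predictors constant.
- Each &#36;1 increase in Room.Board is associated with about &#36;1.88 higher Outstate tuition, holding the other predictors constant.
- Each one-percentage-point increase in faculty with a PhD is associated with about &#36;5.62 higher Outstate tuition, holding the other predictors constant; however, this coefficient is not statistically significant (p = 0.432).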

In [11]:
# Q4: Same regression as Q3 plus a squared Top10perc term
# Start from the three linear predictors, then slot the squared term in right
# after Top10perc and put the column of ones for the intercept in front.
X = College[['Top10perc', 'Room.Board', 'PhD']].copy()
X.insert(1, 'Top10perc2', X['Top10perc'] ** 2)
X.insert(0, 'Intercept', np.ones(len(X)))

# Response: out-of-state tuition.
y = College['Outstate']

model3 = sm.OLS(endog=y, exog=X)
results3 = model3.fit()

# Coefficient table (estimate, std err, t, p-value) ...
display(summarize(results3))

# ... followed by the unrounded coefficient estimates.
print(results3.params)

Unnamed: 0,coef,std err,t,P>|t|
Intercept,-1087.2758,573.149,-1.897,0.058
Top10perc,138.0675,18.661,7.399,0.0
Top10perc2,-0.6859,0.213,-3.218,0.001
Room.Board,1.9081,0.097,19.712,0.0
PhD,1.9617,7.194,0.273,0.785


Intercept    -1087.275765
Top10perc      138.067460
Top10perc2      -0.685852
Room.Board       1.908079
PhD              1.961703
dtype: float64


Q4: Report

Estimated coefficients:

- Intercept: –1087.28
- Top10perc: 138.07
- Top10perc²: –0.69
- Room.Board: 1.91
- PhD: 1.96

Interpretation (for Outstate tuition):

- Near Top10perc = 0, each additional percentage point in Top10perc raises predicted Outstate tuition by about &#36;138, holding other factors constant.
- The negative quadratic term (–0.69) means the effect of Top10perc diminishes as the percentage grows higher — the curve bends downward. In other words, tuition still increases with Top10perc, but at a decreasing rate (the fitted curve does not peak until Top10perc ≈ 101, so it is increasing over the whole 0–100 range).
- Room.Board and PhD continue to have positive coefficients, similar to the previous model, although the PhD coefficient is not statistically significant (p = 0.785).

In [12]:
# Q5: ANOVA F-test comparing the nested fits from Q3 and Q4
from statsmodels.stats.anova import anova_lm

# The two fits are passed smaller model first:
#   results2 -> Q3 fit (Top10perc, Room.Board, PhD)
#   results3 -> Q4 fit (same predictors plus Top10perc squared)
nested_fits = (results2, results3)

anova_results = anova_lm(*nested_fits)
print(anova_results)


   df_resid           ssr  df_diff       ss_diff          F    Pr(>F)
0     773.0  5.693394e+09      0.0           NaN        NaN       NaN
1     772.0  5.618020e+09      1.0  7.537388e+07  10.357499  0.001343


Q5: Interpretation

- F-Statistic: 10.36
- p-value: 0.0013

Interpretation:

Since p = 0.0013 < 0.05, we reject the null hypothesis that the coefficient on the quadratic term is zero: adding the quadratic term for Top10perc significantly improves the fit for out-of-state tuition compared to the linear model.
