# Graded Quizzes - Exercise Notebook

In [74]:
# Import the relevant libraries and load the data set
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('College.csv')

In [5]:
df.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [6]:
df.columns

Index(['Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
       'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books',
       'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend',
       'Grad.Rate'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    int64  
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: float64(1), int64(17)
memory usage: 109.4 KB


In [8]:
df['Grad.Rate']

0      60
1      56
2      54
3      59
4      15
       ..
772    40
773    83
774    49
775    99
776    99
Name: Grad.Rate, Length: 777, dtype: int64

In [9]:
# Define X and y here
X = df.drop(["Grad.Rate"], axis=1).values
y = df['Grad.Rate'].values

In [12]:
# Scale the X values and split the data into training and test sets
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state = 1)

In [13]:
# Define a function to return the RMSE value given a set of actual y values and predicted y values
def rmse(y_train, y_pred):
    return np.sqrt(mean_squared_error(y_train, y_pred))

## Linear Model

In [14]:
# Create the basic linear regression model here, that is, a linear regression model without regularization
lr = LinearRegression()
lr = lr.fit(X_train, y_train)

In [16]:
print("RMSE on train:", round(rmse(y_train,lr.predict(X_train)), 3))
print("RMSE on test:", round(rmse(y_test,lr.predict(X_test)), 2))

RMSE on train: 12.736
RMSE on test: 12.62


## LassoCV

In [25]:
alphas= [0.0001, 0.001, 0.01, 0.1, 0.2, 0.52, 1.0, 2.0, 5.0, 10.0, 20, 50, 100, 1000]

In [26]:
# Train a LASSO model on the data using LassoCV for different alphas and find best alpha
# Note: Use cv = 5
lassocv = LassoCV(alphas=alphas, cv=5)
lassocv = lassocv.fit(X_train, y_train)

In [27]:
print(f"The best alpha is: {lassocv.alpha_}")

The best alpha is: 0.52


In [None]:
print("RMSE on train:", round(rmse(y_train,lassocv.predict(X_train)), 4))
print("RMSE on test:", round(rmse(y_test,lassocv.predict(X_test)), 4))

## RidgeCV

In [36]:
# Train a Ridge model on the data using RidgeCV for different alphas and find best alpha
# Note: Use cv = 5
alphas = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.13, 20, 50, 78.14, 100, 1000]
ridgecv = RidgeCV(alphas=alphas, cv=5)
ridgecv = ridgecv.fit(X_train, y_train)

In [37]:
print(f"The best alpha is: {ridgecv.alpha_}")

The best alpha is: 78.14


In [78]:
print("RMSE on train:", round(rmse(y_train,ridgecv.predict(X_train)), 3))
print("RMSE on test:", round(rmse(y_test,ridgecv.predict(X_test)), 2))

RMSE on train: 12.815
RMSE on test: 12.4


## Predicting the value of the target variable for the first observation in the test set

In [103]:
# Split the first observation in the test set into X and y values
X_true = X_test[0].reshape(1, -1)
y_true = y_test[0]

Note: If you recieve "ValueError: Expected 2D array, got 1D array instead" during prediction, just refer to the following link - https://stackoverflow.com/questions/51150153/valueerror-expected-2d-array-got-1d-array-instead

In [104]:
# Predict the value of graduation rate for the first observation in the test set using the basic linear regression model
y_pred_lr = lr.predict(X_true)
print(f"Predicted: {y_pred_lr[0]}, Original: {y_true}")

Predicted: 58.12807249215691, Original: 61


In [105]:
# Predict the value of graduation rate for the first observation in the test set using the best LASSO model
# Note: Use the best alpha that was obtained in the list of alpha values provided in the corresponding questions in the quiz
y_pred_lasso = lassocv.predict(X_true)
print(f"Predicted: {y_pred_lasso[0]}, Original: {y_true}")

Predicted: 57.54771329621539, Original: 61


In [106]:
# Predict the value of graduation rate for the first observation in the test set using the best Ridge model
# Note: Use the best alpha that was obtained in the list of alpha values provided in the corresponding questions in the quiz
y_pred_ridge = ridgecv.predict(X_true)
print(f"Predicted: {y_pred_ridge[0]}, Original: {y_true}")

Predicted: 59.47687834043918, Original: 61
