#Loading Dataset

In [1]:
from sklearn.datasets import load_diabetes
import pandas as pd

# Load the diabetes dataset
diabetes = load_diabetes(as_frame=True)
diabetes_df = diabetes.frame

# Rename columns for better understanding
diabetes_df.rename(columns={"s1": "tc",
                             "s2": "ldl",
                             "s3": "hdl",
                             "s4": "tch",
                             "s5": "ltg",
                             "s6": "glu"}, inplace=True)

# Display the first 5 rows of data
diabetes_df.head()


Unnamed: 0,age,sex,bmi,bp,tc,ldl,hdl,tch,ltg,glu,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


#Q1

In [2]:
# Number of observations (rows)
num_observations = diabetes_df.shape[0]

# Number of variables (columns)
num_variables = diabetes_df.shape[1]

print("Number of observations in diabetes:", num_observations)
print("Number of variables in diabetes:", num_variables)


Number of observations in diabetes: 442
Number of variables in diabetes: 11


#Q2 a

In [3]:
import numpy as np

# Create one-dimensional arrays for each predictor
age = diabetes_df['age'].to_numpy()
bmi = diabetes_df['bmi'].to_numpy()
bp = diabetes_df['bp'].to_numpy()
ldl = diabetes_df['ldl'].to_numpy()
hdl = diabetes_df['hdl'].to_numpy()
ltg = diabetes_df['ltg'].to_numpy()
glu = diabetes_df['glu'].to_numpy()

# Create a matrix A with each column containing one of the six predictors
A = np.column_stack((age, bmi, bp, ldl, hdl, ltg, glu))

# Check the shape of A
np.shape(A)


(442, 7)

#Q2 B

In [4]:
# Adding a column of 1's to matrix A using np.append() method
X = np.append(np.ones((442, 1)), A, axis=1)

# Display first and last three rows of X
print("First three rows of X:")
print(X[:3])
print("\nLast three rows of X:")
print(X[-3:])


First three rows of X:
[[ 1.          0.03807591  0.06169621  0.02187239 -0.03482076 -0.04340085
   0.01990749 -0.01764613]
 [ 1.         -0.00188202 -0.05147406 -0.02632753 -0.01916334  0.07441156
  -0.06833155 -0.09220405]
 [ 1.          0.08529891  0.04445121 -0.00567042 -0.03419447 -0.03235593
   0.00286131 -0.02593034]]

Last three rows of X:
[[ 1.          0.04170844 -0.01590626  0.01729339 -0.01383982 -0.02499266
  -0.04688253  0.01549073]
 [ 1.         -0.04547248  0.03906215  0.00121528  0.01528299 -0.02867429
   0.04452873 -0.02593034]
 [ 1.         -0.04547248 -0.0730303  -0.08141314  0.02780893  0.17381578
  -0.00422151  0.00306441]]


#Q2 C

In [5]:
# Create a one-dimensional array of target values y
y = diabetes_df['target'].to_numpy()


#Q3

In [6]:
# Solve the normal equations
XTX = np.dot(X.T, X)
XTy = np.dot(X.T, y)
B = np.linalg.solve(XTX, XTy)

# Print the regression coefficients
print("Regression coefficients:")
print("β₀ =", round(B[0], 3))
print("β₁ =", round(B[1], 3))
print("β₂ =", round(B[2], 3))
print("β₃ =", round(B[3], 3))
print("β₄ =", round(B[4], 3))
print("β₅ =", round(B[5], 3))
print("β₆ =", round(B[6], 3))


Regression coefficients:
β₀ = 152.133
β₁ = -30.987
β₂ = 563.285
β₃ = 271.989
β₄ = -122.669
β₅ = -195.594
β₆ = 503.259


#Q4

To interpret the practical meaning of the regression coefficients β₂ and β₃:

β₂: The coefficient β₂ represents the effect of the predictor variable "bmi" (body mass index) on the target variable. Specifically, for every one-unit increase in the BMI, the target variable (disease progression one year after baseline) is expected to increase by approximately 563.285 units, holding all other predictors constant.

β₃: The coefficient β₃ represents the effect of the predictor variable "bp" (diastolic blood pressure) on the target variable. Specifically, for every one-unit increase in diastolic blood pressure, the target variable is expected to increase by approximately 271.989 units, holding all other predictors constant.

#Q5

In [8]:
# import linear regression model
import sklearn.linear_model

# Load the Linear Regression model
model = sklearn.linear_model.LinearRegression()

# Fit the model to the data
model.fit(X, y)

# Get the intercept and coefficients
intercept = model.intercept_
slopes = model.coef_

# Print the results
print("The intercept is:", intercept)
print("The slopes are: ", slopes)


The intercept is: 152.13348416289602
The slopes are:  [   0.          -30.98678426  563.28456126  271.98903084 -122.66943974
 -195.59395751  503.25866298   51.70799526]
