In [1]:
# First import of libraries
import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
# Load the diabetes dataset as pandas objects: a feature DataFrame and a target Series
diabetes_X, diabetes_y = datasets.load_diabetes(as_frame=True, return_X_y=True)

# Keep BMI as the only feature (double brackets keep the result a DataFrame).
# The packaged BMI values are centered on 0, so rescale and shift them to make
# the numbers easier to read.
diabetes_X = diabetes_X[["bmi"]].mul(30).add(25)

# Use the last 20 rows as the training sample
diabetes_X_train = diabetes_X.iloc[-20:]
diabetes_y_train = diabetes_y.iloc[-20:]

# Preview the first few training rows, feature and target side by side
pd.concat([diabetes_X_train, diabetes_y_train], axis="columns").head()

Unnamed: 0,bmi,target
422,27.335902,233.0
423,23.811456,91.0
424,25.331171,111.0
425,23.779122,152.0
426,23.973128,120.0


In [6]:
# Print the description text (DESCR) that ships with the diabetes dataset
diabetes_bunch = datasets.load_diabetes(as_frame=True)
print(diabetes_bunch.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

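Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).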

In [3]:
# Second import of libraries
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = [12, 4]

plt.scatter(diabetes_X, diabetes_y, color="black")
plt.scatter(diabetes_X_train, diabetes_y_train, color="red")
plt.xlabel("Body Mass Index (BMI)")
plt.ylabel("Diabetes Risk")

## Linear Regression Solution

In [4]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [5]:
# Build a linear regression model and fit it on the training rows.
# `fit` returns the estimator itself, so construction and training are chained.
regr = linear_model.LinearRegression().fit(
    diabetes_X_train, diabetes_y_train.to_numpy()
)

# In-sample predictions: the model is applied to the same rows it was fit on
diabetes_y_train_pred = regr.predict(diabetes_X_train)

# Report the parameters of the fitted line (single feature, so one coefficient)
slope, intercept = regr.coef_[0], regr.intercept_
print(f"Slope {slope}")
print(f"Intercept {intercept}")

Slope 37.378842160517664
Intercept -797.0817390342369
