In [1]:
# Step 1: Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
# Step 2: Load the Dataset
# Request the default container object (not an (X, y) tuple, not a DataFrame);
# later cells read its .data, .target, .feature_names and .DESCR attributes.
dataset = load_diabetes(return_X_y=False, as_frame=False)

In [3]:
# Step 3: Inspect the Dataset
# Each label is printed on its own line and the object on the next one.
# Passing both to a single print("label:\n", obj) call inserts print()'s
# separator space right after the newline, which shifts the first row of a
# multi-line array one column to the right of the remaining rows.
print("Dataset Keys: {}".format(dataset.keys()))  # Keys in the dataset (feature data, target, description, etc.)

print("Dataset Features:")
print(dataset.data)            # The actual feature data (numerical values)

print("Dataset Feature names:")
print(dataset.feature_names)   # The feature names (columns)

print("Dataset Target:")
print(dataset.target)          # The target data (outcome variable to predict)

print("Dataset Description:")
print(dataset.DESCR)           # A detailed description of the dataset


Dataset Keys: dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])
Dataset Features:
 [[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990749
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06833155
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286131
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952 -0.04688253
   0.01549073]
 [-0.04547248 -0.04464164  0.03906215 ...  0.02655962  0.04452873
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338 -0.00422151
   0.00306441]]
Dataset Feature names:
 ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
Dataset Target:
 [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 

In [4]:
# Step 4: Convert to Pandas DataFrame for better handling
# Build the feature table and append the outcome as a trailing 'Target'
# column in a single expression, so the frame is never seen half-built.
dataset_frame = pd.DataFrame(
    dataset.data,
    columns=dataset.feature_names,
).assign(Target=dataset.target)


In [5]:
# Step 5: Exploratory Data Analysis (EDA)
# Scatter-plot matrix over every pair of columns in the frame
# (features and Target alike), with per-column distributions on the diagonal.
sns.pairplot(data=dataset_frame)
plt.show()  # render the figure and suppress the returned grid's repr

Output hidden; open in https://colab.research.google.com to view.

In [6]:
# Check for missing values
# Count the nulls in each column. The label and the Series are printed by
# separate calls: print("label:\n", series) would put print()'s separator
# space after the newline and push the first row out of alignment.
print("Missing values per column:")
print(dataset_frame.isnull().sum())


Missing values per column:
 age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
Target    0
dtype: int64


In [7]:
# Correlation matrix to check relationships between features
# Pairwise correlation of every column, Target included. The label is printed
# separately so that print()'s separator space does not shift the header row
# one column to the right of the data rows.
print("Correlation matrix:")
print(dataset_frame.corr())


Correlation matrix:
              age       sex       bmi        bp        s1        s2        s3  \
age     1.000000  0.173737  0.185085  0.335428  0.260061  0.219243 -0.075181   
sex     0.173737  1.000000  0.088161  0.241010  0.035277  0.142637 -0.379090   
bmi     0.185085  0.088161  1.000000  0.395411  0.249777  0.261170 -0.366811   
bp      0.335428  0.241010  0.395411  1.000000  0.242464  0.185548 -0.178762   
s1      0.260061  0.035277  0.249777  0.242464  1.000000  0.896663  0.051519   
s2      0.219243  0.142637  0.261170  0.185548  0.896663  1.000000 -0.196455   
s3     -0.075181 -0.379090 -0.366811 -0.178762  0.051519 -0.196455  1.000000   
s4      0.203841  0.332115  0.413807  0.257650  0.542207  0.659817 -0.738493   
s5      0.270774  0.149916  0.446157  0.393480  0.515503  0.318357 -0.398577   
s6      0.301731  0.208133  0.388680  0.390430  0.325717  0.290600 -0.273697   
Target  0.187889  0.043062  0.586450  0.441482  0.212022  0.174054 -0.394789   

              s4  

In [8]:
# Step 6: Separate Features (X) and Target (y)
# X keeps every column except 'Target'; y is the 'Target' column on its own.
X = dataset_frame.drop(columns=['Target'])
y = dataset_frame.loc[:, 'Target']


In [9]:
# Step 7: Train-Test Split
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Step 8: Standardize the Features
# Create the scaler with its centering and unit-variance scaling spelled out;
# it is fitted in the next cell.
scaler = StandardScaler(with_mean=True, with_std=True)


In [11]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits; the test rows are never used for fitting.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
# Step 9: Train the Linear Regression Model
# Ordinary least-squares regressor with an intercept term; fitted in the next cell.
model_reg = LinearRegression(fit_intercept=True)


In [13]:
# Fit the regressor on the scaled training features and their targets.
# Left as the cell's last expression so the fitted estimator is displayed.
model_reg.fit(X=X_train_scaled, y=y_train)

In [14]:
# Step 10: Model Evaluation
# Coefficients (weights) and Intercept of the trained model.
# Labels and values are printed by separate calls: print("label:\n", value)
# would insert print()'s separator space after the newline, shifting the first
# coefficient row against the wrapped rows and indenting the intercept.
print("Model Coefficients:")
print(model_reg.coef_)
print("Model Intercept:")
print(model_reg.intercept_)

Model Coefficients:
 [  1.75375799 -11.51180908  25.60712144  16.82887167 -44.44885564
  24.64095356   7.67697768  13.1387839   35.16119521   2.35136365]
Model Intercept:
 153.73654390934846


In [15]:
# Generate predictions for the held-out rows, using the test features that
# were scaled with the training-fitted scaler.
y_pred = model_reg.predict(X=X_test_scaled)

In [16]:
# Model performance evaluation
from sklearn.metrics import mean_squared_error, r2_score

In [17]:
# Score the predictions against the true test targets:
# mean squared error (lower is better) and R2 (closer to 1 is better).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [18]:
# Report both test-set metrics, one per line.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error: 2900.1936284934823
R-squared: 0.45260276297191926
