In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_diabetes
from sklearn.impute import SimpleImputer

# Load the diabetes dataset (already available in scikit-learn)
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Introduce missing values in the dataset (replace 10% of values with NaN)
import numpy as np
np.random.seed(42)
missing_mask = np.random.rand(X.shape[0], X.shape[1]) < 0.1
X[missing_mask] = np.nan

# Display the dataset with missing values
df = pd.DataFrame(data=np.c_[X, y], columns=[f'feature_{i}' for i in range(X.shape[1])] + ['target'])
print("Dataset with Missing Values:")
print(df.head())

# Handling missing values using SimpleImputer (replace NaN with mean)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Display the dataset after handling missing values
df_imputed = pd.DataFrame(data=np.c_[X_imputed, y], columns=[f'feature_{i}' for i in range(X.shape[1])] + ['target'])
print("\nDataset after Handling Missing Values:")
print(df_imputed.head())

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Display the shapes of training and test sets
print("\nShapes of Training and Test Sets:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the scaled features
print("\nScaled Features:")
print("X_train_scaled:")
print(X_train_scaled[:5, :])  # Displaying the first 5 rows

print("\nX_test_scaled:")
print(X_test_scaled[:5, :])  # Displaying the first 5 rows

Dataset with Missing Values:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.038076   0.050680   0.061696   0.021872  -0.044223  -0.034821   
1        NaN  -0.044642  -0.051474  -0.026328  -0.008449  -0.019163   
2   0.085299   0.050680   0.044451  -0.005670  -0.045599  -0.034194   
3  -0.089063  -0.044642        NaN  -0.036656   0.012191   0.024991   
4   0.005383  -0.044642        NaN   0.021872   0.003935   0.015596   

   feature_6  feature_7  feature_8  feature_9  target  
0        NaN  -0.002592   0.019907  -0.017646   151.0  
1   0.074412  -0.039493  -0.068332  -0.092204    75.0  
2  -0.032356  -0.002592   0.002861        NaN   141.0  
3  -0.036038        NaN   0.022688  -0.009362   206.0  
4   0.008142  -0.002592  -0.031988  -0.046641   135.0  

Dataset after Handling Missing Values:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.038076   0.050680   0.061696   0.021872  -0.044223  -0.034821   
1   0.000426  -0.044642  -