# Implement a Python script using PyTorch to preprocess a dataset suitable for regression, including data normalization and handling missing values.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Custom PyTorch Dataset class
class CustomDataset(Dataset):
    """Map-style dataset pairing feature rows with regression targets.

    Both inputs are copied into ``float32`` tensors when the dataset is
    constructed.

    Args:
        features: Array-like whose first dimension indexes samples.
        targets: Array-like with one entry per sample.

    Raises:
        ValueError: If ``features`` and ``targets`` do not contain the same
            number of samples.
    """

    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

        # __len__ is driven by the features alone, so a length mismatch would
        # otherwise surface as an IndexError part-way through iteration (too
        # few targets) or silently ignore data (too many targets).
        if self.features.shape[:1] != self.targets.shape[:1]:
            raise ValueError(
                "features and targets must have the same number of samples, "
                f"got shapes {tuple(self.features.shape)} and "
                f"{tuple(self.targets.shape)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.features[index], self.targets[index]

# 1. Load dataset (replace with your actual dataset path)
def load_dataset(n_samples=1000, n_features=5, seed=42):
    """Create a synthetic regression dataset for demonstration purposes.

    The target is ``2 * feature_0 + 3 * feature_1 - feature_2`` plus Gaussian
    noise scaled by 0.5; any remaining features do not contribute to it.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns; must be at least 3 because the
            target is built from the first three columns.
        seed: Seed for the private random generator.

    Returns:
        A ``(DataFrame, ndarray)`` tuple: the features with columns named
        ``feature_0`` .. ``feature_{n_features - 1}``, and the 1-D target.

    Raises:
        ValueError: If ``n_features`` is smaller than 3.
    """
    if n_features < 3:
        raise ValueError("n_features must be at least 3 to build the target")

    # A private RandomState yields the same stream as seeding the global
    # generator, but leaves the caller's np.random state untouched.
    rng = np.random.RandomState(seed)
    data = rng.randn(n_samples, n_features)
    noise = rng.randn(n_samples) * 0.5
    target = 2 * data[:, 0] + 3 * data[:, 1] - data[:, 2] + noise  # Synthetic target

    columns = [f"feature_{i}" for i in range(n_features)]
    return pd.DataFrame(data, columns=columns), target
# 2. Handle missing values
def handle_missing_values(df):
    """Return a copy of ``df`` with missing numeric values mean-imputed.

    Each numeric column has its NaN entries replaced by that column's mean.
    Non-numeric columns are left unchanged. A column that is entirely NaN has
    a NaN mean and therefore stays NaN.

    Args:
        df: Input ``DataFrame``; it is not modified.

    Returns:
        A new ``DataFrame`` with the imputed values.
    """
    # numeric_only=True keeps the mean from failing on string/object columns
    # (pandas 2.x raises a TypeError for those by default).
    column_means = df.mean(numeric_only=True)
    return df.fillna(column_means)
# 3. Normalize data
def normalize_data(df, target):
    """Standardise the features and the target.

    Features are passed through a freshly fitted ``StandardScaler``; the
    target is shifted by its mean and divided by its standard deviation.

    Args:
        df: Feature table accepted by ``StandardScaler.fit_transform``.
        target: Target values supporting NumPy arithmetic.

    Returns:
        A ``(features, target)`` tuple holding the standardised values.

    Note:
        The fitted scaler and the target statistics are not returned, so the
        transformation cannot be re-applied to new data or inverted by the
        caller.
    """
    scaler = StandardScaler()
    df_normalized = scaler.fit_transform(df)

    target_std = np.std(target)
    if target_std == 0:
        # A constant target would otherwise give 0 / 0 = NaN for every entry;
        # dividing by 1 leaves the centred (all-zero) target instead.
        target_std = 1.0
    target_normalized = (target - np.mean(target)) / target_std
    return df_normalized, target_normalized
# 4. Create DataLoader
def create_dataloader(features, targets, batch_size=32, shuffle=True):
    """Wrap features and targets in a batched ``DataLoader``.

    Args:
        features: Feature values, forwarded to ``CustomDataset``.
        targets: Target values, forwarded to ``CustomDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples; pass ``False``
            for validation or test data where a stable order is wanted.

    Returns:
        A ``DataLoader`` over a ``CustomDataset`` built from the inputs.
    """
    dataset = CustomDataset(features, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
# 5. Example usage
if __name__ == "__main__":
    # Build the demonstration feature table and its regression target.
    df, target = load_dataset()

    # Blank out every tenth entry of the first column so that the imputation
    # step below has something to fill in.
    df.iloc[::10, 0] = np.nan

    # Impute the gaps, then standardise features and target.
    df = handle_missing_values(df)
    features, targets = normalize_data(df, target)

    # Wrap the arrays in a batched loader.
    dataloader = create_dataloader(features, targets, batch_size=32)

    # Show only the first batch (nothing is printed if the loader is empty).
    first_batch = next(iter(dataloader), None)
    if first_batch is not None:
        batch_features, batch_targets = first_batch
        print("Features:", batch_features)
        print("Targets:", batch_targets)


Features: tensor([[ 1.4545e+00, -5.2974e-01, -2.1274e-01, -1.7616e+00, -8.4861e-01],
        [-1.4056e+00, -5.7796e-01, -4.1380e-02, -4.5429e-01, -7.3382e-01],
        [-9.7347e-01, -8.9362e-01, -6.6910e-01,  2.9776e-01, -2.2277e-01],
        [-7.6809e-01,  5.2345e-01,  3.1562e-01,  1.6837e-01, -1.0401e+00],
        [ 6.2588e-01,  2.7147e-01, -1.9447e+00, -3.7252e-01, -2.2778e+00],
        [ 7.1746e-01,  1.1574e+00, -2.7576e-01, -3.6183e-01,  4.7281e-01],
        [ 9.1907e-01, -7.2524e-01,  3.3235e-02,  7.0547e-01, -2.4502e-01],
        [ 9.1349e-18, -3.5760e-01, -6.7820e-01,  6.3078e-01,  1.0099e+00],
        [ 1.0244e+00, -5.7104e-02, -3.2283e-01,  1.5138e-01, -8.7823e-01],
        [ 2.6461e-01, -4.0780e-01, -7.0337e-02,  2.1819e+00, -2.8317e-01],
        [-1.8966e-03,  4.8315e-01, -2.7188e-01, -1.9210e+00, -4.7111e-01],
        [-5.5591e-01, -2.0549e-01, -9.9410e-01, -2.6777e+00, -2.2651e-01],
        [ 1.1505e+00, -1.0029e-01, -2.1668e+00,  8.7651e-01, -5.7488e-01],
        [-1.310