# Data Pre-Processing

#### Import Libraries

In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#### Load the data

In [15]:
# Read the raw housing table; the path is relative to the notebook's directory.
raw_csv_path = '../data/BostonHousing.csv'
data_frame = pd.read_csv(raw_csv_path)

#### Handle missing values (none in this dataset, but included for completeness)

In [17]:
# Discard every row that has at least one missing value (pandas defaults,
# spelled out explicitly). Re-running this cell is harmless: a frame with no
# missing values passes through unchanged.
data_frame = data_frame.dropna(axis=0, how='any')

#### Separate features and target variable: 'X' contains all features except 'medv', which is the target variable

In [19]:
# 'y' is the target column 'medv'; 'X' is every remaining column.
y = data_frame['medv']
X = data_frame.drop(columns=['medv'])

#### Normalize features using StandardScaler: 'X_scaled' is the normalized version of 'X', in which each feature has mean 0 and standard deviation 1

In [52]:
# Standardize every feature column of X to zero mean and unit variance.
# fit_transform returns a plain ndarray, so the result is wrapped back into a
# DataFrame that keeps X's column names and row index (the shared index keeps
# X_scaled aligned with y).
#
# This cell works only on the full feature table: X_train / X_test are not
# created until the split step further down, so nothing here may refer to them
# (doing so raises NameError on a fresh kernel).
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

#### Split the dataset into training and testing sets (80% of the data is used for training, and 20% is used for testing) and save the preprocessed data to CSV files for future use: training features and target first, then testing features and target

In [54]:
# Split the *unscaled* features first so that the held-out rows never
# contribute to the scaling statistics (avoids train/test leakage).
# random_state pins the shuffle, so the 80/20 partition is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn mean / standard deviation from the training rows only, then apply that
# same fitted transform to the test rows. The ndarray results are wrapped back
# into DataFrames so column names survive into the CSV files.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Persist the preprocessed splits for later notebooks: training features and
# target first, then testing features and target. index=False keeps the
# shuffled row labels out of the files.
X_train.to_csv('../data/X_train.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)

X_test.to_csv('../data/X_test.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

#### Display the shapes of the training and testing datasets

In [94]:
# Sanity check: report (rows, columns) for each feature split and the length
# of each target split, one line per split.
shape_report = [
    f"Training set shape: X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"Testing set shape: X_test: {X_test.shape}, y_test: {y_test.shape}",
]
print(*shape_report, sep="\n")

Training set shape: X_train: (404, 13), y_train: (404,)
Testing set shape: X_test: (102, 13), y_test: (102,)
