In [6]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt
matplotlib.rcParams["figure.figsize"] = (20, 10)

#### Task 2: Data Preprocessing


Notebook: notebooks/Data_Preprocessing.ipynb
Steps:

- Handle missing values and outliers.
- Encode categorical variables.
- Normalize/standardize numerical features.
- Split the data into training and testing sets.
- Script: scripts/data_preprocessing.py


In [7]:
# Data Preprocessing Workflow
# Import necessary libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Step 1: Load the data
# File Path
file_path = "../data/BostonHousing.csv"
data = pd.read_csv(file_path)
print("Data Loaded Successfully")

# Step 2: Handle Missing Values
def handle_missing_values(data):
    """Handles missing values using SimpleImputer."""
    imputer = SimpleImputer(strategy="median")
    numerical_cols = [col for col in data.columns if pd.api.types.is_numeric_dtype(data[col])]
    data[numerical_cols] = imputer.fit_transform(data[numerical_cols])
    return data

data = handle_missing_values(data)
print("Missing Values Handled With Simple Imputer(Median)")

# Step 3: Handle Outliers
def handle_outliers(data):
    """Handle outliers using the IQR method."""
    for col in data.columns:
        if pd.api.types.is_numeric_dtype(data[col]):
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
    return data

data = handle_outliers(data)
print("Outliers Handled")

# Step 4: Encode Categorical Variables
def encode_categorical_variables(data):
    """Encodes categorical variables using LabelEncoder."""
    if 'chas' in data.columns:  # 'chas' is a binary categorical column
        le = LabelEncoder()
        data['chas'] = le.fit_transform(data['chas'])
    return data

data = encode_categorical_variables(data)
print("Categorical Variables Encoded")

# Step 5: Normalize Numerical Features
def normalize_numerical_features(data, target_column):
    """Normalize numerical features using StandardScaler."""
    scaler = StandardScaler()
    numerical_features = [col for col in data.columns if pd.api.types.is_numeric_dtype(data[col]) and col != target_column]
    data[numerical_features] = scaler.fit_transform(data[numerical_features])
    return data

target_column = 'medv'  # Target variable
data = normalize_numerical_features(data, target_column)
print("Numerical Features Normalized")

# Step 6: Split Data into Training and Testing Sets
def split_data(data, target_column, test_size=0.2, random_state=42):
    """Splits the dataset into training and testing sets."""
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(data, target_column)
print("Training and Testing Data Split")

# Step 7: Save Preprocessed Data
processed_file_path = "../data/processed_boston_housing.csv"
data.to_csv(processed_file_path, index=False)
print(f"Processed Data Saved to {processed_file_path}")

# Summary
print(f"Training Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")


Data Loaded Successfully
Missing Values Handled With Simple Imputer(Mean)
Outliers Handled
Categorical Variables Encoded
Numerical Features Normalized
Training and Testing Data Split
Processed Data Saved to ../data/processed_boston_housing.csv
Training Set Size: (170, 13)
Testing Set Size: (43, 13)
