# Data Preprocessing: Data Normalization

Normalization is a data preprocessing technique used to transform the values of numeric columns in the dataset to a common scale, without distorting differences in the ranges of values or losing information. It’s about adjusting the scale of your data to level the playing field for all the features in your dataset.

## Example:

This example uses a synthetic housing prices dataset. It includes features such as the size of the house (in square feet), the number of bedrooms, and the age of the house (in years), with the target variable being the house price.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fix the state of NumPy's global random generator so that every run of the
# notebook draws exactly the same synthetic data
np.random.seed(seed=42)

In [None]:
# Synthetic housing features for 100 houses.
# Size (square feet): a normally distributed base value plus a second,
# smaller normally distributed noise term.
base_size = np.random.normal(loc=3000, scale=750, size=100)
size_noise = np.random.normal(loc=0, scale=200, size=100)
size = base_size + size_noise

# Bedrooms: integers from 1 to 4 (the upper bound of randint is exclusive)
bedrooms = np.random.randint(low=1, high=5, size=100)

# Age (years): integers from 1 to 29
age = np.random.randint(low=1, high=30, size=100)

In [None]:
# Introduce outliers: turn the last two houses into extreme cases
size[98:100] += 5000  # Extreme size values for outliers
bedrooms[98:100] = 6  # More bedrooms than typical houses (which have 1-4)
# Significantly older: ADD years. Subtracting 25 from ages drawn from 1-29
# would make these houses younger and could even produce negative ages.
age[98:100] += 25

# Simulate house prices as a linear function of the features (larger and
# more bedrooms raise the price, age lowers it) plus Gaussian noise
prices = size * 200 + bedrooms * 5000 + age * -1000 + np.random.normal(0, 15000, 100)

In [None]:
# Collect the three features and the target into one table, one row per house
column_values = {'Size': size, 'Bedrooms': bedrooms, 'Age': age, 'Price': prices}
housing_data = pd.DataFrame(data=column_values)

In [None]:
# Initialize the MinMaxScalers: one for the features and a separate one for
# the target, so the target scaling can be inverted on its own later
scaler_features = MinMaxScaler()
scaler_price = MinMaxScaler()

features_to_normalize = ['Size', 'Bedrooms', 'Age']

# Fit the scalers on the training rows only. Fitting on the full dataset
# would let the min/max of the test rows leak into the preprocessing.
# test_size and random_state must match the train/test split performed
# later in the notebook, so that the rows selected here are exactly the rows
# that end up in the training set.
train_index, _ = train_test_split(housing_data.index, test_size=0.2, random_state=42)
scaler_features.fit(housing_data.loc[train_index, features_to_normalize])
scaler_price.fit(housing_data.loc[train_index, ['Price']])

# Normalize the features (test rows may fall slightly outside [0, 1], which
# is expected when the scaler has only seen the training rows)
housing_data[features_to_normalize] = scaler_features.transform(housing_data[features_to_normalize])

# Normalize the target variable; ravel() flattens the (n, 1) result to 1-D
housing_data['Price'] = scaler_price.transform(housing_data[['Price']]).ravel()


In [None]:
# Separate the feature columns from the target column
X = housing_data.drop(columns=['Price'])
y = housing_data.loc[:, 'Price']

# Hold out 20% of the houses for testing; the fixed random_state makes the
# partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split
# (fit() returns the fitted estimator itself, so the call can be chained)
model = LinearRegression().fit(X_train, y_train)

# Predict the normalized prices of the held-out test houses
predictions = model.predict(X_test)


In [None]:
# Map the normalized test targets and predictions back to the original price
# scale, so the error and the plot in the following cells are expressed in
# actual prices. (The model is already trained and `predictions` already
# computed in the previous cell, so nothing is re-fitted here.)
# inverse_transform expects a 2-D array, hence the reshape; ravel() flattens
# the result back to 1-D.
y_test_original_scale = scaler_price.inverse_transform(
    y_test.to_numpy().reshape(-1, 1)
).ravel()
predictions_original_scale = scaler_price.inverse_transform(
    predictions.reshape(-1, 1)
).ravel()


In [None]:
# Score the model with the mean squared error between actual and predicted
# prices, both expressed in the original price scale
mse = mean_squared_error(
    y_true=y_test_original_scale,
    y_pred=predictions_original_scale,
)
print(f'Mean Squared Error: {mse}')

In [None]:
# Scatter the actual test prices against the model's predictions, both in
# the original price scale
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_original_scale, predictions_original_scale)

# Dashed diagonal y = x over the range of actual prices: a perfect
# prediction would fall exactly on this line
price_range = [y_test_original_scale.min(), y_test_original_scale.max()]
ax.plot(price_range, price_range, 'k--', lw=4)

ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs. Predicted Prices')
plt.show()