In [59]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
#Load housing dataset
df = pd.read_pickle("../data/eda_df.pkl")

In [62]:
#Check missing data
df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

In [63]:
# Handle missing values if any
df = df.dropna()

In [64]:
# Handle outliers using Z-score

from scipy.stats import zscore  # Import the zscore function from scipy to compute standard scores

z_scores = np.abs(zscore(df.select_dtypes(include='number')))  
# Compute the absolute Z-scores for all numeric columns in the DataFrame

df = df[(z_scores < 3).all(axis=1)]  
# Keep only those rows where all numeric column values have Z-scores less than 3 (i.e., not outliers)


In [65]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [7]:
print(df.dtypes)

crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
b          float64
lstat      float64
medv       float64
dtype: object


### Encode Categorical Variables
Categorical data refers to variables that represent categories or groups rather than numerical values. These can be:

Nominal: No natural order (e.g., colors: red, green, blue)

Ordinal: Have a meaningful order (e.g., education level: high school < bachelor < master)

In [8]:
# Encode categorical column if it's object type or binary
if df["chas"].dtype == 'object':
    df = pd.get_dummies(df, columns=["chas"], drop_first=True)


In [28]:
# Split the data into the train-test split i.e. Separate target from the features 
X = df.drop(columns=["medv"])  # Drop the target column 'medv' to create the feature matrix X

y = df["medv"]  # Extract the target variable 'medv' into y

In [29]:
# Split into train and test sets
from sklearn.model_selection import train_test_split  # Import function to split the dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Split X and y into training and testing sets (80% train, 20% test); random_state ensures reproducibility

In [30]:
# Standardization
from sklearn.preprocessing import StandardScaler  # Import StandardScaler to standardize features

scaler = StandardScaler()  # Create a scaler object

X_train = scaler.fit_transform(X_train)  # Fit scaler on training data and transform it (mean=0, std=1)

X_test = scaler.transform(X_test)  # Transform the test data using the same scaler (do not fit again)
#Because we want to ensure that everything learned by the model comes only from the training data—not from the test data.

In [31]:
# Target train data
# The target data doesn't need to be standardized.
y_train.head()

292    27.9
449    13.0
217    28.7
164    22.7
5      28.7
Name: medv, dtype: float64

In [32]:
# Target test data
# The target data doesn't need to be standardized.
y_test.head()

58     23.3
146    15.6
187    32.0
59     19.6
407    27.9
Name: medv, dtype: float64

In [36]:
# Convert to DataFrame
# X_train and X-test are no longer Pandas DataFrames after standardization, it becomes a NumPy array, and NumPy arrays don’t have a .head() method — that’s specific to Pandas DataFrames.
X_train = pd.DataFrame(X_train, columns=X.columns)  # Convert back to DataFrame with original column names
X_test = pd.DataFrame(X_test, columns=X.columns)

In [37]:
# View scaled train data
X_train.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,-0.500207,3.55789,-0.860253,0.0,-1.236528,0.53919,-1.566056,0.62382,-0.583701,-0.932525,0.373738,0.440129,-1.15285
1,1.06372,-0.481158,1.064854,0.0,1.510161,0.196565,1.093971,-0.834838,1.855426,1.673877,0.836728,-1.325933,1.064275
2,-0.493112,-0.481158,0.448527,0.0,0.027677,0.558493,0.625182,-0.21982,-0.461744,-0.740605,-0.922632,0.361629,-0.395598
3,-0.039537,-0.481158,1.28152,0.0,0.527902,-0.709059,0.863128,-0.716919,-0.461744,0.045649,-1.709714,0.406023,-0.099678
4,-0.501522,-0.481158,-1.265769,0.0,-0.809063,0.217477,-0.312398,1.09425,-0.705657,-1.074918,0.142244,0.38716,-1.075455


In [38]:
# View scaled test data
X_test.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,-0.475505,0.781045,-0.833901,0.0,-0.854538,-0.240965,-1.360073,1.96625,-0.095875,-0.691077,0.605233,0.321617,-0.825061
1,-0.057768,-0.481158,1.28152,0.0,2.947171,-1.072595,1.154346,-1.167398,-0.461744,0.045649,-1.709714,-3.897002,0.66061
2,-0.491312,1.790807,-1.08131,0.0,-1.000058,0.783693,-0.937451,-0.036972,-0.461744,0.014694,-1.478219,0.382397,-0.852377
3,-0.48619,0.781045,-0.833901,0.0,-0.854538,-0.591633,-0.720814,1.527016,-0.095875,-0.691077,0.605233,0.440129,-0.466922
4,1.987704,-0.481158,1.064854,0.0,1.019031,-1.104767,1.154346,-1.28253,1.855426,1.673877,0.836728,-0.794723,-0.025318


In [40]:
X_train.to_pickle("../data/X_train.pkl") # saving to pkl file
X_test.to_pickle("../data/X_test.pkl") # saving to pkl file
y_train.to_pickle("../data/y_train.pkl") # saving to pkl file
y_test.to_pickle("../data/y_test.pkl") # saving to pkl file