In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
data = pd.read_pickle("data.pkl")
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [13]:
# handling missing values and outliers
data.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

In [14]:
# no missing values 
# now handling the outliers using IQR method
def remove_outliers(data, column):
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    IQR = q3-q1
    lower_bound = q1 - 1.5 * IQR
    upper_bound = q3 + 1.5* IQR
    data = data[(data[column]>=lower_bound) & (data[column]<= upper_bound)]
    return data

numeric_columns = ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'b', 'lstat', 'medv']
for col in numeric_columns:
    data = remove_outliers(data, col)

In [15]:
# Encode categorical variable
# there is only one categorical variable 'chas' which is in binary i.e. 0 and 1 so doesnt require to encode categorical variable.

In [16]:
# moramlizing numerical features
# seperate features and target
X = data.drop('medv', axis = 1)
y = data['medv']

# standarize the feature

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)


In [17]:
# split the dataset into train test 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state = 42, test_size = 0.2)

print(f"training shape: {X_scaled.shape}")
print(f"testing shape: {X_test.shape}")

training shape: (231, 13)
testing shape: (47, 13)


In [18]:
# save preprocessed data
X_train.to_pickle("X_train.pkl")
X_test.to_pickle("X_test.pkl")
y_train.to_pickle("y_train.pkl")
y_test.to_pickle("y_test.pkl")
print("Preprocessed data saved as X_train.pkl, X_test.pkl, y_train.pkl, y_test.pkl")

Preprocessed data saved as X_train.pkl, X_test.pkl, y_train.pkl, y_test.pkl
