In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the DataFrame stored as a pickle under ../data (presumably written by the
# EDA notebook -- TODO confirm). Unpickling can execute arbitrary code, so only
# load pickle files that come from a trusted source.
df = pd.read_pickle("../data/eda_df.pkl")

In [None]:
# Preview the first rows to confirm the frame loaded as expected.
df.head()

# Handling Missing values

In [None]:
# Count the missing entries in every column @sabinvankathmandu
# (isna() is the canonical spelling; isnull() is an alias for it)
missing_values = df.isna().sum()

print("Missing Values:\n", missing_values)

In [None]:
# Discard every row that contains at least one missing value @sabinvankathmandu
df = df.dropna(axis=0, how="any")
# Sanity check: every per-column count shown below should now be zero
df.isna().sum()

### Removing Outliers (IQR Method)

In [None]:
# Shape (rows, columns) before outlier removal, kept as a baseline to compare with
# the shape printed after the Interquartile Range (IQR) filter is applied
# - Chatgpt assisted code here @sabinvankathmandu
df.shape


In [None]:
def remove_outliers(df, columns=None, factor=1.5):
    """Drop rows that hold an IQR outlier in any of the inspected columns.

    For every inspected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (Q1/Q3 are the 25th/75th percentiles and
    ``IQR = Q3 - Q1``). A row is removed when at least one of its inspected
    values lies strictly outside that column's fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : str or iterable of column labels, optional
        Columns to test for outliers. Defaults to every numeric column, so
        text/categorical columns are carried through untouched instead of
        being passed to ``quantile`` and the fence comparisons.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with all original columns and their original
        index labels.

    Notes
    -----
    * A column whose IQR is zero (Q1 == Q3) has both fences equal to that
      quartile value, so every row holding a different value in that column
      is removed. Use ``columns`` to leave such columns out of the test.
    * Missing values never compare as outside the fences, so they do not
      cause a row to be dropped.
    """
    if columns is None:
        inspected = df.select_dtypes(include="number")
    else:
        if isinstance(columns, str):
            columns = [columns]
        inspected = df[list(columns)]

    # Nothing to test: return an unfiltered copy rather than failing.
    if inspected.shape[1] == 0:
        return df.copy()

    Q1 = inspected.quantile(0.25)  # 25th percentile
    Q3 = inspected.quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1                  # Interquartile range

    # Define outlier threshold
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # Flag a row as soon as one inspected value falls outside its column's fences
    is_outlier = ((inspected < lower_bound) | (inspected > upper_bound)).any(axis=1)

    # Remove outliers
    df_cleaned = df[~is_outlier]
    return df_cleaned

# Apply the IQR filter to the whole frame and rebind `df` to the filtered result.
# NOTE: this cell is not idempotent -- the quartiles are recomputed from whatever
# `df` currently holds, so running it again can remove further rows. Re-run from
# the load cell to get back to the unfiltered data.
df = remove_outliers(df)
print(f"New dataset shape after outlier removal: {df.shape}")

### Check for remaining outliers using boxplots @sabinvankathmandu

In [None]:
# Boxplots for each column to check for remaining outliers @sabinvankathmandu

# Size the grid from the actual number of columns. A fixed 4x4 grid makes
# plt.subplot raise as soon as the frame has more than 16 columns.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(df.columns) / n_grid_cols)))

# 2.5 inches per row reproduces the original 15x10 figure for a 4-row grid
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(15, 2.5 * n_grid_rows),
    squeeze=False,  # always get a 2-D array of axes, even for a single row
)
flat_axes = axes.ravel()

# One boxplot per column, filling the grid row by row @sabinvankathmandu
for ax, column in zip(flat_axes, df.columns):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(f"Boxplot of {column}")

# Hide the panels left over in the last row
for ax in flat_axes[len(df.columns):]:
    ax.set_visible(False)

fig.tight_layout()  # adjust the layout to prevent overlapping @sabinvankathmandu
plt.show()

# Normalize / Standardize Numerical Features

In [None]:
# Separate the predictors from the target (medv) @sabinvankathmandu
features = df.drop(columns=["medv"])
target = df["medv"]

# Fit the scaler on the training rows only. Fitting it on every row lets the
# held-out test rows influence the mean/std used for scaling (data leakage).
# The split arguments below must stay identical to the train/test split cell
# further down (test_size=0.2, random_state=42): with the same row count and
# seed, both calls select the same rows for training.
features_fit, _features_holdout = train_test_split(
    features, test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(features_fit)

# Scale every row using the statistics learned from the training rows
feature_scaled = scaler.transform(features)

# DF scaled (built from the raw array, so it gets a fresh default index)
df_scaled = pd.DataFrame(feature_scaled, columns=features.columns)


In [None]:
# Inspect the scaled feature frame (the medv target is attached in a later cell).
df_scaled

In [None]:
# Add the target column back onto the scaled features @sabinvankathmandu
# The raw array (.values) keeps the assignment positional: df_scaled was built
# with a fresh default index, whereas `target` still carries df's index labels.
df_scaled = df_scaled.assign(medv=target.values)

In [None]:
# Quick look at the first two rows to confirm the medv column was added.
df_scaled.head(2)

# Split into Train and Test data

In [None]:
# Model inputs: X holds the predictors, y the medv target @sabinvankathmandu
y = df_scaled.loc[:, "medv"]
X = df_scaled.drop("medv", axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Showing train and test data

In [None]:
# First five rows of the training features
print("TRAINED DATA:: \n", X_train.head())

# First five rows of the test features
print("\nTEST DATA:: \n", X_test.head())


# Save Processed data to pkl

In [None]:
# Persist the processed frame and the four split pieces as pickle files
pickle_targets = [
    (df_scaled, "../data/datapreprocessing_df.pkl"),
    (X_train, "../data/X_train.pkl"),
    (X_test, "../data/X_test.pkl"),
    (y_train, "../data/y_train.pkl"),
    (y_test, "../data/y_test.pkl"),
]
for processed_obj, destination in pickle_targets:
    processed_obj.to_pickle(destination)
