In [None]:
# Import libraries and configuration

import yaml
import pandas as pd

from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Read the project configuration and load the cleaned dataset it points to.
config_path = Path("../config.yaml")
with config_path.open(mode="r") as f:
    config = yaml.safe_load(f)

# The CSV location stored in the config is resolved against the directory
# that contains config.yaml.
csv_relative_path = config["data"]["clean_data_csv"]["clean_data"]
csv_path = config_path.parent.joinpath(csv_relative_path)

df = pd.read_csv(csv_path)

In [None]:
# Display the loaded DataFrame as this cell's output for a quick visual check.
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# The target is the selling price; every other column is kept as a feature.
target = df["selling_price"]
features = df.drop("selling_price", axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
# Preview the first five values of the training target.
y_train.head(n=5)

In [None]:
# KNN model (before scaling)

from sklearn.neighbors import KNeighborsRegressor

In [None]:
# k-nearest-neighbors regressor configured with k = 10 (not fitted yet).
knn = KNeighborsRegressor(n_neighbors=10)

In [None]:
# Fit on the raw training features; scaling is only applied in a later cell.
knn.fit(X_train, y_train)

In [None]:
# R² on the test split for the model currently bound to `knn`.
# The format spec is ".2f" (no space sign flag), so positive scores are not
# padded with an extra blank and the output matches the scaled-model print.
print(f"The R2 of the model is {knn.score(X_test, y_test):.2f}")

In [None]:
# Standardize the features: the scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Refit the existing `knn` estimator on the standardized features and report
# its R² on the standardized test split.
knn.fit(X_train_scaled, y_train)
r2_scaled = knn.score(X_test_scaled, y_test)
print(f"The R2 of the model is {r2_scaled:.2f}")


In [None]:
# Evaluate model performance for different k values
# Each candidate gets its own name (`knn_k`) so the sweep does not overwrite
# the fitted `knn` estimator; otherwise every later cell that uses `knn`
# would silently evaluate the last model tried here (k = 20).
# NOTE(review): the scores below are computed on the test split, so choosing k
# from this plot leaks test information; prefer cross-validation on the
# training data for model selection.
k_values = range(1, 21)
scores = []
for k in k_values:
    knn_k = KNeighborsRegressor(n_neighbors=k)
    knn_k.fit(X_train_scaled, y_train)
    scores.append(knn_k.score(X_test_scaled, y_test))  # R² for this k

plt.plot(k_values, scores)
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('R² Score')
plt.title('KNN Performance by Number of Neighbors')
plt.show()


In [None]:
# Regression metrics for the KNN predictions on the standardized test split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# `knn` is whichever estimator that name refers to when this cell runs;
# check the preceding cells to confirm which k is being evaluated.
y_pred = knn.predict(X_test_scaled)

knn_r2 = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)
knn_rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)

print(f"R²: {knn_r2:.3f}")
print(f"MAE: {knn_mae:,.0f}")
print(f"RMSE: {knn_rmse:.0f}")


In [None]:
# Predicted vs. actual selling price; points lying on the dashed diagonal
# are exact predictions.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("Actual Selling Price")
ax.set_ylabel("Predicted Selling Price")
ax.set_title("KNN Predictions vs Actual Values")

# Reference line y = x spanning the observed range of actual prices.
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--')
plt.show()


In [None]:
# Linear Regression trained and scored on the same standardized splits
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

lr_r2 = r2_score(y_test, y_pred_lr)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))  # RMSE = sqrt(MSE)

print(f"R²: {lr_r2:.3f}")
print(f"MAE: {lr_mae:,.0f}")
print(f"RMSE: {lr_rmse:.0f}")


In [None]:
# Performance comparison between models
# The metrics are computed from the predictions produced earlier in the
# notebook (`y_pred` for KNN, `y_pred_lr` for Linear Regression) instead of
# being typed in by hand, so the chart always reflects the current data,
# split and models.
models = ['KNN', 'Linear Regression']
model_predictions = [y_pred, y_pred_lr]
r2_scores = [r2_score(y_test, pred) for pred in model_predictions]
mae_scores = [mean_absolute_error(y_test, pred) for pred in model_predictions]
rmse_scores = [np.sqrt(mean_squared_error(y_test, pred)) for pred in model_predictions]

bar_colors = ['royalblue', 'orange']  # one color per model, shared by all panels

fig, axes = plt.subplots(1, 3, figsize=(12,4))

axes[0].bar(models, r2_scores, color=bar_colors)
axes[0].set_title("R² Score")
axes[0].set_ylim(0,1)

axes[1].bar(models, mae_scores, color=bar_colors)
axes[1].set_title("Mean Absolute Error (MAE)")

axes[2].bar(models, rmse_scores, color=bar_colors)
axes[2].set_title("Root Mean Squared Error (RMSE)")

plt.suptitle("Model Performance Comparison", fontsize=14)
plt.tight_layout()
plt.show()
