Data Preprocessing (src/data_processing.py)

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

# โหลดข้อมูล
df = pd.read_csv("data/raw/house_prices.csv")

# จัดการข้อมูลที่หายไป
# เติมค่า NaN ในคอลัมน์ที่มีชนิดข้อมูลเป็นตัวเลขด้วยค่ามัธยฐาน
numeric_columns = df.select_dtypes(include=['number']).columns  # เลือกคอลัมน์ที่เป็นตัวเลข

# เติมค่า NaN ด้วยค่า median
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

# แปลงข้อมูลให้เป็นตัวเลข
df = pd.get_dummies(df, drop_first=True)

# แยก Features & Target
X = df.drop(columns=["id","price"])
y = df["price"]

# แบ่งข้อมูล train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# บันทึกข้อมูล
os.makedirs("data/processed", exist_ok=True)
X_train.to_csv("data/processed/X_train.csv", index=False)
X_test.to_csv("data/processed/X_test.csv", index=False)
y_train.to_csv("data/processed/y_train.csv", index=False)
y_test.to_csv("data/processed/y_test.csv", index=False)

print("✅ Data processing complete!")


✅ Data processing complete!


train_model (src/train_model.py)

In [1]:
import pandas as pd
import joblib
import yaml
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

params = yaml.safe_load(open("params.yaml"))["train"]

X_train = pd.read_csv("data/processed/X_train.csv")
y_train = pd.read_csv("data/processed/y_train.csv")

model = RandomForestRegressor(n_estimators=params["n_estimators"], max_depth=params["max_depth"], random_state=42)

model.fit(X_train, y_train.values.ravel())

joblib.dump(model, "models/house_price_model.pkl")

print("✅ Model training complete!")


✅ Model training complete!


evaluate model (src/evaluate_model.py)

In [2]:
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error

X_test = pd.read_csv("data/processed/X_test.csv")
y_test = pd.read_csv("data/processed/y_test.csv")

model = joblib.load("models/house_price_model.pkl")

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"📊 Model Evaluation:\n - MAE: {mae}\n - MSE: {mse}")


📊 Model Evaluation:
 - MAE: 251983.3896785563
 - MSE: 84491458718.53607


Prediction Model (src/inference.py)

In [3]:
# import sys
# import pickle
# import pandas as pd

# model_path = "models/house_price_model.pkl"
# with open(model_path, "rb") as file:
#     model = pickle.load(file)

# square_feet = int(sys.argv[1])
# bedrooms = int(sys.argv[2])
# bathrooms = int(sys.argv[3])
# location = sys.argv[4]
# year_built = int(sys.argv[5])

# locations = ["Countryside", "Downtown", "Mountain", "Suburban"]  
# location_encoded = {f"location_{loc}": 0 for loc in locations}
# if f"location_{location}" in location_encoded:
#     location_encoded[f"location_{location}"] = 1 

# input_data = {
#     "square_feet": [square_feet],
#     "bedrooms": [bedrooms],
#     "bathrooms": [bathrooms],
#     "year_built": [year_built],
#     **location_encoded  
# }

# input_df = pd.DataFrame(input_data)

# predicted_price = model.predict(input_df)[0]

# print(f"Predicted House Price: ${predicted_price:,.2f}")


import joblib
import pandas as pd

model_path = "models/house_price_model.pkl"
model = joblib.load(model_path)

print(f"✅ Model Loaded! Type: {type(model)}")  

square_feet = int(input("Enter square feet: "))
bedrooms = int(input("Enter number of bedrooms: "))
bathrooms = int(input("Enter number of bathrooms: "))
location = input("Enter location (Countryside/Downtown/Mountain/Suburban): ")
year_built = int(input("Enter year built: "))

locations = ["Countryside", "Downtown", "Mountain", "Suburban"]
location_encoded = {f"location_{loc}": 0 for loc in locations}
if f"location_{location}" in location_encoded:
    location_encoded[f"location_{location}"] = 1

input_data = {
    "square_feet": [square_feet],
    "bedrooms": [bedrooms],
    "bathrooms": [bathrooms],
    "year_built": [year_built],
    **location_encoded  
}

input_df = pd.DataFrame(input_data)

predicted_price = model.predict(input_df)[0]
print(f"Predicted House Price: ${predicted_price:,.2f}")




✅ Model Loaded! Type: <class 'sklearn.ensemble._forest.RandomForestRegressor'>


ValueError: invalid literal for int() with base 10: ''