In [None]:
# Install the required libraries
!pip install pandas numpy scikit-learn joblib

# Import Libraries
import io

import joblib
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




In [None]:
# Upload train.csv from the local machine. files.upload() returns a dict keyed
# by file name whose values are the uploaded contents as bytes.
uploaded = files.upload()

# Load the dataset from the uploaded bytes instead of a hard-coded path.
# When a file called train.csv already exists in the working directory, Colab
# stores the new upload under a different name (e.g. "train (2).csv"), so
# reading "train.csv" from disk would silently load the older copy.
if uploaded:
    csv_bytes = next(iter(uploaded.values()))
    df = pd.read_csv(io.BytesIO(csv_bytes))
else:
    # Nothing was uploaded: fall back to a train.csv already on disk.
    df = pd.read_csv("train.csv")

# Show the first 5 rows
df.head()


Saving train.csv to train (2).csv


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Partition the column labels by dtype: numeric vs. object (text) columns
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Print the per-column count of missing values, numeric group first
for heading, cols in (
    ("🔍 Missing Values in Numeric Columns:", numeric_cols),
    ("\n🔍 Missing Values in Categorical Columns:", categorical_cols),
):
    print(heading)
    print(df[cols].isna().sum())


🔍 Missing Values in Numeric Columns:
Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

🔍 Missing Values in Categorical Columns:
MSZoning            0
Street              0
Alley            1369
LotShape            0
LandContour         0


In [None]:
# NOTE(review): these fill values are computed on the full frame, before the
# train/test split performed in a later cell, so the held-out rows contribute
# to them. Confirm whether that is acceptable for this evaluation.

# Fill missing numeric values with each column's mean
numeric_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(numeric_means)

# Fill missing categorical values with each column's most frequent value
# (the first row of .mode() holds one mode per column)
categorical_modes = df[categorical_cols].mode().iloc[0]
df[categorical_cols] = df[categorical_cols].fillna(categorical_modes)


In [None]:
# One-hot encode the non-numeric columns; drop_first=True omits the first
# level of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)


In [None]:
# Remove the "Id" column so it is not used as a feature. Rebinding
# with errors="ignore" (instead of inplace=True) keeps this cell idempotent:
# re-running it after "Id" is already gone no longer raises KeyError.
df = df.drop(columns=["Id"], errors="ignore")


In [None]:
# Target is SalePrice; every remaining column is a feature
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Persist the feature names, in column order
joblib.dump(X.columns.to_list(), "feature_columns.pkl")

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the standardizer on the training features only, then apply that same
# fitted transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the fitted scaler
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [None]:
# Train a Ridge regressor (alpha=1.0) on the scaled training features.
# fit() returns the estimator itself, so construction and fitting are chained.
linear_model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

# Save the model (the file name says "linear_regression", the estimator is Ridge)
joblib.dump(linear_model, "linear_regression_model.pkl")


['linear_regression_model.pkl']

In [None]:
# Train a 300-tree random forest with a fixed seed on the scaled training
# features. fit() returns the estimator itself, so the calls are chained.
rf_model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
).fit(X_train_scaled, y_train)

# Save the model
joblib.dump(rf_model, "random_forest_model.pkl")


['random_forest_model.pkl']

In [None]:
# mean_absolute_error, mean_squared_error and np are already imported in the
# notebook's first cell, so they are not re-imported here.


def _mae_and_rmse(y_true, y_pred):
    """Return ``(MAE, RMSE)`` of ``y_pred`` measured against ``y_true``."""
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as np.sqrt() of the MSE instead of passing squared=False
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse


# Predict on the held-out test set with both models
y_pred_linear = linear_model.predict(X_test_scaled)
y_pred_rf = rf_model.predict(X_test_scaled)

# Compute MAE and RMSE for each model
mae_linear, rmse_linear = _mae_and_rmse(y_test, y_pred_linear)
mae_rf, rmse_rf = _mae_and_rmse(y_test, y_pred_rf)

# Report the results
print(f"📊 Linear Regression - MAE: {mae_linear:.2f}, RMSE: {rmse_linear:.2f}")
print(f"🌲 Random Forest - MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}")


📊 Linear Regression - MAE: 19878.89, RMSE: 47871.42
🌲 Random Forest - MAE: 17635.03, RMSE: 28900.08


In [None]:
from google.colab import files

# Trigger a browser download for each saved artifact, in the same order as
# before: both models, then the scaler, then the feature-name list.
for artifact_name in (
    "linear_regression_model.pkl",
    "random_forest_model.pkl",
    "scaler.pkl",
    "feature_columns.pkl",
):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>