In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 

In [54]:
df = pd.read_csv("../5_merge/complete_data.csv")

# แปลงคอลัมน์หมวดหมู่ (เช่น คณะ) เป็นดัมมี่
df_dummies = pd.get_dummies(df, columns=['Faculty'], drop_first=True)

# แยกตัวแปรต้น (X) และตัวแปรตาม (y)
X = df_dummies.drop(columns=['Count'])  # ตัวแปรต้น (ปีและคณะ)
y = df_dummies['Count']                # ตัวแปรตาม

y_counts = y.value_counts()
valid_classes = y_counts[y_counts >= 2].index
df_filtered = df_dummies[df_dummies['Count'].isin(valid_classes)]

X = df_filtered.drop(columns=['Count'])
y = df_filtered['Count']


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2020)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=2020, stratify=y)

        

In [55]:
# สร้างโมเดล Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# เทรนโมเดล
rf_model.fit(X_train, y_train)


In [56]:
# พยากรณ์ผลลัพธ์
y_pred = rf_model.predict(X_test)

# ดูผลลัพธ์ที่ได้
print(y_pred)


[  3.5    5.91   2.44   8.37   2.18   3.23  30.48  12.51  66.18  15.86
   9.57   8.21  22.32   3.8   18.45  14.79   9.89  11.05  36.97  12.39
   4.95  28.97  35.97   6.6    3.33   1.88   3.3   19.3    2.28  86.39
  44.89   2.     2.4   55.77   2.35  29.64  48.03  14.85   1.61  27.9
   3.47  33.34   3.88  30.48  44.06  48.08  18.89  23.72   3.8   22.84
   5.8   19.73   9.66  73.87   5.23   3.6    3.98  37.57  13.53  11.09
  16.08  11.06  22.5    2.19  31.65   4.36   2.36   1.76  18.86   6.02
  37.91  76.65  14.81   9.97  12.44   9.94   1.76  86.37   8.75   5.99
   2.13   7.55  31.04  14.24 100.28   2.35   6.15  19.3    5.05  14.36
  66.97  13.94  35.84   2.36   2.03   9.34  34.86  47.94   5.2   10.3
  15.49   2.36   4.61  18.54   9.79  15.36  11.81   1.93  51.89   4.63
  19.27   1.86  11.92  12.68   5.6   14.32  42.14   3.33  18.04  14.44
   6.56   2.11  11.34  40.14  43.92   1.36   2.09   2.17   1.39   7.12
   1.85  30.71  24.    18.14  72.54   5.53   1.61  45.01  36.93  37.36
   3.91 

In [57]:
# คำนวณค่าความผิดพลาด
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


Mean Absolute Error (MAE): 10.465466666666666
Mean Squared Error (MSE): 311.87137466666667
Root Mean Squared Error (RMSE): 17.659880369545732
R² Score: 0.5562443745638082


In [58]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],                # จำนวนต้นไม้ในป่า
    'max_depth': [None, 10, 20, 30],                 # ความลึกสูงสุดของต้นไม้
    'min_samples_split': [2, 5, 10],                 # จำนวนตัวอย่างขั้นต่ำในการแบ่งต้นไม้
    'min_samples_leaf': [1, 2, 4],                   # จำนวนตัวอย่างขั้นต่ำในแต่ละใบไม้
    'max_features': ['sqrt', 'log2', 0.5],          # จำนวนฟีเจอร์ที่ใช้ในการแบ่งต้นไม้
    'bootstrap': [True, False]                       # การสุ่มตัวอย่างข้อมูล
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)

# Fit the model with GridSearch
grid_search.fit(X_train_scaled, y_train)

# Best parameters and model
best_rf_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_rf_model.predict(X_test_scaled)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estim