## ML監督學習 - 房價預測（迴歸）

### import相關套件

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

### 房價資料集載入

In [20]:

# Read the California housing train and test splits from CSV
train_data = pd.read_csv('dataset/california_housing_train.csv')
test_data = pd.read_csv('dataset/california_housing_test.csv')

# Show the first rows of each split as a sanity check that both files loaded
print(train_data.head(), test_data.head(), sep='\n')


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -114.31     34.19                15.0       5612.0          1283.0   
1    -114.47     34.40                19.0       7650.0          1901.0   
2    -114.56     33.69                17.0        720.0           174.0   
3    -114.57     33.64                14.0       1501.0           337.0   
4    -114.57     33.57                20.0       1454.0           326.0   

   population  households  median_income  median_house_value  
0      1015.0       472.0         1.4936             66900.0  
1      1129.0       463.0         1.8200             80100.0  
2       333.0       117.0         1.6509             85700.0  
3       515.0       226.0         3.1917             73400.0  
4       624.0       262.0         1.9250             65500.0  
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.2

### 資料前處理

In [21]:
# Drop rows with missing values.
# The cleaned frames must be assigned back to train_data / test_data:
# everything below reads from those two names, so storing the result in an
# unrelated variable would leave the NaN rows in the features and targets.
train_data = train_data.dropna()
test_data = test_data.dropna()

# Separate the features (X) from the target variable (y)
X_train = train_data.drop('median_house_value', axis=1)
y_train = train_data['median_house_value']

X_test = test_data.drop('median_house_value', axis=1)
y_test = test_data['median_house_value']

# Engineer ratio features, applied identically to both splits so the train
# and test feature sets always have the same columns in the same order:
#   rooms_per_household:      rooms per household
#   bedrooms_per_room:        bedrooms per room
#   population_per_household: people per household
for features in (X_train, X_test):
    features['rooms_per_household'] = features['total_rooms'] / features['households']
    features['bedrooms_per_room'] = features['total_bedrooms'] / features['total_rooms']
    features['population_per_household'] = features['population'] / features['households']

# Standardize the features. The scaler is fitted on the training split only
# and then reused for the test split, so test statistics do not leak into
# the fitted transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 模型的建立與評估(隨機森林)

MSE 越低越好  
MAE 越低越好  
R-Squared, R² 越接近1越好


In [22]:

# Fit a random forest regressor on the scaled training features
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict on the scaled test features
y_pred_rf = model_rf.predict(X_test_scaled)

# Metrics commonly used for regression problems: mean squared error (MSE),
# mean absolute error (MAE) and the coefficient of determination (R²)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)

print(
    f'Random Forest Mean Squared Error: {mse_rf}',
    f'Random Forest Mean Absolute Error: {mae_rf}',
    f'Random Forest R^2 Score: {r2_rf}',
    sep='\n',
)

Random Forest Mean Squared Error: 2600842349.887111
Random Forest Mean Absolute Error: 32827.73381666667
Random Forest R^2 Score: 0.7966789124539675


### 模型的建立與評估(線性回歸)

In [23]:

# Fit an ordinary linear regression model on the scaled training features
model_lr = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the scaled test features
y_pred_lr = model_lr.predict(X_test_scaled)

# Evaluation metrics; rmse_lr is derived from the MSE and kept as a variable
# (it is not printed in this cell)
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
mae_lr = mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

print(
    f'Linear Regression Mean Squared Error: {mse_lr}',
    f'Linear Regression Mean Absolute Error: {mae_lr}',
    f'Linear Regression R^2 Score: {r2_lr}',
    sep='\n',
)

Linear Regression Mean Squared Error: 4745655369.276094
Linear Regression Mean Absolute Error: 49503.574244060896
Linear Regression R^2 Score: 0.6290079593475687


## ML監督學習 - 垃圾郵件分類（分類器）

### 載入套件

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


### 資料集前處理

In [27]:

# Read the tab-separated SMS file (it has no header row) and name its columns
data = pd.read_csv('dataset/ml_sms.tsv', sep='\t', header=None)
data.columns = ['label', 'message']

# Encode the text labels as integers: ham -> 0, spam -> 1
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Hold out 20% of the rows as the test set; the seed is fixed so the split
# is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

print(data.head())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


### 轉換文本資料特徵

轉換文字特徵為數值向量的過程涉及到  
將非結構化的文本資料轉換為模型可以理解的數值格式  
這通常稱為「特徵提取」或「向量化」。  
使用 CountVectorizer 套件是一種常用的方法，  
其原理是將文本數據轉換為詞袋（Bag-of-Words）表示形式。

In [28]:
# Feature extraction: convert the raw text into numeric count vectors.
# The vocabulary is learned from the training messages only.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
# Words in the test messages that never appeared in the training messages
# are ignored and do not affect the transformed result.
X_test_vec = vectorizer.transform(X_test)
# Print the sparse training matrix as (row, column) count entries
print(X_train_vec)

  (0, 5688)	1
  (0, 6889)	3
  (0, 7475)	1
  (0, 258)	1
  (0, 7397)	1
  (0, 7438)	1
  (0, 7472)	1
  (0, 6774)	1
  (0, 354)	1
  (0, 2806)	1
  (0, 7556)	1
  (0, 2107)	1
  (0, 1272)	1
  (0, 3365)	1
  (0, 5981)	1
  (0, 6461)	1
  (0, 695)	1
  (0, 2569)	1
  (0, 6000)	1
  (1, 7472)	1
  (1, 3370)	1
  (1, 6305)	1
  (1, 4855)	1
  (1, 4977)	1
  (1, 3604)	1
  :	:
  (4452, 7536)	1
  (4452, 2503)	1
  (4452, 7520)	1
  (4453, 1607)	1
  (4453, 6304)	1
  (4453, 4130)	1
  (4453, 3998)	1
  (4454, 2985)	1
  (4454, 3352)	1
  (4454, 5585)	1
  (4454, 5322)	1
  (4454, 6310)	1
  (4454, 6309)	1
  (4455, 6905)	1
  (4455, 4813)	1
  (4455, 3813)	1
  (4455, 5997)	1
  (4456, 3713)	1
  (4456, 3842)	1
  (4456, 2295)	1
  (4456, 3340)	1
  (4456, 5896)	1
  (4456, 6667)	1
  (4456, 6269)	1
  (4456, 4661)	1


### Naive Bayes classifier

In [30]:
# Train a multinomial Naive Bayes classifier on the vectorized training text
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the vectorized test text
y_pred = model.predict(X_test_vec)

# Compute the evaluation metrics
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report every metric on its own line, followed by the confusion matrix
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    'Confusion Matrix:',
    conf_matrix,
    sep='\n',
)


Accuracy: 0.9919282511210762
Precision: 1.0
Recall: 0.9395973154362416
F1 Score: 0.9688581314878892
Confusion Matrix:
[[966   0]
 [  9 140]]


### SVM分類器

In [29]:
from sklearn.svm import SVC

# Train a support vector classifier with a linear kernel
model_svm = SVC(kernel='linear').fit(X_train_vec, y_train)

# Predict labels for the vectorized test text
y_pred_svm = model_svm.predict(X_test_vec)

# Compute the evaluation metrics
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
precision = precision_score(y_true=y_test, y_pred=y_pred_svm)
recall = recall_score(y_true=y_test, y_pred=y_pred_svm)
f1 = f1_score(y_true=y_test, y_pred=y_pred_svm)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)

# Report every metric on its own line, followed by the confusion matrix
print(
    f'SVM Accuracy: {accuracy}',
    f'SVM Precision: {precision}',
    f'SVM Recall: {recall}',
    f'SVM F1 Score: {f1}',
    'Confusion Matrix:',
    conf_matrix,
    sep='\n',
)


SVM Accuracy: 0.989237668161435
SVM Precision: 1.0
SVM Recall: 0.9194630872483222
SVM F1 Score: 0.958041958041958
Confusion Matrix:
[[966   0]
 [ 12 137]]


### 隨機森林

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest classifier (100 trees, fixed seed)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_vec, y_train
)

# Predict labels for the vectorized test text
y_pred_rf = model_rf.predict(X_test_vec)

# Compute the evaluation metrics
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
precision = precision_score(y_true=y_test, y_pred=y_pred_rf)
recall = recall_score(y_true=y_test, y_pred=y_pred_rf)
f1 = f1_score(y_true=y_test, y_pred=y_pred_rf)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

# Report every metric on its own line, followed by the confusion matrix
print(
    f'Random Forest Accuracy: {accuracy}',
    f'Random Forest Precision: {precision}',
    f'Random Forest Recall: {recall}',
    f'Random Forest F1 Score: {f1}',
    'Confusion Matrix:',
    conf_matrix,
    sep='\n',
)


Random Forest Accuracy: 0.9847533632286996
Random Forest Precision: 1.0
Random Forest Recall: 0.8859060402684564
Random Forest F1 Score: 0.9395017793594307
Confusion Matrix:
[[966   0]
 [ 17 132]]


### Logistic回歸

In [33]:

from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier with default settings
model_lr = LogisticRegression().fit(X_train_vec, y_train)

# Predict labels for the vectorized test text
y_pred_lr = model_lr.predict(X_test_vec)

# Compute the evaluation metrics
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
precision = precision_score(y_true=y_test, y_pred=y_pred_lr)
recall = recall_score(y_true=y_test, y_pred=y_pred_lr)
f1 = f1_score(y_true=y_test, y_pred=y_pred_lr)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

# Report every metric on its own line, followed by the confusion matrix
print(
    f'Logistic Regression Accuracy: {accuracy}',
    f'Logistic Regression Precision: {precision}',
    f'Logistic Regression Recall: {recall}',
    f'Logistic Regression F1 Score: {f1}',
    'Confusion Matrix:',
    conf_matrix,
    sep='\n',
)

Logistic Regression Accuracy: 0.9883408071748879
Logistic Regression Precision: 1.0
Logistic Regression Recall: 0.912751677852349
Logistic Regression F1 Score: 0.9543859649122807
Confusion Matrix:
[[966   0]
 [ 13 136]]
