In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from graphviz import Source

from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import tree

**設定檔案名**

In [8]:
FILENAME = f"data/{input('File Name (in data/)')}"
# FILENAME = f"data/processed_data.csv"
FILENAME

'data/202404022239_6e+06.csv'

In [19]:
rename_mark = { "220045" : "Heart Rate"                           , \
                "220210" : "Respiratory Rate"                     , \
                "220179" : "Non Invasive Blood Pressure systolic" , \
                "220180" : "Non Invasive Blood Pressure diastolic", \
                "220277" : "O2 saturation pulseoxymetry"          }

**載入資料**

In [20]:
df = pd.read_csv(FILENAME, engine='python')
df.rename(columns=rename_mark, inplace=True)

**分割測試集和訓練集**

In [21]:
# scale = StandardScaler()
df_spl, df_label_spl = df.drop(columns="label"), df["label"]
# df_spl = pd.DataFrame(scale.fit_transform(df_spl), columns=df_spl.keys())
X_train, X_test, y_train, y_test = train_test_split(df_spl, df_label_spl, test_size=0.2, stratify=df_label_spl)

**訓練集和測試集的資料數**

In [22]:
len(X_train), len(X_test)

(1304, 326)

**定義測試模型用函數**

In [23]:
def test_mse(m):
    m.fit(X_train, y_train)
    a = m.predict(X_test)
    b = m.predict(X_train)
    A_score = mean_squared_error(a.astype('int'), y_test.astype('int'))
    B_score = mean_squared_error(b.astype('int'), y_train.astype('int'))
    return (round(A_score, 4), round(B_score, 4))
def test_model(m):
    A = m
    A.fit(X_train, y_train)
    y_pred = A.predict(X_test)
    cm=confusion_matrix(y_test,y_pred)
    print(cm)
    print(round(A.score(X_test, y_test), 2))
    print(test_mse(m))

**隨機森林**

In [24]:
rfc = RandomForestClassifier(
    n_estimators=200, \
    max_depth=5, \
    max_leaf_nodes= 20
)
test_model(rfc)

[[137  26]
 [ 48 115]]
0.77
(0.2239, 0.1879)


**繪製樹的模型**

In [25]:
features = df.drop(columns="label").keys()
dot_data = tree.export_graphviz(rfc[0], feature_names=features)
graph = Source(dot_data)
graph.render(view=True, format="pdf", filename=f"../Charts/forest{FILENAME}")

'../Charts/forestdata/202404022239_6e+06.csv.pdf'

**支持向量機**

In [26]:
clf = SVC(kernel='linear')
test_model(clf)

[[130  33]
 [ 60 103]]
0.71
(0.2853, 0.2531)


**邏輯回歸**

In [27]:
model=linear_model.LogisticRegression()
test_model(model)

[[123  40]
 [ 55 108]]
0.71
(0.2914, 0.2776)
