# Step 3: Xây dựng mô hình dự đoán

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [16]:
# Load the raw healthcare records from the CSV file in the working directory
# into the DataFrame every later cell works on.
df = pd.read_csv(filepath_or_buffer="healthcare_dataset.csv")

In [17]:
# Columns removed from the frame before modelling.
columns_drop = [
    'Age',
    'Name',
    'Date of Admission',
    'Hospital',
    'Doctor',
    'Insurance Provider',
    'Room Number',
    'Discharge Date',
]
# In-place drop: `df` itself is modified, so this cell raises a KeyError if it
# is executed a second time on the same kernel state.
df.drop(columns=columns_drop, inplace=True)

# Every remaining column except the target ('Test Results') and the numeric
# 'Billing Amount' is kept as a feature name for the one-hot encoding step.
non_categorical = ('Test Results', 'Billing Amount')
features = [column for column in df.columns if column not in non_categorical]
df

Unnamed: 0,Gender,Blood Type,Medical Condition,Billing Amount,Admission Type,Medication,Test Results
0,Female,O-,Diabetes,37490.983364,Elective,Aspirin,Inconclusive
1,Male,O+,Asthma,47304.064845,Emergency,Lipitor,Normal
2,Male,B-,Obesity,36874.896997,Emergency,Lipitor,Normal
3,Male,B-,Asthma,23303.322092,Urgent,Penicillin,Abnormal
4,Male,O-,Arthritis,18086.344184,Urgent,Paracetamol,Normal
...,...,...,...,...,...,...,...
9995,Male,A+,Obesity,39606.840083,Elective,Ibuprofen,Abnormal
9996,Female,AB+,Arthritis,5995.717488,Emergency,Ibuprofen,Normal
9997,Male,B-,Arthritis,49559.202905,Elective,Ibuprofen,Normal
9998,Male,A+,Arthritis,25236.344761,Urgent,Penicillin,Normal


#### Mã hóa one-hot dữ liệu kiểu category

In [18]:
# One-hot encode every categorical feature. Each category becomes a new 0.0/1.0
# column on `df`, named after the category label itself. The original
# categorical columns are left in place by this cell.
enc = OneHotEncoder(handle_unknown='ignore')

# Names that a generated indicator column must never overwrite: the source
# feature columns plus the two non-categorical columns excluded from `features`.
reserved_columns = set(features) | {'Test Results', 'Billing Amount'}
# Maps each generated column name to the feature that produced it, so a label
# shared by two features is reported instead of silently replacing the
# indicator column written for the earlier feature.
column_owner = {}

for feature in features:
    # The encoder is re-fitted for each feature, so categories_[0] holds the
    # labels of the feature currently being processed, in column order.
    df_enc = enc.fit_transform(df[[feature]]).toarray()
    for i, category in enumerate(enc.categories_[0]):
        if category in reserved_columns:
            raise ValueError(
                f"Category {category!r} of feature {feature!r} has the same "
                f"name as an existing data column and would overwrite it"
            )
        owner = column_owner.setdefault(category, feature)
        if owner != feature:
            raise ValueError(
                f"Category {category!r} appears in both {owner!r} and "
                f"{feature!r}; their one-hot columns would overwrite each other"
            )
        df[category] = df_enc[:, i]
df

Unnamed: 0,Gender,Blood Type,Medical Condition,Billing Amount,Admission Type,Medication,Test Results,Female,Male,A+,...,Hypertension,Obesity,Elective,Emergency,Urgent,Aspirin,Ibuprofen,Lipitor,Paracetamol,Penicillin
0,Female,O-,Diabetes,37490.983364,Elective,Aspirin,Inconclusive,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Male,O+,Asthma,47304.064845,Emergency,Lipitor,Normal,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Male,B-,Obesity,36874.896997,Emergency,Lipitor,Normal,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Male,B-,Asthma,23303.322092,Urgent,Penicillin,Abnormal,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,Male,O-,Arthritis,18086.344184,Urgent,Paracetamol,Normal,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Male,A+,Obesity,39606.840083,Elective,Ibuprofen,Abnormal,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9996,Female,AB+,Arthritis,5995.717488,Emergency,Ibuprofen,Normal,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
9997,Male,B-,Arthritis,49559.202905,Elective,Ibuprofen,Normal,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9998,Male,A+,Arthritis,25236.344761,Urgent,Penicillin,Normal,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [19]:
# Remove the raw categorical columns listed in `features`; `df` itself is not
# modified, the result is a new frame.
X = df.drop(labels=features, axis="columns")

In [20]:
# Rescale 'Billing Amount' with a min-max scaler (default settings).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the test rows influence the scaling parameters.
billing_column = ['Billing Amount']
scaler = MinMaxScaler()
scaler.fit(X[billing_column])
X['Billing Amount'] = scaler.transform(X[billing_column])

In [21]:
# Pull the target column out as a NumPy array, then turn the remaining columns
# into the feature matrix. `X` is rebound from a DataFrame to an ndarray here,
# so this cell cannot be re-run without re-running the cells above it.
y = X.loc[:, 'Test Results'].to_numpy()
X = X.drop('Test Results', axis='columns').to_numpy()
X

array([[0.74477529, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.94505974, 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.732201  , 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.99108699, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.49465878, 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.73932547, 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ]])

In [22]:
# Encode the target labels as numeric codes. The encoder needs a 2-D column
# as input, so the labels are reshaped to (n, 1) and the result is flattened
# back to a 1-D array afterwards. `enc` is rebound to this new encoder.
enc = OrdinalEncoder()
y_column = y.reshape(-1, 1)
y = enc.fit_transform(y_column).ravel()

In [23]:
# Hold out part of the rows for evaluation: 80% of the samples go to the
# training set, the rest to the test set. The fixed seed makes the shuffle
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    train_size=0.8,
)

In [24]:
# Support-vector classifier with a sigmoid kernel and regularisation
# parameter C = 1000, trained on the training split. The fit call stays a bare
# expression so the notebook still displays the fitted estimator.
clf = SVC(kernel='sigmoid', C=1000.0)
clf.fit(X=X_train, y=y_train)

In [25]:
# Predict the encoded target for every row of the held-out test set.
y_pred = clf.predict(X=X_test)

In [26]:
# Wrap the predictions and the true test labels in pandas arrays for the
# side-by-side table below. `y_test` is left untouched.
y_pred, y_true = pd.array(y_pred), pd.array(y_test)

In [27]:
# Show the true label next to the predicted label for each test row.
pd.DataFrame(dict(y=y_true, y_predict=y_pred))

Unnamed: 0,y,y_predict
0,1.0,0.0
1,2.0,0.0
2,1.0,1.0
3,1.0,0.0
4,0.0,2.0
...,...,...
1995,1.0,0.0
1996,0.0,0.0
1997,0.0,2.0
1998,0.0,2.0


In [28]:
# Report the share of test rows whose predicted label equals the true label,
# as a percentage.
accuracy_percent = accuracy_score(y_test, y_pred) * 100
print(f'Ti le chinh xac: {accuracy_percent} %')

Ti le chinh xac: 34.4 %
