In [6]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset and split it into a feature matrix and a target vector
def load_data(file_path, data=None):
    """Split a dataset into features (all but the last column) and target (last column).

    Parameters
    ----------
    file_path : str or path-like
        CSV file that is read when no DataFrame is available (see ``data``).
    data : pandas.DataFrame, optional
        Frame to split. When omitted, the notebook-level ``df`` is used if it
        exists, so the existing call ``load_data(file_path)`` keeps returning
        the frame that earlier cells already preprocessed. Only when no ``df``
        is defined is the CSV at ``file_path`` read.

    Returns
    -------
    x : pandas.DataFrame
        Every column except the last one.
    y : pandas.Series
        The last column.
    """
    if data is None:
        # Backward compatibility: the original implementation always read the
        # module-level `df`; prefer it when present instead of raising NameError.
        data = globals().get('df')
    if data is None:
        data = pd.read_csv(file_path)

    x = data.iloc[:, :-1]  # feature columns: everything but the last column
    y = data.iloc[:, -1]   # target: the last column
    return x, y

# Dataset location (a relative path).
file_path = 'car_evaluation.csv'

# NOTE(review): the column names displayed below ('vhigh', 'vhigh.1', '2',
# '2.1', ...) look like data values rather than field names -- the CSV may
# have no header row, in which case its first record is consumed as the
# header. Confirm against the file before relying on row counts.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [16]:
# Tally how many rows carry each distinct value of the 'unacc' column.
unacc_count = df.loc[:, 'unacc'].value_counts()

# Print the caption and the tally on separate lines.
print("unacc 개수:", unacc_count, sep="\n")

unacc 개수:
unacc
2    1209
0     384
1      69
3      65
Name: count, dtype: int64


In [8]:
# Count the missing entries in every column.
missing_values = df.isna().sum()

# Caption (preceded by a blank line) followed by the per-column counts.
print("\n결측치 개수:", missing_values, sep="\n")


결측치 개수:
vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64


In [10]:
# Drop every row that contains at least one missing value, then re-count
# the missing entries per column to confirm none remain.
df = df.dropna(axis=0, how='any')
missing_values = df.isna().sum()

# Caption (preceded by a blank line) followed by the per-column counts.
print("\n결측치 개수:", missing_values, sep="\n")


결측치 개수:
vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64


In [12]:
# Label-encode every categorical (object-dtype) column of `df` in place.
# The encoder is created here: it was previously referenced without ever
# being defined, so this cell raised NameError on a fresh kernel.
for column in df.columns:
    if df[column].dtype == 'object':  # only convert categorical columns
        # A fresh encoder per column; fit_transform refits on each call, so
        # after the loop `label_encoder` holds the mapping of the last
        # encoded column only.
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column])

df.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,3,3,0,0,2,2,2
1,3,3,0,0,2,0,2
2,3,3,0,0,1,1,2
3,3,3,0,0,1,2,2
4,3,3,0,0,1,0,2


In [14]:
# Tally how many rows carry each distinct value of the 'unacc' column.
unacc_count = df.loc[:, 'unacc'].value_counts()

# Print the caption and the tally on separate lines.
print("unacc 개수:", unacc_count, sep="\n")

unacc 개수:
unacc
2    1209
0     384
1      69
3      65
Name: count, dtype: int64


In [18]:
# Obtain the feature matrix (x) and the target vector (y).
x, y = load_data(file_path=file_path)

In [50]:
# Hold out 20% of the rows for testing (80% train / 20% test).
# The split is seeded, and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [52]:
# Standardize the features (zero mean, unit standard deviation).
# The scaler is fitted on the training split only and then applied to both
# splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
# Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree can differ between runs even on identical training data.
# 42 matches the seed already used for the train/test split and the
# Random Forest.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)  # train the model

dt_pred = dt_model.predict(X_test_scaled)  # predict on the test split
dt_acc = accuracy_score(y_test, dt_pred)   # overall accuracy
print(f'Decision Tree Accuracy: {dt_acc:.4f}\n')
print(classification_report(y_test, dt_pred), '\n')
print(confusion_matrix(y_test, dt_pred))

Decision Tree Accuracy: 0.9913

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        77
           1       0.93      1.00      0.97        14
           2       1.00      1.00      1.00       242
           3       1.00      0.92      0.96        13

    accuracy                           0.99       346
   macro avg       0.98      0.98      0.98       346
weighted avg       0.99      0.99      0.99       346
 

[[ 76   1   0   0]
 [  0  14   0   0]
 [  1   0 241   0]
 [  1   0   0  12]]


In [78]:
# Logistic Regression classifier
from sklearn.linear_model import LogisticRegression

# Build and train in one step (fit returns the estimator itself).
lr_model = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)

# Predict on the test split and score the predictions.
lr_pred = lr_model.predict(X_test_scaled)
lr_acc = accuracy_score(y_true=y_test, y_pred=lr_pred)

print(f'Logistic Regression Accuracy: {lr_acc:.4f}\n')
# zero_division=1: a metric that would divide by zero (e.g. precision for a
# class that is never predicted) is reported as 1.
print(classification_report(y_test, lr_pred, zero_division=1), '\n')
print(confusion_matrix(y_test, lr_pred))

Logistic Regression Accuracy: 0.6821

              precision    recall  f1-score   support

           0       0.26      0.08      0.12        77
           1       1.00      0.00      0.00        14
           2       0.73      0.94      0.82       242
           3       0.18      0.15      0.17        13

    accuracy                           0.68       346
   macro avg       0.54      0.29      0.28       346
weighted avg       0.62      0.68      0.61       346
 

[[  6   0  63   8]
 [  1   0  13   0]
 [ 13   0 228   1]
 [  3   0   8   2]]


In [72]:
# Support Vector Machine classifier
from sklearn.svm import SVC

# Build with default settings and train in one step (fit returns the
# estimator itself).
svm_model = SVC().fit(X_train_scaled, y_train)

# Predict on the test split and score the predictions.
svm_pred = svm_model.predict(X_test_scaled)
svm_acc = accuracy_score(y_true=y_test, y_pred=svm_pred)

print(f'SVM Accuracy: {svm_acc:.4f}\n')
print(classification_report(y_test, svm_pred), '\n')
print(confusion_matrix(y_test, svm_pred))

SVM Accuracy: 0.9133

              precision    recall  f1-score   support

           0       0.75      0.91      0.82        77
           1       1.00      0.14      0.25        14
           2       0.97      0.97      0.97       242
           3       0.91      0.77      0.83        13

    accuracy                           0.91       346
   macro avg       0.91      0.70      0.72       346
weighted avg       0.92      0.91      0.90       346
 

[[ 70   0   6   1]
 [ 12   2   0   0]
 [  8   0 234   0]
 [  3   0   0  10]]


In [60]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Train a seeded forest of 100 trees (fit returns the estimator itself).
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predict on the test split and evaluate.
rf_pred = rf_model.predict(X_test_scaled)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)

print(f'Random Forest Accuracy: {rf_acc:.4f}\n')
print(classification_report(y_test, rf_pred), '\n')
print(confusion_matrix(y_test, rf_pred))

Random Forest Accuracy: 0.9855

              precision    recall  f1-score   support

           0       0.96      0.97      0.97        77
           1       1.00      1.00      1.00        14
           2       0.99      0.99      0.99       242
           3       1.00      0.92      0.96        13

    accuracy                           0.99       346
   macro avg       0.99      0.97      0.98       346
weighted avg       0.99      0.99      0.99       346
 

[[ 75   0   2   0]
 [  0  14   0   0]
 [  2   0 240   0]
 [  1   0   0  12]]
