In [161]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 엑셀 데이터 로드 함수
def load_data(file_path):
    x = df.iloc[:, df.columns != 'Survived']  # 마지막 열을 제외한 특성 데이터
    y = df['Survived']   # 마지막 열(타겟 데이터)
    return x, y

# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# Titanic 데이터셋을 로컬 파일로 불러오기
url = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/refs/heads/main/titanic.csv"
response = requests.get(url)

with open('titanic.csv', 'wb') as file:
    file.write(response.content)

file_path = 'titanic.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [163]:
# 'Name', 'Ticket' 컬럼 삭제 (삭제할 컬럼을 리스트로 지정)
df.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], inplace=True)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


In [165]:
# 모든 컬럼에 대해 LabelEncoder를 적용 (이 경우에는 Sex만 변경)
for column in df.columns:
    if df[column].dtype == 'object':  # 범주형 컬럼만 변환
        df[column] = label_encoder.fit_transform(df[column])

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22.0,1,0,7.25
1,2,1,1,0,38.0,1,0,71.2833
2,3,1,3,0,26.0,0,0,7.925
3,4,1,1,0,35.0,1,0,53.1
4,5,0,3,1,35.0,0,0,8.05


In [167]:
# 'Survived' 열의 개수 확인
survived_count = df['Survived'].value_counts()

# 결과 출력
print("Survived 개수:")
print(survived_count)

Survived 개수:
Survived
0    549
1    342
Name: count, dtype: int64


In [169]:
# 결측치 확인
missing_values = df.isnull().sum()

print("\n결측치 개수:")
print(missing_values)



결측치 개수:
PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
dtype: int64


In [171]:
# 'Age' 컬럼의 결측치를 평균값으로 채우기
df['Age'] = df['Age'].fillna(df['Age'].mean())

# 결측치 확인
missing_values = df.isnull().sum()

print("\n결측치 개수:")
print(missing_values)


결측치 개수:
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
dtype: int64


In [173]:
#최종결과 head
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22.0,1,0,7.25
1,2,1,1,0,38.0,1,0,71.2833
2,3,1,3,0,26.0,0,0,7.925
3,4,1,1,0,35.0,1,0,53.1
4,5,0,3,1,35.0,0,0,8.05


In [175]:
x, y = load_data(file_path)  # 데이터 로드

In [177]:
# 데이터를 훈련 세트와 테스트 세트로 분할 (80% 훈련, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [209]:
# 데이터 정규화 (평균 0, 표준편차 1로 변환)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [211]:
# Decision Tree (의사결정나무 분류 모델)
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_scaled, y_train)  # 모델 학습

dt_pred = dt_model.predict(X_test_scaled)  # 테스트 데이터 예측
dt_acc = accuracy_score(y_test, dt_pred)  # 정확도 평가
print(f'Decision Tree Accuracy: {dt_acc:.4f}\n')
print(classification_report(y_test, dt_pred), '\n')
print(confusion_matrix(y_test, dt_pred))

Decision Tree Accuracy: 0.7542

              precision    recall  f1-score   support

           0       0.77      0.84      0.80       105
           1       0.73      0.64      0.68        74

    accuracy                           0.75       179
   macro avg       0.75      0.74      0.74       179
weighted avg       0.75      0.75      0.75       179
 

[[88 17]
 [27 47]]


In [213]:
# Logistic Regression (로지스틱 회귀 분류 모델)
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
lr_acc = accuracy_score(y_test, lr_pred)
print(f'Logistic Regression Accuracy: {lr_acc:.4f}\n')
print(classification_report(y_test, lr_pred), '\n')
print(confusion_matrix(y_test, lr_pred))

Logistic Regression Accuracy: 0.8101

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179
 

[[92 13]
 [21 53]]


In [215]:
# SVM (서포트 벡터 머신 분류 모델)
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)
svm_acc = accuracy_score(y_test, svm_pred)
print(f'SVM Accuracy: {svm_acc:.4f}\n')
print(classification_report(y_test, svm_pred), '\n')
print(confusion_matrix(y_test, svm_pred))

SVM Accuracy: 0.8101

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       105
           1       0.79      0.73      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179
 

[[91 14]
 [20 54]]


In [217]:
# Random Forest (랜덤 포레스트 분류 모델)
from sklearn.ensemble import RandomForestClassifier

# 모델 학습
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# 예측 및 평가
rf_pred = rf_model.predict(X_test_scaled)
rf_acc = accuracy_score(y_test, rf_pred)
print(f'Random Forest Accuracy: {rf_acc:.4f}\n')
print(classification_report(y_test, rf_pred), '\n')
print(confusion_matrix(y_test, rf_pred))

Random Forest Accuracy: 0.8212

              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179
 

[[93 12]
 [20 54]]


In [219]:
from sklearn.neighbors import KNeighborsClassifier

# 최적의 K 값 찾기
k_values = range(1, 21)  # 1부터 20까지 K 값 테스트
accuracies = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"K가 {k}일 때, 정확도: {best_accuracy:.4f}")
    accuracies.append(acc)

# 최적 K 값 출력
best_k = k_values[np.argmax(accuracies)]
best_accuracy = max(accuracies)
print(f"최적의 K 값: {best_k}, 정확도: {best_accuracy:.4f}")

# 최적 K 값으로 최종 모델 학습
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# 예측 및 평가
accuracy = accuracy_score(y_test, y_pred)
print(f"최종 모델 (K={best_k}) 정확도: {accuracy:.4f}")

K가 1일 때, 정확도: 0.6872
K가 2일 때, 정확도: 0.6872
K가 3일 때, 정확도: 0.6872
K가 4일 때, 정확도: 0.6872
K가 5일 때, 정확도: 0.6872
K가 6일 때, 정확도: 0.6872
K가 7일 때, 정확도: 0.6872
K가 8일 때, 정확도: 0.6872
K가 9일 때, 정확도: 0.6872
K가 10일 때, 정확도: 0.6872
K가 11일 때, 정확도: 0.6872
K가 12일 때, 정확도: 0.6872
K가 13일 때, 정확도: 0.6872
K가 14일 때, 정확도: 0.6872
K가 15일 때, 정확도: 0.6872
K가 16일 때, 정확도: 0.6872
K가 17일 때, 정확도: 0.6872
K가 18일 때, 정확도: 0.6872
K가 19일 때, 정확도: 0.6872
K가 20일 때, 정확도: 0.6872
최적의 K 값: 19, 정확도: 0.6872
최종 모델 (K=19) 정확도: 0.6872
