In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

file_path = "/Users/marchen/Desktop/programming/AI/Second week/file/titanic.csv"
df = pd.read_csv(file_path)

X = df.iloc[ :, :-1]
Y = LabelEncoder().fit_transform(df['Survived'])

print(df.head)
print(df.columns)

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

In [12]:
df["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [14]:
# 결측치 확인
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
# 결측치 제거
df = df.dropna()
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [18]:
# 레이블 확인
print(df.head)
print(df.columns)

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
1              2         1       0   
3              4         1       0   
6              7         0       0   
10            11         1       2   
11            12         1       0   
..           ...       ...     ...   
871          872         1       0   
872          873         0       0   
879          880         1       0   
887          888         1       0   
889          890         1       0   

                                                  Name  Sex  Age  SibSp  \
1    Cumings, Mrs. John Bradley (Florence Briggs Th...    0   32      1   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)    0   28      1   
6                              McCarthy, Mr. Timothy J    1   49      0   
10                     Sandstrom, Miss. Marguerite Rut    0    4      1   
11                            Bonnell, Miss. Elizabeth    0   53      0   
..                                                 ...  ...  ..

## Survived, Sex, Age, Fare만 데이터 프레임 생성

In [19]:
exclude_columns = ['PassengerId', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'SibSp'] # 제외할 columns
target_columns = [col for col in df.columns if col not in exclude_columns] # 인코딩할 columns

label_encoders = {} # 라벨 인코더 저장 딕셔너리

for col in target_columns : 
    label_encoders[col] = LabelEncoder() # LabelEncoder를 label_encoders[col]에 KeyError를 방지
    df[col] = label_encoders[col].fit_transform(df[col])

### 만든 데이터 프레임을 저장 후 출력

In [20]:
df = df.drop(columns = exclude_columns)

print(df)

     Survived  Pclass  Sex  Age  Fare
1           1       0    0   32    57
3           1       0    0   28    44
6           0       0    1   49    41
10          1       2    0    4    10
11          1       0    0   53    17
..        ...     ...  ...  ...   ...
871         1       0    0   42    43
872         0       0    1   26     1
879         1       0    0   51    68
887         1       0    0   12    23
889         1       0    1   18    23

[183 rows x 5 columns]


In [29]:
df['Survived'].value_counts()

Survived
1    123
0     60
Name: count, dtype: int64

### 결측치 제거하기 전 df['Survived'].value_counts()
Survived 
0    549 
1    342 
Name: count, dtype: int64

### 결측치 제거한 후 df['Survived'].value_counts()
Survived
1    123
0    60
Name: count, dtype: int64

### 모델에 넣을 test, train 생성

In [22]:
X = df.iloc[ :, :-1] # X에 새로운 데이터 입력
Y = df['Survived']

X_test, X_train, Y_test, Y_train = train_test_split(X, Y, test_size = 0.3, random_state = 40)

In [23]:
# DT
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# DT 모델 학습
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)

# 예측 및 평가
Y_pred = dt_model.predict(X_test)
DT_accuracy = accuracy_score(Y_test, Y_pred)
print(f"DT  예측값 : {Y_pred[:5]}")
print(f"DT  정확도 : {DT_accuracy : .4f}\n")

DT  예측값 : [0 1 1 1 1]
DT  정확도 :  1.0000



In [24]:
# RF
from sklearn.ensemble import RandomForestClassifier

# RF 모델 학습
rf_model = RandomForestClassifier()
rf_model.fit(X_train, Y_train)

# 예측 및 평가
Y_pred = rf_model.predict(X_test)
RF_accuracy = accuracy_score(Y_test, Y_pred)
print(f"RF  예측값 : {Y_pred[:5]}")
print(f"RF  정확도 : {RF_accuracy : .4f}\n")

RF  예측값 : [0 1 1 1 1]
RF  정확도 :  1.0000



In [25]:
# SVM
from sklearn.svm import SVC

# SVM 모델 학습
svm_model = SVC()
svm_model.fit(X_train, Y_train)

# 예측 및 평가
Y_pred = svm_model.predict(X_test)
SVM_accuracy = accuracy_score(Y_test, Y_pred)
print(f"SVM 예측값 : {Y_pred[:5]}")
print(f"SVM 정확도 : {SVM_accuracy : .4f}\n")

SVM 예측값 : [1 1 1 1 1]
SVM 정확도 :  0.6875



In [26]:
# LF
from sklearn.linear_model import LogisticRegression

# LF 모델 학습
lr_model = LogisticRegression(max_iter = 200)
lr_model.fit(X_train, Y_train)

# 예측 및 평가
Y_pred = lr_model.predict(X_test)
LF_accuracy = accuracy_score(Y_test, Y_pred)
print(f"LF  예측값 : {Y_pred[:5]}")
print(f"LF  정확도 : {LF_accuracy : .4f}\n")

LF  예측값 : [0 1 1 1 1]
LF  정확도 :  1.0000



In [27]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# KNN 모델 학습
knn_model = KNeighborsClassifier(n_neighbors=5)  # k값을 5로 설정
knn_model.fit(X_train, Y_train)

# 예측 및 평가
Y_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(Y_test, Y_pred)

print(f"KNN 예측값 : {Y_pred[:5]}")
print(f"KNN 정확도 : {knn_accuracy:.4f}")

KNN 예측값 : [1 0 1 1 1]
KNN 정확도 : 0.7734
