## Import Libraries

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## Load Datasets and Display Index Information

In [2]:
train_df = pd.read_csv('/kaggle/input/competition-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/competition-titanic/test.csv')

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Computing Each Data Column

### Pclass

In [5]:
train_df['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

Reference

* `pd.get_dummies()`
  * [[pandas] pd.get_dummies() : 데이터전처리/가변수 만들기](https://devuna.tistory.com/67)

In [6]:
pclass_train_dummies = pd.get_dummies(train_df['Pclass'])
pclass_test_dummies = pd.get_dummies(test_df['Pclass'])

pclass_train_dummies.columns = ['1', '2', '3']
pclass_test_dummies.columns = ['1', '2', '3']

train_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

train_df = train_df.join(pclass_train_dummies)
test_df = test_df.join(pclass_test_dummies)

### Name

이름에서 추출할 수 있는 정보가 있는 것으로 보이나 일단 이번엔 드랍

In [7]:
train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

### Sex

In [8]:
train_df['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [9]:
test_df['Sex'].value_counts()

Sex
male      266
female    152
Name: count, dtype: int64

In [10]:
sex_train_dummies = pd.get_dummies(train_df['Sex'])
sex_test_dummies = pd.get_dummies(test_df['Sex'])

sex_train_dummies.columns = ['Female', 'Male']
sex_test_dummies.columns = ['Female', 'Male']

train_df.drop(['Sex'], axis=1, inplace=True)
test_df.drop(['Sex'], axis=1, inplace=True)

train_df = train_df.join(sex_train_dummies)
test_df = test_df.join(sex_test_dummies)

### Age

In [11]:
train_df['Age'].value_counts()

Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [12]:
test_df['Age'].value_counts()

Age
21.0    17
24.0    17
22.0    16
30.0    15
18.0    13
        ..
76.0     1
28.5     1
22.5     1
62.0     1
38.5     1
Name: count, Length: 79, dtype: int64

In [13]:
train_df['Age'].fillna(train_df['Age'].median() , inplace=True)
test_df['Age'].fillna(train_df['Age'].median() , inplace=True)

### SibSp, Parach
전체 가족 규모 열 생성

In [14]:
for row in train_df, test_df:
    row['FamilySize'] = row['SibSp'] + row['Parch'] + 1

### Ticket

티켓 정보에서 추출할 수 있는 정보가 있는 것으로 보이나 일단 이번엔 드랍

In [15]:
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)

### Fare

In [16]:
train_df['Fare'].value_counts()

Fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
35.0000     1
28.5000     1
6.2375      1
14.0000     1
10.5167     1
Name: count, Length: 248, dtype: int64

In [17]:
test_df['Fare'].value_counts()

Fare
7.7500     21
26.0000    19
13.0000    17
8.0500     17
7.8958     11
           ..
7.8208      1
8.5167      1
78.8500     1
52.0000     1
22.3583     1
Name: count, Length: 169, dtype: int64

In [18]:
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

### Cabin

In [19]:
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)

### Embarked

In [20]:
train_df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [21]:
test_df['Embarked'].value_counts()

Embarked
S    270
C    102
Q     46
Name: count, dtype: int64

In [22]:
train_df['Embarked'].fillna('S', inplace=True)
test_df['Embarked'].fillna('S', inplace=True)

In [23]:
embarked_train_dummies = pd.get_dummies(train_df['Embarked'])
embarked_test_dummies = pd.get_dummies(test_df['Embarked'])

embarked_train_dummies.columns = ['S', 'C', 'Q']
embarked_test_dummies.columns = ['S', 'C', 'Q']

train_df.drop(['Embarked'], axis=1, inplace=True)
test_df.drop(['Embarked'], axis=1, inplace=True)

train_df = train_df.join(embarked_train_dummies)
test_df = test_df.join(embarked_test_dummies)

## Prepare Data for Training

In [24]:
print('check the nan value in train data')
print(train_df.isnull().sum())
print('___'*30)
print('check the nan value in test data')
print(test_df.isnull().sum())

check the nan value in train data
PassengerId    0
Survived       0
Age            0
SibSp          0
Parch          0
Fare           0
1              0
2              0
3              0
Female         0
Male           0
FamilySize     0
S              0
C              0
Q              0
dtype: int64
__________________________________________________________________________________________
check the nan value in test data
PassengerId    0
Age            0
SibSp          0
Parch          0
Fare           0
1              0
2              0
3              0
Female         0
Male           0
FamilySize     0
S              0
C              0
Q              0
dtype: int64


In [25]:
X_train = train_df.drop(['PassengerId', 'Survived'], axis=1)
Y_train = train_df['Survived']
X_test  = test_df.drop('PassengerId', axis=1).copy()

In [26]:
X_train.value_counts()

Age   SibSp  Parch  Fare      1      2      3      Female  Male   FamilySize  S      C      Q    
28.0  0      0      7.8958    False  False  True   False   True   1           False  False  True     15
                    8.0500    False  False  True   False   True   1           False  False  True     12
                    7.7500    False  False  True   False   True   1           False  True   False    10
                                                   True    False  1           False  True   False     8
                    7.2292    False  False  True   False   True   1           True   False  False     6
                                                                                                     ..
24.0  2      1      27.0000   False  True   False  True    False  4           False  False  True      1
             3      18.7500   False  True   False  True    False  6           False  False  True      1
      3      2      263.0000  True   False  False  True    False  6   

In [27]:
Y_train.value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

## Compute Machine Learning Algorithm

In [28]:
X_train

Unnamed: 0,Age,SibSp,Parch,Fare,1,2,3,Female,Male,FamilySize,S,C,Q
0,22.0,1,0,7.2500,False,False,True,False,True,2,False,False,True
1,38.0,1,0,71.2833,True,False,False,True,False,2,True,False,False
2,26.0,0,0,7.9250,False,False,True,True,False,1,False,False,True
3,35.0,1,0,53.1000,True,False,False,True,False,2,False,False,True
4,35.0,0,0,8.0500,False,False,True,False,True,1,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,False,True,False,False,True,1,False,False,True
887,19.0,0,0,30.0000,True,False,False,True,False,1,False,False,True
888,28.0,1,2,23.4500,False,False,True,True,False,4,False,False,True
889,26.0,0,0,30.0000,True,False,False,False,True,1,True,False,False


In [29]:
X_test

Unnamed: 0,Age,SibSp,Parch,Fare,1,2,3,Female,Male,FamilySize,S,C,Q
0,34.5,0,0,7.8292,False,False,True,False,True,1,False,True,False
1,47.0,1,0,7.0000,False,False,True,True,False,2,False,False,True
2,62.0,0,0,9.6875,False,True,False,False,True,1,False,True,False
3,27.0,0,0,8.6625,False,False,True,False,True,1,False,False,True
4,22.0,1,1,12.2875,False,False,True,True,False,3,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,0,8.0500,False,False,True,False,True,1,False,False,True
414,39.0,0,0,108.9000,True,False,False,True,False,1,True,False,False
415,38.5,0,0,7.2500,False,False,True,False,True,1,False,False,True
416,28.0,0,0,8.0500,False,False,True,False,True,1,False,False,True


In [30]:
Y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

**참조**
```
ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
```
train 데이터와 test 데이터 사이즈가 서로 맞지 않으면 발생

In [31]:
# Logistic Regression

logreg = LogisticRegression(max_iter=5000)
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

logreg.score(X_train, Y_train)

0.8058361391694725

In [32]:
# Support Vector Machines

svc = SVC()

svc.fit(X_train, Y_train)

Y_pred = svc.predict(X_test)

svc.score(X_train, Y_train)

0.6879910213243546

In [33]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

0.9797979797979798

In [34]:
# K-nearest-neighbor

knn = KNeighborsClassifier(n_neighbors = 3)

knn.fit(X_train, Y_train)

Y_pred = knn.predict(X_test)

knn.score(X_train, Y_train)

0.8417508417508418

## Build Submission File

In [35]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': Y_pred
})
submission.to_csv('competition-titanic__submission.csv', index=False)