# Modeling

In [1]:
from function import *

In [2]:
# load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### 1. Preprocessing

In [3]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,0.0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.45,,0.0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,1.0
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,2.0


In [4]:
train.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [5]:
# Null값 처리
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())

train['Embarked'] = train['Embarked'].fillna(train['Embarked'].max())
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].max())

train['Fare'] = train['Fare'].fillna(train['Fare'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [6]:
# Check VIF
train_vif = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis = 1)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(train_vif.values, i) for i in range(train_vif.shape[1])]
vif["features"] = train_vif.columns
vif

Unnamed: 0,VIF Factor,features
0,4.004062,Pclass
1,1.657283,Sex
2,3.975274,Age
3,1.563103,SibSp
4,1.631716,Parch
5,1.76964,Fare
6,1.398586,Embarked


### 2. Select columns & Encoding

In [7]:
y = train['Survived']
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis = 1)
submission_id = test['PassengerId']
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)

In [8]:
# one-hot encoding

# pclass
pclass = pd.get_dummies(train.Pclass)
pclass.rename(columns={1: 'pclass1', 2: 'pclass2', 3: 'pclass3'}, inplace=True)
train = pd.concat([train, pclass], axis=1)
train = train.drop(['Pclass'], axis=1)

# sibsp
sibsp = pd.get_dummies(train.SibSp)
sibsp.rename(columns={0: 'sibsp0', 1: 'sibsp1', 2: 'sibsp2', 3: 'sibsp3', 4: 'sibsp4', 5: 'sibsp5', 8:'sibsp8'}, inplace=True)
train = pd.concat([train, sibsp], axis=1)
train = train.drop(['SibSp'], axis=1)

# parch
parch = pd.get_dummies(train.Parch)
parch.rename(columns={0: 'parch0', 1: 'parch1', 2: 'parch2', 3: 'parch3', 4: 'parch4', 5: 'parch5', 6:'parch6'}, inplace=True)
train = pd.concat([train, parch], axis=1)
train = train.drop(['Parch'], axis=1)

# embarked
embarked = pd.get_dummies(train.Embarked)
embarked.rename(columns={0: 'embarked0', 1: 'embarked1', 2: 'embarked2'}, inplace=True)
train = pd.concat([train, embarked], axis=1)
train = train.drop(['Embarked'], axis=1)

In [9]:
# one-hot encoding

# pclass
pclass = pd.get_dummies(test.Pclass)
pclass.rename(columns={1: 'pclass1', 2: 'pclass2', 3: 'pclass3'}, inplace=True)
test = pd.concat([test, pclass], axis=1)
test = test.drop(['Pclass'], axis=1)

# sibsp
sibsp = pd.get_dummies(test.SibSp)
sibsp.rename(columns={0: 'sibsp0', 1: 'sibsp1', 2: 'sibsp2', 3: 'sibsp3', 4: 'sibsp4', 5: 'sibsp5', 8:'sibsp8'}, inplace=True)
test = pd.concat([test, sibsp], axis=1)
test = test.drop(['SibSp'], axis=1)

# parch
parch = pd.get_dummies(test.Parch)
parch.rename(columns={0: 'parch0', 1: 'parch1', 2: 'parch2', 3: 'parch3', 4: 'parch4', 5: 'parch5', 6:'parch6'}, inplace=True)
test = pd.concat([test, parch], axis=1)
test = test.drop(['Parch'], axis=1)

# embarked
embarked = pd.get_dummies(test.Embarked)
embarked.rename(columns={0: 'embarked0', 1: 'embarked1', 2: 'embarked2'}, inplace=True)
test = pd.concat([test, embarked], axis=1)
test = test.drop(['Embarked'], axis=1)

In [10]:
test = test.drop([9], axis = 1)

### 3. Modeling

In [11]:
# split data
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [12]:
# DecisionTree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('confusion matrix :\n\n', confusion_matrix(y_test, y_pred))
print('\n')
print('classification_report :\n\n', classification_report(y_test, y_pred, target_names=['Unsurvived','Survived']))

confusion matrix :

 [[85 20]
 [21 53]]


classification_report :

              precision    recall  f1-score   support

 Unsurvived       0.80      0.81      0.81       105
   Survived       0.73      0.72      0.72        74

avg / total       0.77      0.77      0.77       179



In [13]:
# RandomForest
clf = RandomForestClassifier(n_estimators = 1000, max_depth=5, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('confusion matrix :\n\n', confusion_matrix(y_test, y_pred))
print('\n')
print('classification_report :\n\n', classification_report(y_test, y_pred, target_names=['Unsurvived','Survived']))

confusion matrix :

 [[96  9]
 [23 51]]


classification_report :

              precision    recall  f1-score   support

 Unsurvived       0.81      0.91      0.86       105
   Survived       0.85      0.69      0.76        74

avg / total       0.82      0.82      0.82       179



In [14]:
clf = xgboost.XGBClassifier(n_estimators=1000, max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('confusion matrix :\n\n', confusion_matrix(y_test, y_pred))
print('\n')
print('classification_report :\n\n', classification_report(y_test, y_pred, target_names=['Unsurvived','Survived']))

confusion matrix :

 [[84 21]
 [18 56]]


classification_report :

              precision    recall  f1-score   support

 Unsurvived       0.82      0.80      0.81       105
   Survived       0.73      0.76      0.74        74

avg / total       0.78      0.78      0.78       179



  if diff:


In [15]:
clf = GradientBoostingClassifier(n_estimators=1000, max_depth=5, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('confusion matrix :\n\n', confusion_matrix(y_test, y_pred))
print('\n')
print('classification_report :\n\n', classification_report(y_test, y_pred, target_names=['Unsurvived','Survived']))

confusion matrix :

 [[86 19]
 [21 53]]


classification_report :

              precision    recall  f1-score   support

 Unsurvived       0.80      0.82      0.81       105
   Survived       0.74      0.72      0.73        74

avg / total       0.78      0.78      0.78       179



### 4. Grid Search 

In [16]:
import warnings
warnings.filterwarnings('ignore')

param_grid = [
    {'n_estimators' : [10, 20, 50, 100, 200, 500, 1000], 'max_depth' : [2, 4, 6, 8, 10]}]

model = xgboost.XGBClassifier()
grid_search = GridSearchCV(model, param_grid, cv = 5, return_train_score = True)
grid_search.fit(X_train, y_train)

print('Best Parameter :\n\n', grid_search.best_params_)

Best Parameter :

 {'max_depth': 2, 'n_estimators': 100}


In [17]:
import warnings
warnings.filterwarnings('ignore')

param_grid = [
    {'n_estimators' : [10, 20, 50, 100, 200, 500, 1000], 'max_depth' : [2, 4, 6, 8, 10]}]

model = RandomForestClassifier()
grid_search = GridSearchCV(model, param_grid, cv = 5, return_train_score = True)
grid_search.fit(X_train, y_train)

print('Best Parameter :\n\n', grid_search.best_params_)

Best Parameter :

 {'max_depth': 8, 'n_estimators': 50}


### 5. Cross Validation

In [18]:
# validation score
clf1 = xgboost.XGBClassifier(n_estimators=100, max_depth=2)
scores = cross_val_score(clf1, X_train, y_train, cv= 10)
print('Corss Validation Score :\n\n', scores)

Corss Validation Score :

 [0.86111111 0.79166667 0.73611111 0.94444444 0.87323944 0.77464789
 0.78873239 0.77464789 0.78571429 0.9       ]


In [19]:
# validation score
clf2 = RandomForestClassifier(max_depth = 8, min_samples_split = 15, n_estimators = 1000).fit(X_train, y_train)
scores = cross_val_score(clf2, X_train, y_train, cv= 10)
print('Corss Validation Score :\n\n', scores)

Corss Validation Score :

 [0.81944444 0.79166667 0.76388889 0.93055556 0.87323944 0.8028169
 0.76056338 0.70422535 0.77142857 0.91428571]


### 6. Submit

In [20]:
clf = xgboost.XGBClassifier(n_estimators=100, max_depth=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(test)

submission = pd.DataFrame(columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = submission_id
submission['Survived'] = y_pred

submission.to_csv('data/xgboost.csv', index = False)

In [21]:
clf = RandomForestClassifier(max_depth = 8, min_samples_split = 15, n_estimators = 1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(test)

submission = pd.DataFrame(columns = ['PassengerId', 'Survived'])
submission['PassengerId'] = submission_id
submission['Survived'] = y_pred

submission.to_csv('data/RandomForest.csv', index = False)