In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [28]:
X_train = train.drop(columns=['Name', 'Survived', 'Ticket', 'Cabin', 'PassengerId'])
X_test = test.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
y_train = train['Survived']

In [29]:
X_train.loc[X_train['Embarked'] == 'S', 'Embarked'] = 1
X_train.loc[X_train['Embarked'] == 'Q', 'Embarked'] = 2
X_train.loc[X_train['Embarked'] == 'C', 'Embarked'] = 3

X_test.loc[X_test['Embarked'] == 'S', 'Embarked'] = 1
X_test.loc[X_test['Embarked'] == 'Q', 'Embarked'] = 2
X_test.loc[X_test['Embarked'] == 'C', 'Embarked'] = 3

X_train.loc[X_train['Sex'] == 'male', 'Sex'] = 1
X_train.loc[X_train['Sex'] == 'female', 'Sex'] = 0

X_test.loc[X_test['Sex'] == 'male', 'Sex'] = 1
X_test.loc[X_test['Sex'] == 'female', 'Sex'] = 0
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,1
1,1,0,38.0,1,0,71.2833,3
2,3,0,26.0,0,0,7.925,1
3,1,0,35.0,1,0,53.1,1
4,3,1,35.0,0,0,8.05,1


In [30]:
X_train.corr()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Pclass,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495,-0.164681
Sex,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333,-0.11032
Age,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067,0.032565
SibSp,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651,-0.0689
Parch,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225,-0.040449
Fare,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0,0.226311
Embarked,-0.164681,-0.11032,0.032565,-0.0689,-0.040449,0.226311,1.0


In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

def missing_value_columns(X):
    miss_col_names = []
    for col in X.columns:
        if any(X[col].isnull()):
            miss_col_names.append(col)
    return miss_col_names

missing_value_columns(X_train)

['Age', 'Embarked']

In [32]:
imputer = SimpleImputer(strategy='median')
concat_train_test = pd.concat([X_test, X_train])
miss = missing_value_columns(concat_train_test)
imputer.fit(concat_train_test[miss])
X_train[miss] = imputer.transform(X_train[miss])
X_test[miss] = imputer.transform(X_test[miss])
concat_train_test[miss] = imputer.transform(concat_train_test[miss])


In [38]:
scaler = StandardScaler()
scaler.fit(concat_train_test[['Age', 'Fare']])
X_test[['Age', 'Fare']] = scaler.transform(X_test[['Age', 'Fare']])
X_train[['Age', 'Fare']] = scaler.transform(X_train[['Age', 'Fare']])

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

params = {'n_neighbors': range(2, 35), 'weights': ('distance', 'uniform'), 'metric': ('cosine', 'euclidean', 'manhattan')}

search = GridSearchCV(KNeighborsClassifier(), params)
search.fit(X_train, y_train)
print(search.best_params_)
print(accuracy_score(y_train, search.predict(X_train)))

{'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'uniform'}
0.835016835016835


In [40]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
accuracy_score(y_train, tree.predict(X_train))
live_or_not = search.predict(X_test)
df = pd.DataFrame(columns=['PassengerId', 'Survived'])
df['PassengerId'] = test['PassengerId']
df['Survived'] = live_or_not
df.to_csv('gender_submission.csv', index=False)