In [321]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [322]:
# Read the raw train and test splits from the shared data directory.
df_train, df_test = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)


In [323]:
# Stack both splits into one frame; the keys become the outer index level
# so the two parts can be told apart after shared preprocessing.
frames = [df_train, df_test]
df = pd.concat(frames, keys=['train', 'test'])

In [324]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,Unnamed: 1,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
train,0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
train,1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
train,2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
train,3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
train,4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [325]:
def extract_title_from_string(full_name):
    """Return the honorific found in a passenger name, without its period.

    The name is split on whitespace and the first token that ends in "." is
    returned with that trailing period removed, e.g.
    "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    full_name : str
        Raw passenger name. Non-string values (NaN, None, ...) are treated
        as having no title instead of raising.

    Returns
    -------
    str or float
        The title text, or ``np.nan`` when no token ends in a period or the
        input is not a string.
    """
    # A missing name (NaN / None) is not a string; calling .split() on it
    # would raise AttributeError, so report "no title" the same way the
    # no-match case below does.
    if not isinstance(full_name, str):
        return np.nan
    for token in full_name.split():
        if token.endswith("."):
            return token[:-1]
    return np.nan

In [326]:
# Replace each full name with just its title; the helper is passed directly
# rather than through a wrapping lambda.
df["Name"] = df["Name"].map(extract_title_from_string)

In [327]:
# Turn every text column into integer category codes.
# pandas assigns code -1 to missing values.
text_columns = ["Sex", "Cabin", "Ticket", "Embarked", "Name"]
for column in text_columns:
    encoded = pd.Categorical(df[column])
    df[column] = encoded.codes
	

In [None]:
# Impute missing ages with the mean age of the combined train+test frame.
# A single fillna-and-assign replaces the chained `df["Age"][mask] = ...`
# form, which triggers SettingWithCopyWarning and is a silent no-op when
# pandas Copy-on-Write is enabled.
df["Age"] = df["Age"].fillna(df["Age"].mean())

In [329]:
# Preview the first five rows again now that the text columns hold codes.
df.head(n=5)


Unnamed: 0,Unnamed: 1,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
train,0,1,0.0,3,13,1,22.0,1,0,720,7.25,-1,2
train,1,2,1.0,1,14,0,38.0,1,0,816,71.2833,106,0
train,2,3,1.0,3,10,0,26.0,0,0,914,7.925,-1,2
train,3,4,1.0,1,14,0,35.0,1,0,65,53.1,70,2
train,4,5,0.0,3,13,1,35.0,0,0,649,8.05,-1,2


In [330]:
# Separate the combined frame into its two parts again by selecting on the
# outer index level created from the concat keys.
df_train, df_test = df.loc["train"], df.loc["test"]
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,13,1,22.0,1,0,720,7.25,-1,2
1,2,1.0,1,14,0,38.0,1,0,816,71.2833,106,0
2,3,1.0,3,10,0,26.0,0,0,914,7.925,-1,2
3,4,1.0,1,14,0,35.0,1,0,65,53.1,70,2
4,5,0.0,3,13,1,35.0,0,0,649,8.05,-1,2


In [331]:
# Hold out part of the labelled data; random_state fixes the shuffle so the
# split is reproducible.
feature_columns = ["Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
X_train, X_test, y_train, y_test = train_test_split(
    df_train[feature_columns],
    df_train["Survived"],
    random_state=100,
)

In [332]:
# Model selection: standardise the features, then grid-search the SVC kernel
# using GridSearchCV's default cross-validation on the training split.
# The stand-alone SVC / GaussianNB instances previously created here were
# never fitted or referenced, so they have been removed.
parameters = {'clf__kernel': ['linear', 'rbf', 'sigmoid']}
pipe = Pipeline([('scaler', StandardScaler()), ('clf', SVC())])
clf = GridSearchCV(pipe, parameters)
clf.fit(X_train, y_train)
print(f'best score: {clf.best_score_}')
print(f'best_parameters:{clf.best_params_}')

best score: 0.8263270115587475
best_parameters:{'clf__kernel': 'rbf'}


In [None]:
# Feature matrix for the unlabelled test split. `.copy()` makes X_test an
# independent frame, so the Fare fill below is a plain column assignment
# rather than a chained write into a slice of df_test (which raises
# SettingWithCopyWarning and does nothing under pandas Copy-on-Write).
# NOTE: this rebinds X_test, replacing the hold-out features returned by
# train_test_split.
X_test = df_test[["Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]].copy()
# Impute missing fares with the mean fare of this test split.
X_test["Fare"] = X_test["Fare"].fillna(X_test["Fare"].mean())

In [334]:
# Predict with the refitted best estimator from the grid search.
predictions = clf.predict(X=X_test)

In [335]:
# Pull the passenger ids of the test split out as a plain NumPy array.
PassengerId = df_test.loc[:, "PassengerId"].to_numpy()

In [337]:
# Build a two-row frame (ids, predictions) and transpose it so that each
# passenger becomes one row.
stacked_rows = pd.DataFrame([PassengerId, predictions])
df_upload = stacked_rows.transpose()

In [338]:
# Name the two columns of the upload frame.
submission_columns = ['PassengerId', 'Survived']
df_upload.columns = submission_columns

In [347]:
# Convert each column to numeric, downcasting to the smallest integer dtype
# where possible.
df_upload = df_upload.apply(lambda column: pd.to_numeric(column, downcast='integer'))

In [348]:
# Display the upload frame as a final visual check before it is written out.
df_upload

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [349]:
# Write the upload file; index=False keeps the row index out of the CSV.
df_upload.to_csv(path_or_buf='../data/upload.csv', index=False)