In [509]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


In [510]:
# Load the labelled training set and the unlabelled test set from the working directory.
data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [511]:
# Remove the columns that are not used as model inputs from both frames.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data = data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)


In [512]:
# Single stratified 80/20 split. The fixed random_state makes the train/validation
# partition (and therefore the model comparison below) reproducible across runs.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [513]:
# The splitter is configured for exactly one split, so take it directly
# instead of looping over the generator.
train_index, test_index = next(splitter.split(data, data['Survived']))
train_data = data.iloc[train_index]
validation_data = data.iloc[test_index]

In [514]:
# Separate the 'Survived' target from the feature columns for both partitions.
y_train = train_data['Survived']
X_train = train_data.drop(columns=['Survived'])

y_validation = validation_data['Survived']
X_validation = validation_data.drop(columns=['Survived'])


In [515]:
# Column names grouped by dtype; each group is routed to its own preprocessing pipeline.
numeric_frame = X_train.select_dtypes(include=['int64', 'float64'])
categorical_frame = X_train.select_dtypes(include=['category', 'bool', 'object'])
numerical_features = list(numeric_frame.columns)
categorical_features = list(categorical_frame.columns)

In [516]:
# Numeric columns: fill missing values with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on categories
# that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [517]:
# Apply each sub-pipeline to its own group of columns.
column_routes = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [518]:
# Candidate classifiers compared on the validation split.
# The random forest and XGBoost models are seeded so that the reported
# accuracies (and the winning model) are reproducible between runs.
reg = LogisticRegression()
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
svm = SVC()
models = [reg, rf, xgb, svm]

In [519]:
# Fit every candidate behind the shared preprocessor and keep the pipeline
# with the highest validation accuracy (ties keep the earlier candidate).
best_accuracy = -1
best_model = None
for model in models:
    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )
    pipe.fit(X_train, y_train)
    y_val = pipe.predict(X_validation)
    acc = accuracy_score(y_validation, y_val)
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = pipe
print(f"The best model is {best_model} with accuracy of {best_accuracy}")

# Predict with the selected pipeline. `pipe` is merely the last candidate fitted
# in the loop, which is not necessarily the best one.
y_pred = best_model.predict(test_data)

# NOTE(review): PassengerId is no longer present in test_data at this point, so the
# ids are regenerated assuming they are contiguous and start at 892 - verify
# against test.csv.
result = pd.DataFrame(
    {
        "PassengerId": list(range(892, 892 + len(y_pred))),
        "Survived": y_pred
    }
)
result.to_csv("submission.csv", index=False)

The best model is Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncode