In [1]:
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.svm import SVC

# Explore Data

In [2]:
df = pd.read_csv('train.csv')

### Remove 'Ticket', 'PassengerId','Cabin'

In [3]:
df.drop(['PassengerId','Ticket'], axis=1, inplace=True)

### Convert Names & Cabin to theri character length

In [4]:
df['Name'] = df['Name'].str.len()
df['Cabin'] = df['Cabin'].str.len()

### Select target and features

In [5]:
X = df.iloc[:,1:]
y = df['Survived']

### Split data to train and test

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state= 42)

In [7]:
X_train.shape, X_test.shape

((534, 9), (357, 9))

### Categorizing numeric and categorical features

In [9]:
numeric_features = ['Name', 'Fare','Age','Cabin']
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="mean"), 
    RobustScaler()
    )

In [10]:
categorical_features = ['Embarked', 'Sex','Pclass']
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"), 
    OneHotEncoder(handle_unknown="ignore")
    )

### Pre-Process transformations

In [11]:
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat',categorical_transformer, categorical_features),
                  ('do_nothing', 'passthrough',['Parch','SibSp'])])

# A. Model creation with 'LogisticRegression'

In [13]:
pipeline_SVC = make_pipeline(preprocessor, SVC(kernel='rbf', C=0.1))

### Fit the train data with 'LogisticRegression'

In [14]:
pipeline_SVC.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('robustscaler',
                                                                   RobustScaler())]),
                                                  ['Name', 'Fare', 'Age',
                                                   'Cabin']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='

### Calculate a cross-validation score with 5 folds

In [16]:
from sklearn.model_selection import cross_validate

cv_SVC = cross_validate(pipeline_SVC, X_train, y_train,
                    cv=5,
                    scoring='accuracy',
                    return_train_score=True
)
# convert the dictionary of lists into a DataFrame
cv_SVC = pd.DataFrame(cv_SVC)
print(cv_SVC)

   fit_time  score_time  test_score  train_score
0  0.062140    0.043111    0.813084     0.768150
1  0.048043    0.016244    0.775701     0.786885
2  0.032417    0.012627    0.757009     0.810304
3  0.025706    0.015650    0.691589     0.810304
4  0.022711    0.011570    0.811321     0.780374


In [17]:
cv_SVC.mean()

fit_time       0.038203
score_time     0.019840
test_score     0.769741
train_score    0.791204
dtype: float64

### Calculate the train and test accuracy

In [18]:
print('train acc', round(pipeline_SVC.score(X_train, y_train), 3))
print('test acc ', round(pipeline_SVC.score(X_test, y_test), 3))

train acc 0.805
test acc  0.798


### Predict on new test data with 'Support Vector Machine'

In [22]:
df_test = pd.read_csv('test.csv')

In [23]:
df_test.drop(['Ticket'], axis=1, inplace=True)
df_test['Name'] = df_test['Name'].str.len()
df_test['Cabin'] = df_test['Cabin'].str.len()
pred_log_reg_SVC=df_test[['PassengerId']]
del df_test['PassengerId']

In [24]:
test_pred_SVC = pipeline_SVC.predict(df_test)

In [25]:
pred_log_reg_SVC['Survived']=test_pred_SVC

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_log_reg_SVC['Survived']=test_pred_SVC


In [26]:
pred_log_reg_SVC.to_csv('Survived_SVC.csv', index=False)