In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import math 
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [2]:
test = pd.read_csv('https://raw.githubusercontent.com/cicbeast/PythonCoding/main/KaggleComps/titanic/test.csv', index_col=0)
train = pd.read_csv("https://raw.githubusercontent.com/cicbeast/PythonCoding/main/KaggleComps/titanic/train.csv", index_col=0)

In [3]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
# From the basic info call I know that Cabin will not be a feature that I can reasonably use.
# Next, a full pandas profile

In [7]:
# !pip install pandas-profiling

In [8]:
# from pandas_profiling import ProfileReport
# profile = ProfileReport(train)
# profile.to_file("titanic_train.html")

In [9]:
# Let's go for a baseline prediction
# Based on 'Women and Children first', let's use those parameters to
# predict survival

In [6]:
len(train[train['Sex'] == 'female'])

314

In [7]:
len(train[train['Age'] <= 15])

83

In [8]:
# These numbers seem reasonable so lets take a look
titanic_women = train[train['Sex'] == 'female']
titanic_children = train[train['Age'] <= 15]

titanic_women.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [9]:
titanic_children.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q


In [10]:
# Now lets get a women and children df without duplicates
women_and_children = pd.concat([titanic_women,titanic_children]).drop_duplicates().reset_index(drop=True)
women_and_children.shape

(354, 11)

In [11]:
# Our prediction, based on "Women and Children first" would be that
# these 354 people from the titanic survived 
# Let's see how we did with the training set
base = len(women_and_children[women_and_children['Survived'] == 1])
surv = len(train[train['Survived'] == 1])
base/surv

0.7426900584795322

In [12]:
# Our baseline prediction got 74 percent of the survivors in the 
# training set. Not bad. Let's make our first submission using these
# parameters
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [13]:
testsub = test.copy()
testsub.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
# testsub.loc[testsub['Sex'] == "female", 'Survived'] = 1
# testsub.loc[testsub['Age'] <= 15, 'Survived'] = 1

# testsub.head()

In [None]:
# sub = testsub[['Survived']]
# sub.reset_index(level=0, inplace=True)

# sub.head()


In [None]:
# sub.to_csv('sub.csv', index=False)

In [None]:
# Our baseline gives us a 75% Accuracy. Not bad at all
# Now we will try to beat this baseline using modeling
# First we'll need to install/import the required modules
# !pip install category_encoders

In [14]:
# Let's do some feature engineering before we do the train test split
# I want to create a new 'Child' feature, based on age
train.loc[train['Age'] <= 15, 'Child'] = 1
train.loc[train['Age'] > 15, 'Child'] = 0

train.sample(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
825,0,3,"Panula, Master. Urho Abraham",male,2.0,4,1,3101295,39.6875,,S,1.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0
205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18.0,0,0,A/5 3540,8.05,,S,0.0
488,0,1,"Kent, Mr. Edward Austin",male,58.0,0,0,11771,29.7,B37,C,0.0
381,1,1,"Bidois, Miss. Rosalie",female,42.0,0,0,PC 17757,227.525,,C,0.0


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
 11  Child     714 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 90.5+ KB


In [16]:
# Now we'll clean up our new Child feature
train['Child'].fillna(0, inplace=True)

train['Child'] = train['Child'].astype('int64')

train.sample(6)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C,0
418,1,2,"Silven, Miss. Lyyli Karoliina",female,18.0,0,2,250652,13.0,,S,0
72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S,0
837,0,3,"Pasic, Mr. Jakob",male,21.0,0,0,315097,8.6625,,S,0
32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C,0
509,0,3,"Olsen, Mr. Henry Margido",male,28.0,0,0,C 4001,22.525,,S,0


In [17]:
train.sample(15)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S,0
426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S,0
147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S,0
224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S,0
785,0,3,"Ali, Mr. William",male,25.0,0,0,SOTON/O.Q. 3101312,7.05,,S,0
406,0,2,"Gale, Mr. Shadrach",male,34.0,1,0,28664,21.0,,S,0
397,0,3,"Olsson, Miss. Elina",female,31.0,0,0,350407,7.8542,,S,0
400,1,2,"Trout, Mrs. William H (Jessie L)",female,28.0,0,0,240929,12.65,,S,0
249,1,1,"Beckwith, Mr. Richard Leonard",male,37.0,1,1,11751,52.5542,D35,S,0
262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3.0,4,2,347077,31.3875,,S,1


In [18]:
len(train[train['Child'] == 1])

83

In [19]:
# I know that young men held the 'Master' prefix, so I can use that also
train.loc[train['Name'].str.contains("Master"), 'Child'] = 1

len(train[train['Child'] == 1])


87

In [20]:
# Now let's do our train(val)-test split
xtrain_features = train.drop(columns=['Survived'])
ytrain_features = train['Survived']

xtrain_features.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [21]:
ytrain_features.head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [22]:
xtrain_features['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [23]:
xtrain_features['Fare'].nunique()

248

In [30]:
# We can consider scaling the Fare feature.
scaler = StandardScaler()
xtrain_features['Fare'] = scaler.fit_transform(xtrain_features['Fare'])

xtrain_features['Fare'].describe()

ValueError: Expected 2D array, got 1D array instead:
array=[  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55     8.05    31.275    7.8542  16.
  29.125   13.      18.       7.225   26.      13.       8.0292  35.5
  21.075   31.3875   7.225  263.       7.8792   7.8958  27.7208 146.5208
   7.75    10.5     82.1708  52.       7.2292   8.05    18.      11.2417
   9.475   21.       7.8958  41.5792   7.8792   8.05    15.5      7.75
  21.6792  17.8     39.6875   7.8     76.7292  26.      61.9792  35.5
  10.5      7.2292  27.75    46.9      7.2292  80.      83.475   27.9
  27.7208  15.2458  10.5      8.1583   7.925    8.6625  10.5     46.9
  73.5     14.4542  56.4958   7.65     7.8958   8.05    29.      12.475
   9.       9.5      7.7875  47.1     10.5     15.85    34.375    8.05
 263.       8.05     8.05     7.8542  61.175   20.575    7.25     8.05
  34.6542  63.3583  23.      26.       7.8958   7.8958  77.2875   8.6542
   7.925    7.8958   7.65     7.775    7.8958  24.15    52.      14.4542
   8.05     9.825   14.4583   7.925    7.75    21.     247.5208  31.275
  73.5      8.05    30.0708  13.      77.2875  11.2417   7.75     7.1417
  22.3583   6.975    7.8958   7.05    14.5     26.      13.      15.0458
  26.2833  53.1      9.2167  79.2     15.2458   7.75    15.85     6.75
  11.5     36.75     7.7958  34.375   26.      13.      12.525   66.6
   8.05    14.5      7.3125  61.3792   7.7333   8.05     8.6625  69.55
  16.1     15.75     7.775    8.6625  39.6875  20.525   55.      27.9
  25.925   56.4958  33.5     29.125   11.1333   7.925   30.6958   7.8542
  25.4667  28.7125  13.       0.      69.55    15.05    31.3875  39.
  22.025   50.      15.5     26.55    15.5      7.8958  13.      13.
   7.8542  26.      27.7208 146.5208   7.75     8.4042   7.75    13.
   9.5     69.55     6.4958   7.225    8.05    10.4625  15.85    18.7875
   7.75    31.       7.05    21.       7.25    13.       7.75   113.275
   7.925   27.      76.2917  10.5      8.05    13.       8.05     7.8958
  90.       9.35    10.5      7.25    13.      25.4667  83.475    7.775
  13.5     31.3875  10.5      7.55    26.      26.25    10.5     12.275
  14.4542  15.5     10.5      7.125    7.225   90.       7.775   14.5
  52.5542  26.       7.25    10.4625  26.55    16.1     20.2125  15.2458
  79.2     86.5    512.3292  26.       7.75    31.3875  79.65     0.
   7.75    10.5     39.6875   7.775  153.4625 135.6333  31.       0.
  19.5     29.7      7.75    77.9583   7.75     0.      29.125   20.25
   7.75     7.8542   9.5      8.05    26.       8.6625   9.5      7.8958
  13.       7.75    78.85    91.0792  12.875    8.85     7.8958  27.7208
   7.2292 151.55    30.5    247.5208   7.75    23.25     0.      12.35
   8.05   151.55   110.8833 108.9     24.      56.9292  83.1583 262.375
  26.       7.8958  26.25     7.8542  26.      14.     164.8667 134.5
   7.25     7.8958  12.35    29.      69.55   135.6333   6.2375  13.
  20.525   57.9792  23.25    28.5    153.4625  18.     133.65     7.8958
  66.6    134.5      8.05    35.5     26.     263.      13.      13.
  13.      13.      13.      16.1     15.9      8.6625   9.225   35.
   7.2292  17.8      7.225    9.5     55.      13.       7.8792   7.8792
  27.9     27.7208  14.4542   7.05    15.5      7.25    75.25     7.2292
   7.75    69.3     55.4417   6.4958   8.05   135.6333  21.075   82.1708
   7.25   211.5      4.0125   7.775  227.525   15.7417   7.925   52.
   7.8958  73.5     46.9     13.       7.7292  12.     120.       7.7958
   7.925  113.275   16.7      7.7958   7.8542  26.      10.5     12.65
   7.925    8.05     9.825   15.85     8.6625  21.       7.75    18.75
   7.775   25.4667   7.8958   6.8583  90.       0.       7.925    8.05
  32.5     13.      13.      24.15     7.8958   7.7333   7.875   14.4
  20.2125   7.25    26.      26.       7.75     8.05    26.55    16.1
  26.       7.125   55.9    120.      34.375   18.75   263.      10.5
  26.25     9.5      7.775   13.       8.1125  81.8583  19.5     26.55
  19.2583  30.5     27.75    19.9667  27.75    89.1042   8.05     7.8958
  26.55    51.8625  10.5      7.75    26.55     8.05    38.5     13.
   8.05     7.05     0.      26.55     7.725   19.2583   7.25     8.6625
  27.75    13.7917   9.8375  52.      21.       7.0458   7.5208  12.2875
  46.9      0.       8.05     9.5875  91.0792  25.4667  90.      29.7
   8.05    15.9     19.9667   7.25    30.5     49.5042   8.05    14.4583
  78.2667  15.1    151.55     7.7958   8.6625   7.75     7.6292   9.5875
  86.5    108.9     26.      26.55    22.525   56.4958   7.75     8.05
  26.2875  59.4      7.4958  34.0208  10.5     24.15    26.       7.8958
  93.5      7.8958   7.225   57.9792   7.2292   7.75    10.5    221.7792
   7.925   11.5     26.       7.2292   7.2292  22.3583   8.6625  26.25
  26.55   106.425   14.5     49.5     71.      31.275   31.275   26.
 106.425   26.      26.      13.8625  20.525   36.75   110.8833  26.
   7.8292   7.225    7.775   26.55    39.6    227.525   79.65    17.4
   7.75     7.8958  13.5      8.05     8.05    24.15     7.8958  21.075
   7.2292   7.8542  10.5     51.4792  26.3875   7.75     8.05    14.5
  13.      55.9     14.4583   7.925   30.     110.8833  26.      40.125
   8.7125  79.65    15.      79.2      8.05     8.05     7.125   78.2667
   7.25     7.75    26.      24.15    33.       0.       7.225   56.9292
  27.       7.8958  42.4      8.05    26.55    15.55     7.8958  30.5
  41.5792 153.4625  31.275    7.05    15.5      7.75     8.05    65.
  14.4     16.1     39.      10.5     14.4542  52.5542  15.7417   7.8542
  16.1     32.3208  12.35    77.9583   7.8958   7.7333  30.       7.0542
  30.5      0.      27.9     13.       7.925   26.25    39.6875  16.1
   7.8542  69.3     27.9     56.4958  19.2583  76.7292   7.8958  35.5
   7.55     7.55     7.8958  23.       8.4333   7.8292   6.75    73.5
   7.8958  15.5     13.     113.275  133.65     7.225   25.5875   7.4958
   7.925   73.5     13.       7.775    8.05    52.      39.      52.
  10.5     13.       0.       7.775    8.05     9.8417  46.9    512.3292
   8.1375  76.7292   9.225   46.9     39.      41.5792  39.6875  10.1708
   7.7958 211.3375  57.      13.4167  56.4958   7.225   26.55    13.5
   8.05     7.7333 110.8833   7.65   227.525   26.2875  14.4542   7.7417
   7.8542  26.      13.5     26.2875 151.55    15.2458  49.5042  26.55
  52.       9.4833  13.       7.65   227.525   10.5     15.5      7.775
  33.       7.0542  13.      13.      53.1      8.6625  21.       7.7375
  26.       7.925  211.3375  18.7875   0.      13.      13.      16.1
  34.375  512.3292   7.8958   7.8958  30.      78.85   262.375   16.1
   7.925   71.      20.25    13.      53.1      7.75    23.      12.475
   9.5      7.8958  65.      14.5      7.7958  11.5      8.05    86.5
  14.5      7.125    7.2292 120.       7.775   77.9583  39.6      7.75
  24.15     8.3625   9.5      7.8542  10.5      7.225   23.       7.75
   7.75    12.475    7.7375 211.3375   7.2292  57.      30.      23.45
   7.05     7.25     7.4958  29.125   20.575   79.2      7.75    26.
  69.55    30.6958   7.8958  13.      25.9292   8.6833   7.2292  24.15
  13.      26.25   120.       8.5167   6.975    7.775    0.       7.775
  13.      53.1      7.8875  24.15    10.5     31.275    8.05     0.
   7.925   37.0042   6.45    27.9     93.5      8.6625   0.      12.475
  39.6875   6.95    56.4958  37.0042   7.75    80.      14.4542  18.75
   7.2292   7.8542   8.3     83.1583   8.6625   8.05    56.4958  29.7
   7.925   10.5     31.       6.4375   8.6625   7.55    69.55     7.8958
  33.      89.1042  31.275    7.775   15.2458  39.4     26.       9.35
 164.8667  26.55    19.2583   7.2292  14.1083  11.5     25.9292  69.55
  13.      13.      13.8583  50.4958   9.5     11.1333   7.8958  52.5542
   5.       9.      24.       7.225    9.8458   7.8958   7.8958  83.1583
  26.       7.8958  10.5167  10.5      7.05    29.125   13.      30.
  23.45    30.       7.75  ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [51]:
xtrain, xval, ytrain, yval = train_test_split(xtrain_features, ytrain_features, test_size=0.25, random_state=42, stratify=ytrain_features)

In [52]:
print(xtrain.shape, xval.shape, ytrain.shape, yval.shape)

(668, 11) (223, 11) (668,) (223,)


In [53]:
# Our next step would be to encode 'Sex and Embarked'
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain_encoded = encoder.fit_transform(xtrain)

  elif pd.api.types.is_categorical(cols):


In [54]:
xtrain_encoded.head()

Unnamed: 0_level_0,Pclass,"Name_Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)","Name_Pengelly, Mr. Frederick William","Name_Gillespie, Mr. William Henry","Name_Kink-Heilmann, Miss. Luise Gretchen","Name_Rugg, Miss. Emily","Name_Marvin, Mr. Daniel Warner","Name_Panula, Mrs. Juha (Maria Emilia Ojala)","Name_Lobb, Mr. William Arthur","Name_Boulos, Mrs. Joseph (Sultana)",...,Cabin_B51 B53 B55,Cabin_B39,Cabin_C49,Cabin_C86,Cabin_E63,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan,Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
487,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
239,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
723,2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
185,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
57,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Not sure if we like this

In [28]:
xtrain.sample(6)
 7   Fare      668 non-null    float64

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
480,3,"Hirvonen, Miss. Hildur E",female,2.0,0,1,3101298,12.2875,,S
123,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C
723,2,"Gillespie, Mr. William Henry",male,34.0,0,0,12233,13.0,,S
19,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S
689,3,"Fischer, Mr. Eberhard Thelander",male,18.0,0,0,350036,7.7958,,S
840,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C


In [29]:
xval.sample(5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
439,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S
566,3,"Davies, Mr. Alfred J",male,24.0,2,0,A/4 48871,24.15,,S
884,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
322,3,"Danoff, Mr. Yoto",male,27.0,0,0,349219,7.8958,,S
490,3,"Coutts, Master. Eden Leslie ""Neville""",male,9.0,1,1,C.A. 37671,15.9,,S


In [30]:
ytrain.sample(6)

PassengerId
263    0
768    0
721    1
841    0
433    1
536    1
Name: Survived, dtype: int64

In [31]:
yval.sample(5)

PassengerId
72     0
727    1
84     0
879    0
803    1
Name: Survived, dtype: int64

In [33]:
xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 487 to 822
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    668 non-null    int64  
 1   Name      668 non-null    object 
 2   Sex       668 non-null    object 
 3   Age       537 non-null    float64
 4   SibSp     668 non-null    int64  
 5   Parch     668 non-null    int64  
 6   Ticket    668 non-null    object 
 7   Fare      668 non-null    float64
 8   Cabin     146 non-null    object 
 9   Embarked  666 non-null    object 
 10  Child     537 non-null    float64
dtypes: float64(3), int64(3), object(5)
memory usage: 62.6+ KB
