In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

X_train = pd.read_csv("titanic/train.csv")
X_test = pd.read_csv("titanic/test.csv")

In [27]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
y_train = X_train["Survived"]
del X_train["Survived"]

# y_test = X_test["Survived"]
# del X_test["Survived"]

In [29]:
print("Embarked:",X_train["Embarked"].unique())#City embarked from
print("Cabin:",X_train["Cabin"].unique())#Cabin, if applicable
print("Sex:",X_train["Sex"].unique())
#Ticket number

Embarked: ['S' 'C' 'Q' nan]
Cabin: [nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C148']
Sex: ['m

In [30]:
#data prep
def getCabinClass(cabin):
    if pd.isnull(cabin):
        return 0
    return cabin[0]
X_train["CabinClass"] = X_train["Cabin"].apply(getCabinClass)
print("CabinClass:",X_train["CabinClass"].unique())

CabinClass: [0 'C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']


In [31]:
#Ordinal encoding
categories = [0,"T","G","F","E","D","C","B","A"]
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder([categories])#categories seed to be of shape (n_features,)
X_train["CabinClassEnc"] = ord_enc.fit_transform(X_train["CabinClass"].values.reshape(-1,1))
del X_train["Cabin"]
del X_train["CabinClass"]
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,CabinClassEnc
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0.0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,6.0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0.0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,6.0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0.0


In [32]:
# ord_enc.fit_transform(X_train["Embarked"].values.reshape(-1,1))
# ord_enc.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))
# #We defined categories in intialization

ord_enc_gen = OrdinalEncoder()
# #When categories aren't given the order isn't known and is determined through comparisons
# ord_enc_gen.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))#int vs string comparison error
ord_enc_gen.fit_transform(X_train["Embarked"].fillna("0").values.reshape(-1,1))
ord_enc_gen.categories_

[array(['0', 'C', 'Q', 'S'], dtype=object)]

In [33]:
#One hot encoding
from sklearn.preprocessing import OneHotEncoder
one_hot_enc = OneHotEncoder()#sparse=True
transformed = one_hot_enc.fit_transform(X_train[["Sex","Embarked"]].fillna("0"))
print(transformed)#by default returns a sparse array
print(one_hot_enc.categories_)
oneHotDF = pd.DataFrame(transformed.toarray(),columns = one_hot_enc.get_feature_names())
print(oneHotDF)
X_train = pd.concat([X_train,oneHotDF],axis=1)
del X_train["Sex"]
del X_train["Embarked"]
X_train

  (0, 1)	1.0
  (0, 5)	1.0
  (1, 0)	1.0
  (1, 3)	1.0
  (2, 0)	1.0
  (2, 5)	1.0
  (3, 0)	1.0
  (3, 5)	1.0
  (4, 1)	1.0
  (4, 5)	1.0
  (5, 1)	1.0
  (5, 4)	1.0
  (6, 1)	1.0
  (6, 5)	1.0
  (7, 1)	1.0
  (7, 5)	1.0
  (8, 0)	1.0
  (8, 5)	1.0
  (9, 0)	1.0
  (9, 3)	1.0
  (10, 0)	1.0
  (10, 5)	1.0
  (11, 0)	1.0
  (11, 5)	1.0
  (12, 1)	1.0
  :	:
  (878, 5)	1.0
  (879, 0)	1.0
  (879, 3)	1.0
  (880, 0)	1.0
  (880, 5)	1.0
  (881, 1)	1.0
  (881, 5)	1.0
  (882, 0)	1.0
  (882, 5)	1.0
  (883, 1)	1.0
  (883, 5)	1.0
  (884, 1)	1.0
  (884, 5)	1.0
  (885, 0)	1.0
  (885, 4)	1.0
  (886, 1)	1.0
  (886, 5)	1.0
  (887, 0)	1.0
  (887, 5)	1.0
  (888, 0)	1.0
  (888, 5)	1.0
  (889, 1)	1.0
  (889, 3)	1.0
  (890, 1)	1.0
  (890, 4)	1.0
[array(['female', 'male'], dtype=object), array(['0', 'C', 'Q', 'S'], dtype=object)]
     x0_female  x0_male  x1_0  x1_C  x1_Q  x1_S
0          0.0      1.0   0.0   0.0   0.0   1.0
1          1.0      0.0   0.0   1.0   0.0   0.0
2          1.0      0.0   0.0   0.0   0.0   1.0
3          1

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,CabinClassEnc,x0_female,x0_male,x1_0,x1_C,x1_Q,x1_S
0,1,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,6.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,6.0,1.0,0.0,0.0,0.0,0.0,1.0
4,5,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,0.0,0.0,1.0,0.0,0.0,0.0,1.0
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,7.0,1.0,0.0,0.0,0.0,0.0,1.0
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,0.0,1.0,0.0,0.0,0.0,0.0,1.0
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,6.0,0.0,1.0,0.0,1.0,0.0,0.0


In [37]:
# one_hot_enc.transform(X_train[["Embarked","Sex"]].fillna("0"))
# one_hot_enc.transform(X_train[["Sex","Embarked"]].fillna("1"))

KeyError: "None of [Index(['Sex', 'Embarked'], dtype='object')] are in the [columns]"

In [38]:
X_train = X_train.drop(labels=["Name","Ticket"],axis=1)
print(X_train.info())
X_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    891 non-null    int64  
 1   Pclass         891 non-null    int64  
 2   Age            714 non-null    float64
 3   SibSp          891 non-null    int64  
 4   Parch          891 non-null    int64  
 5   Fare           891 non-null    float64
 6   CabinClassEnc  891 non-null    float64
 7   x0_female      891 non-null    float64
 8   x0_male        891 non-null    float64
 9   x1_0           891 non-null    float64
 10  x1_C           891 non-null    float64
 11  x1_Q           891 non-null    float64
 12  x1_S           891 non-null    float64
dtypes: float64(9), int64(4)
memory usage: 90.6 KB
None


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,CabinClassEnc,x0_female,x0_male,x1_0,x1_C,x1_Q,x1_S
0,1,3,22.0,1,0,7.25,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2,1,38.0,1,0,71.2833,6.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3,3,26.0,0,0,7.925,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,4,1,35.0,1,0,53.1,6.0,1.0,0.0,0.0,0.0,0.0,1.0
4,5,3,35.0,0,0,8.05,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [39]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy="median")
X_train = sim_imp.fit_transform(X_train)

In [40]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
# log_reg.score(X_train,y_train)#We need to transform our test dataset too

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.797979797979798

In [None]:
#embeddings