In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

# Read the train and test CSV files from the titanic/ directory into DataFrames.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("titanic/train.csv", "titanic/test.csv")
)

In [2]:
# Preview the first rows of the freshly loaded training frame.
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Split the target off the features: pop() returns the "Survived" column
# and removes it from X_train in the same step.
y_train = X_train.pop("Survived")

# y_test = X_test["Survived"]
# del X_test["Survived"]

In [4]:
# Per-column dtypes of the training features; used below to pick out the numeric columns.
dataInfo = X_train.dtypes
dataInfo

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
# Collect the names of the columns whose dtype is int64 or float64.
numColumns = [
    column_name
    for column_name, column_dtype in dataInfo.items()
    if column_dtype in (np.int64, np.float64)
]
print(numColumns)

# Restrict both splits to those numeric columns.
X_train_numerical = X_train[numColumns]
X_test_numerical = X_test[numColumns]
X_train_numerical

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.0,1,0,7.2500
1,2,1,38.0,1,0,71.2833
2,3,3,26.0,0,0,7.9250
3,4,1,35.0,1,0,53.1000
4,5,3,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0000
887,888,1,19.0,0,0,30.0000
888,889,3,,1,2,23.4500
889,890,1,26.0,0,0,30.0000


In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Estimators for this cell; log_reg is only instantiated here, it is not fitted.
log_reg = LogisticRegression()
mmScaler = MinMaxScaler()
simp_imp = SimpleImputer(strategy="median")

# Step 1: fill missing numeric values using the "median" strategy.
X_train_imp = simp_imp.fit_transform(X_train_numerical)
# Step 2: min-max scale the imputed array, then show the result.
X_train_mm = mmScaler.fit_transform(X_train_imp)
print(X_train_mm)

[[0.         1.         0.27117366 0.125      0.         0.01415106]
 [0.0011236  0.         0.4722292  0.125      0.         0.13913574]
 [0.00224719 1.         0.32143755 0.         0.         0.01546857]
 ...
 [0.99775281 1.         0.34656949 0.125      0.33333333 0.04577135]
 [0.9988764  0.         0.32143755 0.         0.         0.0585561 ]
 [1.         1.         0.39683338 0.         0.         0.01512699]]


In [10]:
from sklearn.pipeline import Pipeline

# Pipeline steps are (name, estimator) pairs.
# Every step except the last must have a transform method.
num_pipe_model = Pipeline([
                    ("simp_imp",SimpleImputer(strategy="median")),
                    ("mm_scaler", MinMaxScaler()),
                    ("log_reg", LogisticRegression())  # last estimator: only fit is called on it
])

# The pipeline exposes the methods of its last estimator (fit, predict, ...);
# a pipeline whose last step is a transformer also offers transform / fit_transform.
# Calling .fit() on the pipeline runs fit_transform() on all steps except the last,
# where it just runs fit().
num_pipe_model.fit(X_train_numerical, y_train)
num_pipe_model.predict(X_test_numerical)

# LogisticRegression has no transform(), so this pipeline has none either.
# The failure is the point of the demo: catch and show it so that the notebook
# still runs top to bottom instead of stopping on the exception.
try:
    num_pipe_model.transform(X_test_numerical)
except AttributeError as err:
    print("AttributeError:", err)

AttributeError: 'LogisticRegression' object has no attribute 'transform'

In [14]:
# Preprocessing-only pipeline: both steps are transformers, so the pipeline
# itself can be used with transform / fit_transform.
num_pipe_transform = Pipeline(steps=[
    ("simp_imp", SimpleImputer(strategy="median")),
    ("mm_scaler", MinMaxScaler()),
])

# Alternatives that operate on the training data:
# num_pipe_transform.transform(X_train_numerical)
# num_pipe_transform.fit_transform(X_train_numerical)

# Fit on the training features, then apply the fitted steps to the test features.
num_pipe_transform.fit(X_train_numerical).transform(X_test_numerical)


array([[1.0011236 , 1.        , 0.4282483 , 0.        , 0.        ,
        0.01528158],
       [1.00224719, 1.        , 0.58532295, 0.125     , 0.        ,
        0.01366309],
       [1.00337079, 0.5       , 0.77381252, 0.        , 0.        ,
        0.01890874],
       ...,
       [1.46741573, 1.        , 0.47851219, 0.        , 0.        ,
        0.01415106],
       [1.46853933, 1.        , 0.34656949, 0.        , 0.        ,
        0.01571255],
       [1.46966292, 1.        , 0.34656949, 0.125     , 0.16666667,
        0.0436405 ]])

In [None]:
#data prep
def getCabinClass(cabin):
    """Return the first character of a cabin value, or 0 when the value is null.

    cabin: a single cabin entry; anything pd.isnull() reports as missing maps
    to the integer 0, every other value is indexed with [0].
    """
    return 0 if pd.isnull(cabin) else cabin[0]
# Derive the CabinClass column from Cabin and list the distinct values it takes.
cabin_classes = X_train["Cabin"].apply(getCabinClass)
X_train["CabinClass"] = cabin_classes
print("CabinClass:", cabin_classes.unique())

In [None]:
#Ordinal encoding
# Explicit category order; 0 is the placeholder getCabinClass returns for a missing cabin.
categories = [0,"T","G","F","E","D","C","B","A"]
from sklearn.preprocessing import OrdinalEncoder
# `categories` must be passed by keyword (positional use raises TypeError in
# current scikit-learn) and needs one list of categories per feature.
ord_enc = OrdinalEncoder(categories=[categories])
# fit_transform returns an (n_samples, 1) array; flatten it before storing it as a column.
X_train["CabinClassEnc"] = ord_enc.fit_transform(X_train["CabinClass"].values.reshape(-1,1)).ravel()
# The raw and intermediate cabin columns are no longer needed once encoded.
del X_train["Cabin"]
del X_train["CabinClass"]
X_train.head()

In [None]:
# ord_enc cannot simply be reused here: its categories were defined at initialization.
# ord_enc.fit_transform(X_train["Embarked"].values.reshape(-1,1))
# ord_enc.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))

# When categories aren't given, the order isn't known and is determined through
# comparisons, so filling the gaps with the integer 0 fails:
# ord_enc_gen.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))#int vs string comparison error

# Fill missing values with the string "0" instead, then fit an encoder with inferred categories.
embarked_column = X_train["Embarked"].fillna("0").values.reshape(-1,1)
ord_enc_gen = OrdinalEncoder()
ord_enc_gen.fit_transform(embarked_column)
ord_enc_gen.categories_

In [None]:
#One hot encoding
from sklearn.preprocessing import OneHotEncoder
one_hot_cols = ["Sex","Embarked"]
one_hot_enc = OneHotEncoder()  # returns a sparse matrix by default
# Missing values are replaced by the string "0", which becomes its own category.
transformed = one_hot_enc.fit_transform(X_train[one_hot_cols].fillna("0"))
print(transformed)
print(one_hot_enc.categories_)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back for older releases. Passing the input column names
# yields the same "<column>_<category>" names on either path.
if hasattr(one_hot_enc, "get_feature_names_out"):
    one_hot_names = one_hot_enc.get_feature_names_out(one_hot_cols)
else:
    one_hot_names = one_hot_enc.get_feature_names(one_hot_cols)
# Build the frame on X_train's index so that concat lines the rows up one-to-one.
oneHotDF = pd.DataFrame(transformed.toarray(), columns=one_hot_names, index=X_train.index)
print(oneHotDF)
X_train = pd.concat([X_train,oneHotDF],axis=1)
# The original categorical columns are replaced by their one-hot columns.
del X_train["Sex"]
del X_train["Embarked"]
X_train.head()

In [None]:
# one_hot_enc.transform(X_train[["Embarked","Sex"]].fillna("0"))
# one_hot_enc.transform(X_train[["Sex","Embarked"]].fillna("1"))

In [None]:
# Drop the Name and Ticket columns, which were not encoded in the steps above.
X_train = X_train.drop(columns=["Name","Ticket"])
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
X_train.info()
X_train.head()

In [None]:
from sklearn.impute import SimpleImputer
# Learn the per-column medians from the prepared frame, then fill the remaining
# missing values; the imputed result replaces X_train.
sim_imp = SimpleImputer(strategy="median").fit(X_train)
X_train = sim_imp.transform(X_train)

In [None]:
# Fit a logistic-regression classifier on the prepared training data.
log_reg = LogisticRegression().fit(X_train, y_train)
log_reg  # show the fitted estimator as the cell output
# log_reg.score(X_train,y_train)#We need to transform our test dataset too

In [None]:
#embeddings