In [601]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, KBinsDiscretizer, Binarizer
from sklearn.preprocessing import FunctionTransformer as FT
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier,plot_tree

from sklearn.model_selection import train_test_split

### Import & Split

In [602]:
X = pd.read_csv("/home/jorge/Proyectos/datas/titanic/tables/data_cleaned/train.csv")
Y = X["Survived"]
del X["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X.values, Y.values)

In [603]:
len(y_train)

668

### Model

In [604]:
##### Fill nans age
def fillage(X):
    mean = np.nanmean(X)
    return np.nan_to_num(X, nan=mean)

### Cut Sibps
def cutsip(X):
    hasmap = {
        0: 0,
        1: 1, 
        2: 2,
        3: 2, 
        4: 2,
        5: 2, 
        8: 2 
    }
    DX = pd.DataFrame(X, columns=["X"])
    return DX.apply(lambda x: hasmap[x]).values
# Cabin
## Letter
def get_letters1(x):
    if isinstance(x, float) == True:
        return ("None")
    sx = x.split()
    return sx[0][0]
def get_letters2(x):
    if isinstance(x, float) == True:
        return 0
    else: 
        return 1

def gletter(X):
    return np.vectorize(get_letters2)(X)

## Number
def get_number(x):
    if isinstance(x, float) == True:
        return 500
    sx = x.split()

    if len(sx[0][1:]) == 0:
        Number = 500
    else:
        Number = int(sx[0][1:])
    return Number

def gnumber(X):
    return np.vectorize(get_number)(X)

## Many
def get_many(x):
    if isinstance(x, float) == True:
        return 0
    sx = x.split()
    if len(sx)>1:
        return 1
    else:
        return 0

def gmany(X):
    return np.vectorize(get_many)(X)

In [605]:
X.columns.to_list()

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [606]:
column_transforms_initial_CT = [
    ("sex2binary", OrdinalEncoder(dtype="int"), [3]),
    ("age_inpute", SimpleImputer(strategy="mean"), [4]),
    ("pclass", "passthrough", [1]),
    ("parch", "passthrough", [6]),
    ("letter", FT(gletter), [9]),
    ("number", FT(gnumber), [9]),
    ("many", FT(gmany), [9]),
    ("fare", SimpleImputer(strategy="most_frequent"), [8]),
    ("sibsp", "passthrough", [5]),
    ("embarked1", SimpleImputer(strategy="constant", fill_value="None"), [10]),
    ("embarked2", SimpleImputer(strategy="most_frequent"), [10])
    #("sibsp", "passthrough", ["SibSp"]),
    #("Parch", Binarizer(), ["Parch"]),
    #("sibsp", KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile"), ["SibSp"]),
    #("age_inpute", SimpleImputer(strategy="mean"), ["Age"])
    
]

column_transforms_final_CT = [
    ("sex", "passthrough", [0]),
    ("age", "passthrough", [1]),
    ("pclass", OneHotEncoder(dtype="int"), [2]),
    ("parch", "passthrough", [3]),
    #("letter", OneHotEncoder(dtype="int"), [4]),
    ("letter", "passthrough", [4]),
    ("number", "passthrough", [5]),
    ("many", "passthrough", [6]),
    ("embarked1", OneHotEncoder(dtype="int"), [9])
    #("embarked2", OneHotEncoder(dtype="int"), [9]),
    #("fare", "passthrough", [7])
    #("sibsp", OneHotEncoder(dtype="int"), [8])
]

inital_CT = ColumnTransformer(column_transforms_initial_CT, remainder="drop", verbose=True)
final_CT = ColumnTransformer(column_transforms_final_CT, remainder="drop", verbose=True)

### Classifiers

GB = [
    ("Initial transform", inital_CT), #type of columns and a littel clear
    ("Final transform", final_CT), #One hot encoder all labels
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=250, max_depth=3, learning_rate=0.01))
]

RF = [
    ("Initial transform", inital_CT), #type of columns and a littel clear
    ("Final transform", final_CT), #One hot encoder all labels
    ("Random Forest", RandomForestClassifier(n_estimators=150, max_depth=4, criterion="entropy"))
]

DT = [
    ("Initial transform", inital_CT), #type of columns and a littel clear
    ("Final transform", final_CT), #One hot encoder all labels
    ("Decision Tree", DecisionTreeClassifier(criterion="gini", max_depth=4, splitter="random"))
]

model = Pipeline(RF, verbose=True)

### Train & Test

In [607]:
f = model.fit(X_train, y_train)

[ColumnTransformer] ... (1 of 11) Processing sex2binary, total=   0.0s
[ColumnTransformer] ... (2 of 11) Processing age_inpute, total=   0.0s
[ColumnTransformer] ....... (3 of 11) Processing pclass, total=   0.0s
[ColumnTransformer] ........ (4 of 11) Processing parch, total=   0.0s
[ColumnTransformer] ....... (5 of 11) Processing letter, total=   0.0s
[ColumnTransformer] ....... (6 of 11) Processing number, total=   0.0s
[ColumnTransformer] ......... (7 of 11) Processing many, total=   0.0s
[ColumnTransformer] ......... (8 of 11) Processing fare, total=   0.0s
[ColumnTransformer] ........ (9 of 11) Processing sibsp, total=   0.0s
[ColumnTransformer] ... (10 of 11) Processing embarked1, total=   0.0s
[ColumnTransformer] ... (11 of 11) Processing embarked2, total=   0.0s
[Pipeline] . (step 1 of 3) Processing Initial transform, total=   0.0s
[ColumnTransformer] ........... (1 of 8) Processing sex, total=   0.0s
[ColumnTransformer] ........... (2 of 8) Processing age, total=   0.0s
[Colum

In [608]:
model.score(X_test, y_test)

0.8340807174887892

### DataSet

In [609]:
muestra = model[:-1].fit_transform(X_train)

[ColumnTransformer] ... (1 of 11) Processing sex2binary, total=   0.0s
[ColumnTransformer] ... (2 of 11) Processing age_inpute, total=   0.0s
[ColumnTransformer] ....... (3 of 11) Processing pclass, total=   0.0s
[ColumnTransformer] ........ (4 of 11) Processing parch, total=   0.0s
[ColumnTransformer] ....... (5 of 11) Processing letter, total=   0.0s
[ColumnTransformer] ....... (6 of 11) Processing number, total=   0.0s
[ColumnTransformer] ......... (7 of 11) Processing many, total=   0.0s
[ColumnTransformer] ......... (8 of 11) Processing fare, total=   0.0s
[ColumnTransformer] ........ (9 of 11) Processing sibsp, total=   0.0s
[ColumnTransformer] ... (10 of 11) Processing embarked1, total=   0.0s
[ColumnTransformer] ... (11 of 11) Processing embarked2, total=   0.0s
[ColumnTransformer] ........... (1 of 8) Processing sex, total=   0.0s
[ColumnTransformer] ........... (2 of 8) Processing age, total=   0.0s
[ColumnTransformer] ........ (3 of 8) Processing pclass, total=   0.0s
[Colum

In [616]:
M = pd.DataFrame(muestra, columns=["sex", "age", "pc1", "pc2", "pc3", "parch", "letter", "number", "many", "emb1", "emb2","emb3", "emb4"], dtype="float")
M["Y"] = y_train
M.corr()["Y"]

sex      -0.553478
age      -0.064366
pc1       0.270423
pc2       0.076237
pc3      -0.295241
parch     0.073606
letter    0.291900
number   -0.292292
many      0.062008
emb1      0.179490
emb2      0.071314
emb3      0.018689
emb4     -0.176500
Y         1.000000
Name: Y, dtype: float64

### Eval

In [611]:
model[-1].feature_importances_

array([0.43974895, 0.11095949, 0.05110136, 0.02396972, 0.09513549,
       0.03962441, 0.05639715, 0.10206612, 0.0081217 , 0.02175066,
       0.00178355, 0.01501776, 0.03432366])

### Predict

In [612]:
stop

NameError: name 'stop' is not defined

In [617]:
P = pd.read_csv("/home/jorge/Proyectos/datas/titanic/tables/data_cleaned/test.csv")
P

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [618]:
Predicted = pd.Series(model.predict(P))

Submission = pd.DataFrame({
    "PassengerId": P["PassengerId"],
    "Survived": Predicted
})

Submission.to_csv("/home/jorge/Proyectos/datas/titanic/tables/submission.csv", index=False)