# Novo processamento de dados

Tratamento de dados baseado [neste notebook](https://www.kaggle.com/imoore/titanic-the-only-notebook-you-need-to-see)

In [50]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
from IPython.display import display
from xgboost import XGBClassifier
from mlxtend.classifier import EnsembleVoteClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [35]:
train = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/original/test.csv')

PassengerId = test['PassengerId']
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [36]:
train['Ticket_type'] = train['Ticket'].apply(lambda x: x[0:3])
train['Ticket_type'] = train['Ticket_type'].astype('category')
train['Ticket_type'] = train['Ticket_type'].cat.codes

test['Ticket_type'] = test['Ticket'].apply(lambda x: x[0:3])
test['Ticket_type'] = test['Ticket_type'].astype('category')
test['Ticket_type'] = test['Ticket_type'].cat.codes

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_type
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,124
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,137
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,148


In [37]:
full_data = [train, test]

# Some extra features, not necessarily important
# Gives the length of the name
# train['Name_length'] = train['Name'].apply(len)
# test['Name_length'] = test['Name'].apply(len)
train['Words_Count'] = train['Name'].apply(lambda x: len(x.split()))
test['Words_Count'] = test['Name'].apply(lambda x: len(x.split()))

# Feature that tells whether a passenger had a cabin on the Titanic
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

# Feature engineering steps taken from Sina
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Remove all NULLS in the Fare column and create a new feature CategoricalFare
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# Create a New feature CategoricalAge
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Create a new feature Title, containing the titles of passenger names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4 ;

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list


In [38]:
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
test  = test.drop(drop_elements, axis = 1)

In [39]:
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Ticket_type,Words_Count,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,0,0,0,124,4,0,2,0,1
1,1,1,0,2,0,3,1,137,7,1,2,0,3
2,1,3,0,1,0,1,0,148,3,0,1,1,2


In [40]:
test.head(3)

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Ticket_type,Words_Count,Has_Cabin,FamilySize,IsAlone,Title
0,3,1,2,0,0,2,58,3,0,1,1,1
1,3,0,2,0,0,0,71,5,0,2,0,3
2,2,1,3,0,1,2,32,4,0,1,1,1


In [41]:
train.to_csv('../../data/processed_v3/train.csv', index=False)
test.to_csv('../../data/processed_v3/test.csv', index=False)

Pequeno teste com o best_model

In [42]:
X = train.drop(columns=['Survived'])
y = train['Survived']

treino_x, teste_x, treino_y, teste_y = train_test_split(X, y, test_size = 0.2, stratify = y)
print("Treinaremos com %d elementos e testaremos com %d elementos" % (len(treino_x), len(teste_x)))

Treinaremos com 712 elementos e testaremos com 179 elementos


In [43]:
best_params = {
    'criterion': 'gini',
    'max_depth': 35,
    'min_samples_split': 4,
    'min_samples_leaf': 4,
    'n_estimators': 25,
    'random_state': 5
}

modelo = RandomForestClassifier(**best_params)
modelo.fit(treino_x, treino_y)

predictions = modelo.predict(teste_x)
print({
    'accuracy_score': accuracy_score(teste_y, predictions),
    'recall_score': recall_score(teste_y, predictions),
    'balanced_accuracy_score': balanced_accuracy_score(teste_y, predictions),
    'precision_score': precision_score(teste_y, predictions),
    'f1_score': f1_score(teste_y, predictions)
})

{'accuracy_score': 0.8379888268156425, 'recall_score': 0.6811594202898551, 'balanced_accuracy_score': 0.8087615283267457, 'precision_score': 0.8703703703703703, 'f1_score': 0.7642276422764227}


In [45]:
SEED = 5
np.random.seed(SEED)

In [53]:
def tune_model_params(model, params):
    gridsearch = GridSearchCV(model, params, scoring='accuracy', n_jobs=-1, cv=10)
    gridsearch.fit(X, y)
    best_params, best_score = gridsearch.best_params_, round(gridsearch.best_score_*100, 2)
    return best_params, best_score

In [54]:
lr = LogisticRegression(solver='liblinear')
svc = SVC(gamma = "auto")
rf = RandomForestClassifier(random_state = SEED, n_estimators = 100)
knn = KNeighborsClassifier()
gnb = GaussianNB()
dt = DecisionTreeClassifier(random_state = SEED)
gbc = GradientBoostingClassifier(random_state = SEED)
abc = AdaBoostClassifier(random_state = SEED)
etc = ExtraTreesClassifier(random_state = SEED)
xgbc = XGBClassifier(random_state = SEED, use_label_encoder=False)

In [56]:
lrParams = {"penalty":["l1", "l2"],
            "C": np.logspace(0, 4, 10),
            "max_iter":[5000]}

gbcParams = {"learning_rate": [0.01, 0.02, 0.05, 0.01],
              "max_depth": [4, 6, 8],
              "max_features": [1.0, 0.3, 0.1], 
              "min_samples_split": [ 2, 3, 4],
              "random_state":[SEED]}

svcParams = {"C": np.arange(6,13), 
              "kernel": ["linear","rbf"],
              "gamma": [0.5, 0.2, 0.1, 0.001, 0.0001]}

dtParams = {"max_features": ["auto", "sqrt", "log2"],
             "min_samples_split": np.arange(2,16), 
             "min_samples_leaf":np.arange(1,12),
             "random_state":[SEED]}

rfParams = {"criterion":["gini","entropy"],
             "n_estimators":[10, 15, 20, 25, 30],
             "min_samples_leaf":[1, 2, 3],
             "min_samples_split":np.arange(3,8), 
             "max_features":["sqrt", "auto", "log2"],
             "random_state":[SEED]}

knnParams = {"n_neighbors":np.arange(3,9),
              "leaf_size":[1, 2, 3, 5],
              "weights":["uniform", "distance"],
              "algorithm":["auto", "ball_tree","kd_tree","brute"]}

abcParams = {"n_estimators":[1, 5, 10, 15, 20, 25, 40, 50, 60, 80, 100, 130, 160, 200, 250, 300],
              "learning_rate":[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5],
              "random_state":[SEED]}

etcParams = {"max_depth":[None],
              "max_features":[1, 3, 10],
              "min_samples_split":[2, 3, 10],
              "min_samples_leaf":[1, 3, 10],
              "bootstrap":[False],
              "n_estimators":[100, 300],
              "criterion":["gini"],
              "random_state":[SEED]}

models_totune = [lr, svc, rf, knn, dt, gbc, abc, etc]
models_totune_str = ['lr', 'svc', 'rf', 'knn', 'dt', 'gbc', 'abc', 'etc']
models_totune_params = [lrParams, svcParams, rfParams, knnParams, dtParams, gbcParams, abcParams, etcParams]
best_params_and_scores = list(map(tune_model_params, models_totune, models_totune_params))

In [57]:
best_params_and_scores_df = pd.DataFrame(best_params_and_scores, columns=['best_params', 'best_score'], index=models_totune_str).sort_values(by='best_score', ascending=False)
pd.set_option("max_colwidth", None)
best_params_and_scores_df

Unnamed: 0,best_params,best_score
gbc,"{'learning_rate': 0.02, 'max_depth': 8, 'max_features': 0.3, 'min_samples_split': 4, 'random_state': 5}",84.63
rf,"{'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 15, 'random_state': 5}",84.51
abc,"{'learning_rate': 0.3, 'n_estimators': 100, 'random_state': 5}",83.28
dt,"{'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 13, 'random_state': 5}",82.38
etc,"{'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 5}",82.38
lr,"{'C': 7.742636826811269, 'max_iter': 5000, 'penalty': 'l1'}",81.03
svc,"{'C': 6, 'gamma': 0.1, 'kernel': 'rbf'}",80.92
knn,"{'algorithm': 'ball_tree', 'leaf_size': 1, 'n_neighbors': 6, 'weights': 'distance'}",78.46


In [58]:
gbc = GradientBoostingClassifier(**best_params_and_scores_df.loc['gbc', 'best_params'])
rf  = RandomForestClassifier(**best_params_and_scores_df.loc['rf', 'best_params'])
abc = AdaBoostClassifier(**best_params_and_scores_df.loc['abc', 'best_params'])
dt  = DecisionTreeClassifier(**best_params_and_scores_df.loc['dt', 'best_params'])
etc = ExtraTreesClassifier(**best_params_and_scores_df.loc['etc', 'best_params'])
knn = KNeighborsClassifier(**best_params_and_scores_df.loc['knn', 'best_params'])
lr  = LogisticRegression(**best_params_and_scores_df.loc['lr', 'best_params'], solver='liblinear')
svc = SVC(**best_params_and_scores_df.loc['svc', 'best_params'])

In [59]:
def prediction_and_confusion_matrix_scoress(model, X_train, y):
    prediction = cross_val_predict(model, X_train, y, cv=10)
    confusion_matrix = pd.crosstab(y, prediction, rownames=['actual'], colnames=['predicted'], margins=True)
    #correct_pred = (confusion_matrix.loc[0, 0] + confusion_matrix.loc[1, 1])
    classificationReport = pd.DataFrame(classification_report(y, prediction, output_dict=True)).transpose()
    title = str(model).split('(')[0]
    display(title)
    display(confusion_matrix)
    display(classificationReport)

for model in [gbc,rf,abc]:
    display(prediction_and_confusion_matrix_scoress(model, treino_x, treino_y))

'GradientBoostingClassifier'

predicted,0,1,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,397,42,439
1,69,204,273
All,466,246,712


Unnamed: 0,precision,recall,f1-score,support
0,0.851931,0.904328,0.877348,439.0
1,0.829268,0.747253,0.786127,273.0
accuracy,0.844101,0.844101,0.844101,0.844101
macro avg,0.8406,0.82579,0.831738,712.0
weighted avg,0.843242,0.844101,0.842372,712.0


None

'RandomForestClassifier'

predicted,0,1,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,390,49,439
1,74,199,273
All,464,248,712


Unnamed: 0,precision,recall,f1-score,support
0,0.840517,0.888383,0.863787,439.0
1,0.802419,0.728938,0.763916,273.0
accuracy,0.827247,0.827247,0.827247,0.827247
macro avg,0.821468,0.80866,0.813851,712.0
weighted avg,0.825909,0.827247,0.825494,712.0


None

'AdaBoostClassifier'

predicted,0,1,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,378,61,439
1,60,213,273
All,438,274,712


Unnamed: 0,precision,recall,f1-score,support
0,0.863014,0.861048,0.86203,439.0
1,0.777372,0.78022,0.778793,273.0
accuracy,0.830056,0.830056,0.830056,0.830056
macro avg,0.820193,0.820634,0.820412,712.0
weighted avg,0.830176,0.830056,0.830115,712.0


None

In [66]:
def export_titanic_custom_model_submission(modelo, filename):    
    X = train.drop(columns=['Survived'])
    y = train['Survived']

    modelo.fit(X, y)
    predicoes = modelo.predict(test)
    resultado = pd.DataFrame()
    resultado['PassengerId'] = PassengerId
    resultado['Survived'] = predicoes
    resultado.to_csv(f'../../data/submissions/{filename}.csv', index=False)

In [67]:
export_titanic_custom_model_submission(gbc, 'gradient_boost_classifier')

In [70]:
rf.fit(treino_x, treino_y)
predictions = rf.predict(teste_x)
print({
    'accuracy_score': accuracy_score(teste_y, predictions),
    'recall_score': recall_score(teste_y, predictions),
    'balanced_accuracy_score': balanced_accuracy_score(teste_y, predictions),
    'precision_score': precision_score(teste_y, predictions),
    'f1_score': f1_score(teste_y, predictions)
})

{'accuracy_score': 0.8212290502793296, 'recall_score': 0.6811594202898551, 'balanced_accuracy_score': 0.7951251646903821, 'precision_score': 0.8245614035087719, 'f1_score': 0.746031746031746}


Os resultados obtidos com essa transformação dos dados são semelhantes às da transformação feita por mim antes.