In [146]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier, callback
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# ETL

In [120]:
# Load the training data from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [121]:
# NOTE(review): despite its name, this holds the PassengerId column of the *training*
# frame `df`; the same name is reassigned from test.csv in the test-set section below.
test_passenger_id = df.loc[:, 'PassengerId']

In [122]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [123]:
# Missing values per column before any imputation.
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [124]:
# Summary statistics of Age (count excludes the missing entries).
df.loc[:, 'Age'].describe()

Unnamed: 0,Age
count,714.0
mean,29.699118
std,14.526497
min,0.42
25%,20.125
50%,28.0
75%,38.0
max,80.0


In [125]:
# Median age for every (Sex, Pclass) combination; used to impute missing ages.
median_ages = df.groupby(['Sex', 'Pclass'])['Age'].median()


def fill_age(row):
    """Return the row's age, or its (Sex, Pclass) group median when the age is missing."""
    age = row['Age']
    if not pd.isna(age):
        return age
    return median_ages.loc[row['Sex'], row['Pclass']]


# Row-wise pass: each passenger keeps a known age or receives the group median.
df['Age'] = df.apply(fill_age, axis='columns')

In [126]:
# Preview the frame after the Age imputation.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [127]:
# Missing values per column after the Age imputation.
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [128]:
# Age bands. The top edge is open-ended and the lowest edge is inclusive, so an age
# above 80 or exactly 0 still receives a label instead of silently becoming NaN.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[0, 12, 18, 35, 50, np.inf],
    labels=['Child', 'Teen', 'Young Adult', 'Adult', 'Senior'],
    include_lowest=True,
)

# Title: the first run of letters in Name that follows a space and ends with a period.
# Uncommon titles collapse into 'Rare'; 'Mlle'/'Ms' fold into 'Miss' and 'Mme' into 'Mrs'.
# Titles not listed here are kept unchanged.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {**dict.fromkeys(rare_titles, 'Rare'), 'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
df['Title'] = (
    df['Name']
    .str.extract(r' ([A-Za-z]+)\.', expand=False)
    .replace(title_aliases)
)

# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# 1 when FamilySize is exactly 1, otherwise 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Interaction feature joining sex and passenger class, e.g. 'male_3'.
df['Sex_Pclass'] = df['Sex'] + '_' + df['Pclass'].astype(str)

In [129]:
# Preview the frame with the engineered columns appended.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,Title,FamilySize,IsAlone,Sex_Pclass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Young Adult,Mr,2,0,male_3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult,Mrs,2,0,female_1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Young Adult,Miss,1,1,female_3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Young Adult,Mrs,2,0,female_1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Young Adult,Mr,1,1,male_3


In [130]:
# Embarked counts per (AgeGroup, Sex). AgeGroup is categorical (it comes from pd.cut),
# so `observed` is passed explicitly: this keeps the current output and avoids the
# FutureWarning pandas emits when the default is relied upon.
df.groupby(['AgeGroup', 'Sex'], observed=False)['Embarked'].value_counts()

  df.groupby(['AgeGroup', 'Sex'])['Embarked'].value_counts()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
AgeGroup,Sex,Embarked,Unnamed: 3_level_1
Child,female,S,25
Child,female,C,7
Child,female,Q,0
Child,male,S,29
Child,male,C,4
Child,male,Q,4
Teen,female,S,19
Teen,female,C,13
Teen,female,Q,4
Teen,male,S,30


In [131]:
# Fill missing Embarked values with the most frequent port. The result is assigned back
# to the column: calling fillna(..., inplace=True) on `df['Embarked']` operates on an
# intermediate object and, per the pandas warning, stops updating `df` in pandas 3.0.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [132]:
# Missing values per column after filling Embarked.
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [133]:
def _deck_letter(cabin):
    """Return the first character of the cabin value, or 'u' when the cabin is missing."""
    if pd.isna(cabin):
        return 'u'
    return str(cabin)[0]


# Derive Deck from Cabin, then discard the original Cabin column.
df['Deck'] = df['Cabin'].map(_deck_letter)
df.drop(columns=['Cabin'], inplace=True)

In [134]:
# Missing values per column after Cabin was replaced by Deck.
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [135]:
# Target: the Survived column.
y = df['Survived']

# Features: everything except the target and the identifier / free-text columns.
X = df.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket'])

In [136]:
# Columns routed to the categorical branch of the preprocessor.
# NOTE(review): Pclass is not listed on its own in either list; it only appears
# combined with Sex in 'Sex_Pclass' — confirm that is intended.
cat_cols = [
    'Sex',
    'Embarked',
    'Deck',
    'AgeGroup',
    'Title',
    'Sex_Pclass',
]

# Columns routed to the numeric branch of the preprocessor.
num_cols = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'FamilySize',
    'IsAlone',
]

In [140]:
# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# Categories not seen during fit are ignored at transform time.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column list to its branch.
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_steps, cat_cols),
    ('num', numeric_steps, num_cols),
])

# Criando o Modelo

In [141]:
# Base classifier settings. The first five are also swept by the grid search defined
# later in the notebook; random_state and eval_metric are not part of that grid.
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.03,
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'eval_metric': 'logloss',
}
xgb_model = XGBClassifier(**xgb_params)

In [142]:
# Preprocessing followed by the classifier. The step name 'model' is what the
# 'model__...' keys of the parameter grid refer to.
pipeline = Pipeline(steps=[('preprocess', preprocessor), ('model', xgb_model)])

In [143]:
# Hold out 20% of the rows for validation, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [144]:
# Search space for the pipeline's 'model' step: three values for each of five
# parameters, i.e. 3**5 = 243 combinations.
param_grid = dict(
    model__n_estimators=[100, 300, 500],
    model__learning_rate=[0.01, 0.03, 0.1],
    model__max_depth=[3, 4, 5],
    model__subsample=[0.7, 0.8, 0.9],
    model__colsample_bytree=[0.7, 0.8, 0.9],
)

In [147]:
# Exhaustive search over param_grid: 5-fold cross-validation scored by accuracy,
# using all available cores and printing a one-line progress summary.
search_options = dict(cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, **search_options)

In [148]:
# Run the search on the training split only; the validation split stays unseen.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [149]:
# Predict on the held-out validation split with the fitted search object.
y_pred = grid_search.predict(X_val)

# Overall accuracy plus the per-class precision / recall / F1 table.
acc = accuracy_score(y_true=y_val, y_pred=y_pred)
report = classification_report(y_true=y_val, y_pred=y_pred)

print(f'Acurácia no conjunto de validação: {acc:.4f}')
print("Relatório de Classificação:\n", report)

Acurácia no conjunto de validação: 0.8045
Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.80      0.90      0.85       110
           1       0.80      0.65      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



# BASE DE TESTE

In [152]:
test_df = pd.read_csv('test.csv')
test_passenger_id = test_df['PassengerId']

# Fallback for a (Sex, Pclass) pair that has no entry in the training medians.
# Computed once here instead of on every failed lookup.
overall_median_age = df['Age'].median()


def fill_age_test(row):
    """Return the row's age, imputing a missing one from the training-set statistics.

    A known age is returned unchanged. A missing age is replaced by the training
    median for the row's (Sex, Pclass) pair; if that pair is not present in
    `median_ages`, the overall median of the training Age column is used.
    """
    if not pd.isna(row['Age']):
        return row['Age']
    # Only the group lookup is guarded, so an unrelated KeyError (for example a
    # missing 'Age' column) is no longer swallowed.
    try:
        return median_ages.loc[row['Sex'], row['Pclass']]
    except KeyError:
        return overall_median_age


test_df['Age'] = test_df.apply(fill_age_test, axis=1)

# Same age bands as the training features. The top edge is open-ended and the lowest
# edge is inclusive so an age above 80 or exactly 0 is labelled rather than left NaN.
test_df['AgeGroup'] = pd.cut(
    test_df['Age'],
    bins=[0, 12, 18, 35, 50, np.inf],
    labels=['Child', 'Teen', 'Young Adult', 'Adult', 'Senior'],
    include_lowest=True,
)

# Title: the first run of letters in Name that follows a space and ends with a period.
# Uncommon titles collapse into 'Rare'; 'Mlle'/'Ms' fold into 'Miss' and 'Mme' into 'Mrs'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {**dict.fromkeys(rare_titles, 'Rare'), 'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
test_df['Title'] = (
    test_df['Name']
    .str.extract(r' ([A-Za-z]+)\.', expand=False)
    .replace(title_aliases)
)

# FamilySize = SibSp + Parch + 1; IsAlone is 1 when FamilySize is exactly 1.
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['IsAlone'] = (test_df['FamilySize'] == 1).astype(int)

# Interaction feature joining sex and passenger class.
test_df['Sex_Pclass'] = test_df['Sex'] + '_' + test_df['Pclass'].astype(str)

# Missing ports are filled with the most frequent port of the training frame.
test_df['Embarked'] = test_df['Embarked'].fillna(df['Embarked'].mode()[0])

# Deck is the first character of Cabin, or 'u' when Cabin is missing; Cabin is then dropped.
test_df['Deck'] = test_df['Cabin'].apply(lambda x: str(x)[0] if pd.notna(x) else 'u')
test_df = test_df.drop(columns='Cabin')

# Keep exactly the columns the preprocessor was configured with.
X_test = test_df[cat_cols + num_cols]


In [153]:
# Predict on the prepared test features with the best pipeline found by the search.
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

# Two-column frame: the test passenger ids next to their predicted label.
submission_columns = {
    'PassengerId': test_df['PassengerId'],
    'Survived': y_test_pred,
}
submission = pd.DataFrame(data=submission_columns)

# Write without the index column and confirm on stdout.
submission.to_csv('submission.csv', index=False)
print('Arquivo submission.csv criado com sucesso!')

Arquivo submission.csv criado com sucesso!
