In [512]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from the libraries
# used in this notebook — consider limiting it to specific warning categories.
warnings.filterwarnings('ignore')

In [513]:
# Directory that holds train.csv and test.csv; edit this single constant to run the
# notebook from another location.
DATA_DIR = '/content'

# Load the training and test tables from that directory.
dados_treino = pd.read_csv(f'{DATA_DIR}/train.csv')
dados_teste = pd.read_csv(f'{DATA_DIR}/test.csv')

In [514]:
# Preview the first rows to check the available columns and their value formats.
dados_treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [515]:
# (rows, columns) of the training table.
dados_treino.shape

(891, 12)

In [516]:
# Column dtypes and non-null counts, used to spot the columns with missing values.
dados_treino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [517]:
# Age has 177 missing values (891 - 714), Cabin has 687 (891 - 204) and Embarked has 2 (891 - 889).

In [518]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
dados_treino.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [519]:
# Number of rows for each value of the target column.
dados_treino['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [520]:
# 549 passengers did not survive (Survived = 0) and 342 survived (Survived = 1).

In [521]:
# Mean fare per passenger class. 'Fare' is selected before aggregating so the mean is
# computed for that one numeric column only, instead of averaging every column first
# (which raises on text columns such as Name or Sex in recent pandas releases).
dados_treino.groupby('Pclass')['Fare'].mean()

Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64

In [522]:
# Number of rows in each passenger class.
dados_treino['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [523]:
# Distribution of the SibSp column.
dados_treino['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [524]:
# Distribution of the Parch column.
dados_treino['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [525]:
# The "Sex" column holds text, so it has to be encoded as a categorical variable before modelling.

In [526]:
# Keep the target on its own, then rebind the feature table without the target and
# without the PassengerId column.
y = dados_treino['Survived']
dados_treino = dados_treino.drop(columns=['Survived', 'PassengerId'])

In [527]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [528]:
# Hold out 20% of the labelled rows for validation. The fixed seed makes the shuffle,
# and therefore every score computed from this split, identical on each run.
X_train, X_valid, y_train, y_valid = train_test_split(
    dados_treino, y, train_size=0.8, random_state=0
)

In [529]:
# Numeric predictors: every column stored as int64 or float64.
colunas_numericas = [
    nome for nome, tipo in X_train.dtypes.items()
    if tipo in ('int64', 'float64')
]

# Categorical predictors: text (object) columns with fewer than 10 distinct values
# in the training split; text columns with more distinct values are left out.
colunas_categoricas = [
    nome for nome, tipo in X_train.dtypes.items()
    if tipo == 'object' and X_train[nome].nunique() < 10
]

# Restrict the three feature tables to the same selected columns, as independent copies.
minhas_colunas = colunas_numericas + colunas_categoricas
X_train = X_train[minhas_colunas].copy()
X_valid = X_valid[minhas_colunas].copy()
X_test = dados_teste[minhas_colunas].copy()

In [530]:
# (rows, columns) of the training features after column selection.
X_train.shape

(712, 7)

In [531]:
# Length of the training target; it must match the number of rows in X_train.
y_train.shape

(712,)

In [532]:
# Missing values per selected column in the training split.
X_train.isnull().sum()

Pclass        0
Age         137
SibSp         0
Parch         0
Fare          0
Sex           0
Embarked      2
dtype: int64

In [533]:
# Numeric columns: fill missing values with the column median.
imputandoIdade = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform from failing on a category
# that was not present when the encoder was fitted.
# The encoder's `sparse=False` argument is deliberately not used: scikit-learn 1.2
# renamed it to `sparse_output` and 1.4 removed the old name, so either spelling fails
# on some installed version. Dense output is requested on the ColumnTransformer instead.
dados_categoricos = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its own preprocessing. sparse_threshold=0 makes the
# combined output always a dense array, whatever the encoder returns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', imputandoIdade, colunas_numericas),
        ('cat', dados_categoricos, colunas_categoricas)
    ],
    sparse_threshold=0
)

In [534]:
# Define the model: a random forest with 200 trees and a fixed seed.
# NOTE(review): the target Survived only takes the values 0 and 1, yet this is a
# regressor, so predictions are fractional scores rather than class labels — confirm
# that this is intended (RandomForestClassifier would be the usual choice).
modelo = RandomForestRegressor(n_estimators=200, random_state=0)

In [535]:
# Chain the preprocessing and the estimator so both are fitted and applied as one object.
falecidos = Pipeline([
    ('preprocessor', preprocessor),
    ('model', modelo),
])

In [536]:
# Fit the preprocessing steps and the model on the training split.
falecidos.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='median'),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Sex', 'Embarked'])])),
                ('model',
                 RandomForestRegressor(n_estimators=

In [571]:
# Predict on the validation split and wrap the result in a DataFrame for display.
predicao = pd.DataFrame(falecidos.predict(X_valid))
predicao

Unnamed: 0,0
0,0.375000
1,0.166750
2,1.000000
3,0.070000
4,0.350000
...,...
174,0.180000
175,0.620000
176,0.214167
177,1.000000


In [538]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the held-out targets and the pipeline's predictions.
mean_absolute_error(y_true=y_valid, y_pred=predicao)

0.21562353250411911

In [539]:
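# Validation MAE recorded for different forest sizes:
#   n_estimators = 100 -> 0.2525
#   n_estimators = 300 -> 0.2535
#   n_estimators = 200 -> 0.2544
# NOTE(review): the 0.2156 shown above was also obtained with n_estimators = 200, so these
# figures presumably come from a different train/validation split and are not directly comparable.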

In [549]:
# 5-fold cross-validation of the whole pipeline on all labelled rows.
from sklearn.model_selection import cross_val_score

# The scorer returns the negated MAE of each fold, so the sign is flipped to get
# positive errors.
scores = -cross_val_score(
    falecidos,
    X=dados_treino,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

In [550]:
# MAE of each cross-validation fold.
scores

array([0.28531706, 0.2428384 , 0.21758645, 0.25007758, 0.21455928])

In [551]:
# Average MAE across the folds.
scores.mean()

0.2420757553480392

In [572]:
# Predict for the test features and wrap the result in a DataFrame for display.
comteste = pd.DataFrame(falecidos.predict(X_test))
comteste

Unnamed: 0,0
0,0.165000
1,0.090000
2,0.480000
3,0.665000
4,0.670000
...,...
413,0.000000
414,1.000000
415,0.020083
416,0.000000


In [577]:
# Frequency of each distinct predicted value on the test features.
comteste.value_counts()

1.000000    36
0.000000    26
0.126322     9
0.990000     8
0.045000     7
            ..
0.210000     1
0.212500     1
0.225000     1
0.230000     1
0.276667     1
Length: 240, dtype: int64

In [553]:
# Display the test features that were passed to the pipeline.
X_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,3,34.5,0,0,7.8292,male,Q
1,3,47.0,1,0,7.0000,female,S
2,2,62.0,0,0,9.6875,male,Q
3,3,27.0,0,0,8.6625,male,S
4,3,22.0,1,1,12.2875,female,S
...,...,...,...,...,...,...,...
413,3,,0,0,8.0500,male,S
414,1,39.0,0,0,108.9000,female,C
415,3,38.5,0,0,7.2500,male,S
416,3,,0,0,8.0500,male,S


In [574]:
# Cross-validate the pipeline on the test features, using `comteste` as the target.
# NOTE(review): `comteste` holds this same pipeline's predictions for X_test, not observed
# outcomes, so these scores show how closely a refit model reproduces its own predictions;
# they are not an estimate of prediction error. Verify that this is the intended check —
# the cross-validation on the labelled training data is the figure to report.
dentro = -1 * cross_val_score(falecidos, X_test, comteste, cv=5, scoring='neg_mean_absolute_error')
print(dentro)

[0.10445812 0.09637461 0.09825005 0.0929145  0.08865278]


In [575]:
# Average of the per-fold scores computed against the model's own test predictions.
dentro.mean()

0.09613001131753249