In [29]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [30]:
# importing our data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [59]:
train.sample(2)

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,children,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
5,6,0,"Moran, Mr. James",29.699118,0,0,330877,8.4583,,Q,0,0,1,0,0,1
866,867,1,"Duran y More, Miss. Asuncion",27.0,1,0,SC/PARIS 2149,13.8583,,C,0,1,0,0,1,0


<li> PassengerId - Numeric data, it is unique for each passenger
<li> Survived - Target data to be predicted, with incators 0 (Passenger Died) or 1 (Passenger Survived).
<li> Pclass - Numeric data, has 3 categories (1st, 2nd or 3rd)
<li> Name - String data type, mostly unique for each passenger
<li> Sex - Categorical data, has 2 categories (male, female)
<li> Age - Numeric data, max age is 80, it contains few nulls (can be imputed), useful.
<li> SibSp - Numeric data, max number of siblings / spouses aboard is 8, useful.
<li> Parch - Numeric data, max number of parents / children aboard is 6, useful.
<li> Ticket - Categorical data, it has a prefix to the number, it can be useful.
<li> Fare - Numeric data, higher the class, higher is the fare (need to check), 
        max fare is 512,32 (currency is $ I guess) , useful.
<li> Cabin - Categorical data, it contains few nulls (can be imputed), it can be useful.
<li> Embarked - Categorical data, with 3 categories, C = Cherbourg, Q = Queenstown, S = Southampton, 
        it contains few nulls (can be imputed), useful.

<h3>  Coisas para tomar cuidado na análise:</h3>
<li>1. valores faltantes
<li>2. valores muito grandes ou muito pequenos - outliers
<li>3. linhas duplicadas

Missining Values

In [61]:
#how many nulls are there?
train.isnull().sum()

PassengerId      0
Survived         0
Name             0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
children         0
Sex_female       0
Sex_male         0
Pclass_1         0
Pclass_2         0
Pclass_3         0
dtype: int64

In [33]:
#getting all nan for ages
train[train['Age'].isnull()].sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
793,794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C
140,141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C


In [34]:
'''
algumas estratégias:
a. usar a média
b. usar a mediana
c. um modelo para predizer a idade
d. utlizar o nome Mrs ou Mr. = casada = Não é criança
'''

train.Age.fillna(train.Age.mean(), inplace = True )
test.Age.fillna(train.Age.mean(), inplace = True )

In [35]:
# train['old'] = train.Name.apply(lambda x: 1 if any(j in x for j in ['Mr','Mrs']) else 0 )
# print(train[['old', 'Name']].sample(5))
# train.drop(columns=['old'], inplace= True)

Outliers

In [36]:
# três desvio padrão, para um curva normal, pode ser outlier.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Duplicates

In [37]:
#do we have any duplicates?
train.Name.nunique() == len(train.Name)
#nops =D

True

In [38]:
train.PassengerId.nunique() == len(train.PassengerId)

True

<h1> Feature Engeneering

In [39]:
train['children'] = train.Age.apply(lambda x: 1 if x < 15 else 0)
test['children'] = test.Age.apply(lambda x: 1 if x < 15 else 0)

In [40]:
train.sample(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,children
375,376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,29.699118,1,0,PC 17604,82.1708,,C,0
673,674,1,2,"Wilhelms, Mr. Charles",male,31.0,0,0,244270,13.0,,S,0
561,562,0,3,"Sivic, Mr. Husein",male,40.0,0,0,349251,7.8958,,S,0
29,30,0,3,"Todoroff, Mr. Lalio",male,29.699118,0,0,349216,7.8958,,S,0


In [41]:
# outras opções: clustering -> knn, gmm, etc

Dummies

In [42]:
train = pd.get_dummies(train, columns=['Sex', 'Pclass'])
test = pd.get_dummies(test, columns=['Sex', 'Pclass'])

In [43]:
train.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'children', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_2', 'Pclass_3'],
      dtype='object')

In [44]:
#finalmente

In [45]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [46]:
y_columns = ['Survived']

x_columns = ['Age', 'SibSp', 'Parch', 'Fare', 'children',
             'Sex_female', 'Sex_male',
             'Pclass_1', 'Pclass_2', 'Pclass_3']

x_columns = ['Age', 'SibSp', 'Parch', 'Fare', 'children',
             'Sex_female',
             'Pclass_1', 'Pclass_2']


x_train,x_val, y_train, y_val = train_test_split(train[x_columns], 
                                                 train[y_columns],
                                                 test_size=0.2,
                                                 random_state=42)

In [78]:
model = RandomForestClassifier(n_estimators=100,
                                 max_depth=5, 
                                 random_state=42,
                                 oob_score = True,
                                criterion= 'entropy'
                              )
clf = model.fit(x_train,y_train)

  import sys


In [75]:
clf.score(x_val, y_val)

0.7988826815642458

In [57]:
cross_val_score(estimator=clf, 
                X=(x_val),
                y= y_val.values.ravel(), 
                cv=3,
                n_jobs=-1,
                verbose = 1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


array([0.8       , 0.86666667, 0.76271186])

In [53]:
print(pd.crosstab(np.array(y_val).reshape(1,-1)[0], 
                  clf.predict(x_val) ,
                  rownames=['Actual'], 
                  colnames=['Predicted'],
                  margins=True))

Predicted    0   1  All
Actual                 
0           92  13  105
1           23  51   74
All        115  64  179


In [52]:
from sklearn.feature_selection import RFECV

selector = RFECV(clf, step=1, cv=5, n_jobs=-1)
selector = selector.fit(x_train, y_train)
print(pd.DataFrame(selector.ranking_, index = x_columns, columns=['importance']).sort_values('importance', ascending=True))


  y = column_or_1d(y, warn=True)


            importance
Age                  1
SibSp                1
Parch                1
Fare                 1
children             1
Sex_female           1
Pclass_1             1
Pclass_2             1


In [63]:
from sklearn.model_selection import GridSearchCV
a  = GridSearchCV(clf, {'n_estimators':[100,200,300]}, cv = 4)
# a.fit(x_train, np.array(y_train).reshape(1,712)[0])
# a.fit(x_train, np.array(y_train).reshape(1,-1)[0])
a.fit(x_train, np.array(np.ravel(y_train)).reshape(1,-1)[0])