In [1]:
# ---------------------------- IMPORTS -------------------------------:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# --------------------------- FROM ----------------------------------:
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier,VotingClassifier
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict, cross_validate
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

# -------------------------- obs -----------------------------------:
%matplotlib inline

In [2]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
def tratando(data):
    # Tratando Name:
    data['Mrs'] = data.Name.apply(lambda x: 1 if ('Mrs.' in x) else 0)
    data['Miss'] = data.Name.apply(lambda x: 1 if ('Miss.' in x) else 0)
    data['Master'] = data.Name.apply(lambda x: 1 if ('Master.' in x) else 0)
    
    # Tratando Age:
    data['aux0NAN'] = data['Age'].isnull()
    aux = pd.get_dummies(data['aux0NAN'], prefix='isNaN')
    data['NaN_Age'] = aux['isNaN_True']
    data['Child_or_Old'] = data.Age.apply(lambda x: 1 if (x<10 or x>70) else 0)
    data['Adult'] = data.Age.apply(lambda x: 1 if (x>20 or x<50) else 0)
    
    # Tratando Pclass:
    aux = pd.get_dummies(data['Pclass'], prefix='Class', drop_first=True)
    data['Class_3'] = aux['Class_3']
    data['Class_2'] = aux['Class_2']
    
    # Tratando Fare:
    data.Fare = data.Fare.interpolate()

    # Tratando o Cabin com relação ao NaN:
    data['auxNAN'] = data['Cabin'].isnull()
    aux = pd.get_dummies(data['auxNAN'], prefix='isNaN')
    data['isNaN'] = aux['isNaN_True']

    # Tratando o Cabin com relação ao tipo da cabine:
    data['Cabin_B'] = data.Cabin.apply(lambda x: 1 if str(x)[0]=='B' else 0)
    data['Cabin_C'] = data.Cabin.apply(lambda x: 1 if str(x)[0]=='C' else 0)
    data['Cabin_D'] = data.Cabin.apply(lambda x: 1 if str(x)[0]=='D' else 0)

    # Tratando o Sex:
    aux = pd.get_dummies(data['Sex'], prefix='Sex')
    data['Female'] = aux['Sex_female']

    # Tratando o Embarked:
    aux = pd.get_dummies(data['Embarked'], prefix='Embarked')
    data['Embarked_S'] = aux['Embarked_S']
    data['Embarked_C'] = aux['Embarked_C']
    
    # Criando variáveis a partir das anteriores:
    # data['Adult_Class_3'] = data['Adult']*data['Class_3']

    # Dropando colunas não utilizadas:
    # data = data.drop(['Sex', 'Embarked','auxNAN','Cabin','Ticket','Name','Pclass'], axis=1)
    
    X = data[['Class_3',
          'Class_2',
          'SibSp',
          'Parch',
          'Fare',
          'isNaN',
          'Cabin_B',
          'Cabin_C',
          'Cabin_D',
          'Female',
          'Embarked_S',
          'Embarked_C',
          'Child_or_Old',
          'NaN_Age',
          'Adult',
          'Mrs',
          'Miss',
          'Master']]
    
    return X

In [4]:
X = tratando(data)
y = data['Survived']

In [5]:
#Fazendo split dos dados
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001)

In [6]:
#Models
def prediction(X_test):
    
    # model = XGBClassifier(max_depth = 10, learning_rate=0.01, n_estimators=100)
    model1 = RandomForestClassifier()
    model2 = XGBClassifier()
    model3 = BaggingClassifier()
    model4 = LogisticRegression()
    model5 = CatBoostClassifier(verbose=0)
    model6 = LGBMClassifier()
    model7 = SVC()

    #Fit
    model1.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)
    model4.fit(X_train, y_train)
    model5.fit(X_train, y_train)
    model6.fit(X_train, y_train)
    model7.fit(X_train, y_train)

    #Predict
    y_pred1=model1.predict(X_test)
    y_pred2=model2.predict(X_test)
    y_pred3=model3.predict(X_test)
    y_pred4=model4.predict(X_test)
    y_pred5=model5.predict(X_test)
    y_pred6=model6.predict(X_test)
    y_pred7=model7.predict(X_test)
    
    # Mean:
    soma = (y_pred1+y_pred2+y_pred3+y_pred4+y_pred5+y_pred6+y_pred7)/7
    ans = []
    for i in soma:
        if i>0.5:
            ans.append(1)
        else:
            ans.append(0)
    
    return ans

In [7]:
y_pred = prediction(X_test)
y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[1]

In [8]:
cv = 15
metricas = ['accuracy','f1', 'precision','recall','roc_auc']

In [9]:
# Avaliando as métricas de erros e o cross-validation:
# Selecionando o Modelo:
model2 = XGBClassifier()
model2.fit(X_train, y_train)
model = model2

averages = {}
metricas = metricas
ans = cross_validate(model, X_train, y_train, return_train_score=True, scoring=metricas, cv=cv)
for key_dict in ans.keys():
    averages[key_dict] = round(np.mean(ans[key_dict]),4)

print('========= Model Evaluation ===========')
print('')
for keys,values in averages.items():
    print(keys," = ", str(values))

print('')
print('========= Features Importance =========')
print('')
#Verificando a importância das variáveis:
feature_imp = pd.Series(model.feature_importances_,index=X.columns).sort_values(ascending=False)
print(feature_imp)

print('========= Confusion Matrix ============')
print('')
cm = confusion_matrix(y_test, y_pred)
print(cm)


fit_time  =  0.1117
score_time  =  0.0106
test_accuracy  =  0.8168
train_accuracy  =  0.9294
test_f1  =  0.7487
train_f1  =  0.9038
test_precision  =  0.7834
train_precision  =  0.9446
test_recall  =  0.7216
train_recall  =  0.8666
test_roc_auc  =  0.8657
train_roc_auc  =  0.9754


Female          0.388812
Class_3         0.225462
isNaN           0.088172
Master          0.068268
SibSp           0.034929
Cabin_D         0.024892
Cabin_C         0.023458
Cabin_B         0.023020
Fare            0.018723
Embarked_S      0.018722
Embarked_C      0.015419
Mrs             0.014134
Parch           0.013251
NaN_Age         0.011473
Class_2         0.011055
Child_or_Old    0.011012
Miss            0.009197
Adult           0.000000
dtype: float32

[[1]]


In [10]:
teste = pd.read_csv("test.csv")
X_sub = tratando(teste)

In [11]:
ans = prediction(X_sub)
submission = pd.DataFrame({'PassengerId':teste.PassengerId, 'Survived':ans})
submission.to_csv('submission.csv',index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
