# 14 - SVC

Teste com os dois datasets gerados utilizando o `SVC`(Support Vector Machine) 

## Preparando o ambiente

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, GroupKFold, cross_validate
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, Normalizer, RobustScaler
from IPython.display import display, clear_output

In [18]:
SEED = 5
np.random.seed(SEED)

## Carregando os dados

 ### DummiesDataset

In [3]:
treino = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed_v2/train_dummies_2.csv')
teste = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed_v2/test_dummies_2.csv')
Xvalidation = teste.drop(columns=['PassengerId'])
treino.head(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Relateds,possui_cabine,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22,1,0,1,0,0,1,0,0,1
1,1,1,38,1,0,1,1,1,0,1,0,0


### Without Dummies

In [24]:
nd_treino = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed/train_processed_byage_ag_nm.csv')
nd_teste = pd.read_csv('https://raw.githubusercontent.com/SalatielBairros/kaggle-titanic/main/data/processed/test_processed_byage_ag_nm.csv')
nd_Xvalidacao = nd_teste.drop(columns=['PassengerId'])
nd_Xvalidacao.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Floor,Relateds,possui_cabine,acompanhado,faixa_etaria
0,3,0,34,0,0,0,0,0,0,1
1,3,1,47,1,0,0,1,0,1,2
2,2,0,62,0,0,0,0,0,0,2
3,3,0,27,0,0,0,0,0,0,1
4,3,1,22,1,1,0,2,0,1,1


## Separando treino e teste

In [19]:
X = treino.drop(columns=['Survived'])
y = treino['Survived']

treino_x, teste_x, treino_y, teste_y = train_test_split(X, y, test_size = 0.2, stratify = y)
print("Treinaremos com %d elementos e testaremos com %d elementos" % (len(treino_x), len(teste_x)))

Treinaremos com 712 elementos e testaremos com 179 elementos


In [26]:
nd_X = nd_treino.drop(columns=['Survived'])
nd_y = nd_treino['Survived']

nd_treino_x, nd_teste_x, nd_treino_y, nd_teste_y = train_test_split(nd_X, nd_y, test_size = 0.2, stratify = nd_y)

## Executando o `SVC`

### Utilizando o `train_test_split`

In [22]:
# kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’
modelo = SVC(kernel='linear', verbose=1)
modelo.fit(treino_x, treino_y)
train_score = modelo.score(treino_x, treino_y)
test_score = modelo.score(teste_x, teste_y)
print(f'\nTrainScore: {train_score * 100}% | TestScore: {test_score * 100}%')

[LibSVM]
TrainScore: 78.51123595505618% | TestScore: 79.3296089385475%


In [27]:
# kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’
modelo = SVC(kernel='linear', verbose=1)
modelo.fit(nd_treino_x, nd_treino_y)
train_score = modelo.score(nd_treino_x, nd_treino_y)
test_score = modelo.score(nd_teste_x, nd_teste_y)
print(f'\nTrainScore: {train_score * 100}% | TestScore: {test_score * 100}%')

[LibSVM]
TrainScore: 78.51123595505618% | TestScore: 81.00558659217877%


A acurácia do SVC no teste foi maior para o dataset sem os Dummies. Além disso, a acurácia de teste está melhor que a acurácia de treino, o que pode indicar uma divisão ruim dos dados, junto com a variabilidade do `SEED`. Vamos usar o `cross_validation`. Além disso o melhor `kernel` foi o `linear`, indicando que talvez o `LinearSVC` tenha melhor desempenho.

#### `SVC` + `train_test_split` + `cross_validate`

In [37]:
modelo = SVC(kernel='linear')
results = cross_validate(modelo, X, y, cv = GroupKFold(n_splits = 10), groups = treino.Age, return_train_score=True)
pd.DataFrame(results).drop(columns=['fit_time', 'score_time']).describe()

Unnamed: 0,test_score,train_score
count,10.0,10.0
mean,0.786723,0.786756
std,0.052433,0.005835
min,0.685393,0.779302
25%,0.762034,0.782419
50%,0.792135,0.78616
75%,0.825843,0.789474
max,0.853933,0.798005


In [42]:
modelo = SVC(kernel='linear')
results = cross_validate(modelo, nd_X, nd_y, cv = GroupKFold(n_splits = 10), groups = nd_treino.Age, return_train_score=True)
pd.DataFrame(results).drop(columns=['fit_time', 'score_time']).describe()

Unnamed: 0,test_score,train_score
count,10.0,10.0
mean,0.790094,0.791744
std,0.056053,0.006716
min,0.685393,0.781796
25%,0.762034,0.787406
50%,0.797753,0.7899
75%,0.825843,0.796945
max,0.876404,0.802993


O `cross_validate` mostrou a acurácia mais real do modelo (removendo o problema do treino e teste mencionado anteriormente). Continuou tendo melhor resultado no dataset antigo.

### Utilizando o `Scalers`

In [45]:
scalers = [StandardScaler(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), RobustScaler()]

In [48]:
resultado = []
for scaler in scalers:
    sX = scaler.fit_transform(X)
    modelo = SVC(kernel='linear')
    results = cross_validate(modelo, sX, y, cv = GroupKFold(n_splits = 10), groups = treino.Age, return_train_score=True)
    resultado.append({
        'test_score': results['test_score'].mean(),
        'train_score': results['train_score'].mean(),
        'scaler': scaler
    })

pd.DataFrame(resultado)
    

Unnamed: 0,test_score,train_score,scaler
0,0.786723,0.786756,StandardScaler()
1,0.786723,0.786756,MaxAbsScaler()
2,0.786723,0.786756,MinMaxScaler()
3,0.631801,0.638233,Normalizer()
4,0.786723,0.786756,RobustScaler()


In [49]:
resultado = []
for scaler in scalers:
    sX = scaler.fit_transform(nd_X)
    modelo = SVC(kernel='linear')
    results = cross_validate(modelo, sX, y, cv = GroupKFold(n_splits = 10), groups = treino.Age, return_train_score=True)
    resultado.append({
        'test_score': results['test_score'].mean(),
        'train_score': results['train_score'].mean(),
        'scaler': scaler
    })

pd.DataFrame(resultado)
    

Unnamed: 0,test_score,train_score,scaler
0,0.616058,0.61616,StandardScaler()
1,0.616058,0.61616,MaxAbsScaler()
2,0.616058,0.61616,MinMaxScaler()
3,0.616058,0.61616,Normalizer()
4,0.616058,0.61616,RobustScaler()


O dataset com dummies se mostrou melhor à normalização, mantendo a acurácia, enquanto o dataset antigo (sem dummies) teve uma perda considerável de score.

### Gerando os Submissions

In [54]:
modelo = SVC(kernel='linear')
modelo.fit(X, y)
d_predictions = modelo.predict(Xvalidation)
resultado = pd.DataFrame()
resultado['PassengerId'] = teste['PassengerId']
resultado['Survived'] = d_predictions
resultado.to_csv('../../data/submissions/svc_dummies.csv', index=False)

In [55]:
modelo = SVC(kernel='linear')
modelo.fit(nd_X, nd_y)
d_predictions = modelo.predict(nd_Xvalidacao)
resultado = pd.DataFrame()
resultado['PassengerId'] = teste['PassengerId']
resultado['Survived'] = d_predictions
resultado.to_csv('../../data/submissions/svc_no_dummies.csv', index=False)

In [56]:
modelo = SVC(kernel='linear')
scaler = StandardScaler()
sX = scaler.fit_transform(X)
modelo.fit(sX, y)
d_predictions = modelo.predict(Xvalidation)
resultado = pd.DataFrame()
resultado['PassengerId'] = teste['PassengerId']
resultado['Survived'] = d_predictions
resultado.to_csv('../../data/submissions/svc_dummies_ss.csv', index=False)

O melhor modelo foi o `SVC` sem dummies e sem normalização, obtendo no kaggle 77% de acurácia, ainda inferior ao melhor resultado da `DecisionTree`. O problema está no dataset ou no modelo? No caso do `SVC` a diferença entre o teste no treino e a validação no Kaggle ficou em torno de 1 ~ 2%, enquanto nas árvores a diferença chegou a 10%, indicando overfitting.

## `LinearSVC`

In [81]:
modelo = LinearSVC(max_iter=3000)
scaler = StandardScaler()
sX = scaler.fit_transform(X)
results = cross_validate(modelo, sX, y, cv = GroupKFold(n_splits = 10), groups = treino.Age, return_train_score=True)
print(f"\nTrainScore: {results['train_score'].mean() * 100}% | TestScore: {results['test_score'].mean() * 100}%")


TrainScore: 80.14711355171771% | TestScore: 79.68735103847465%


In [93]:
modelo = LinearSVC(max_iter=2000)
scaler = StandardScaler()
sX = scaler.fit_transform(nd_X)
results = cross_validate(modelo, sX, nd_y, cv = GroupKFold(n_splits = 10), groups = treino.Age, return_train_score=True)
print(f"\nTrainScore: {results['train_score'].mean() * 100}% | TestScore: {results['test_score'].mean() * 100}%")


TrainScore: 80.80810164752033% | TestScore: 80.47386789240723%


In [94]:
scaler = StandardScaler()
vX = scaler.fit_transform(nd_Xvalidacao)

modelo.fit(sX, nd_y)
predictions = modelo.predict(vX)
resultado = pd.DataFrame()
resultado['PassengerId'] = teste['PassengerId']
resultado['Survived'] = d_predictions
resultado.to_csv('../../data/submissions/linear_svc.csv', index=False)

Mesmo tendo testes mais consistentes aqui, para o Kaggle o resultado ainda foi 76.5%. Na verdade poucos modelos conseguem ultrapassar esse valor. Tudo parece indicar para o tratamento dado ao dataset original.