# Projeto Integrador 6 - Predição de Diabetes com Modelos de IA

## Grupo 04

# Import de Pacotes e Bibliotecas

In [1]:
!pip install optuna
!pip install ucimlrepo



In [1]:
import warnings
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import optuna
from plotly.io import show
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, FunctionTransformer, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, f1_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from ucimlrepo import fetch_ucirepo
from functools import partial


warnings.filterwarnings('ignore') #Ignorar warnigs

# organizar os imports - explicar o motivo deles

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

data = cdc_diabetes_health_indicators.data.features
aux = cdc_diabetes_health_indicators.data.targets

data["Diabetes_binary"] = aux["Diabetes_binary"]

print(cdc_diabetes_health_indicators.metadata)

print(cdc_diabetes_health_indicators.variables)

data.columns

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
       'Diabetes_binary'],
      dtype='object')

# Pré-Processamento

## Transformação do data type para integer

In [3]:
data['Age']=data['Age'].astype('int64')
data['Education']=data['Education'].astype('int64')
data['Income']=data['Income'].astype('int64')
data['BMI']=data['BMI'].astype('int64')
data['CholCheck']=data['CholCheck'].astype('int64')
data['Smoker']=data['Smoker'].astype('int64')
data['Stroke']=data['Stroke'].astype('int64')
data['HeartDiseaseorAttack']=data['HeartDiseaseorAttack'].astype('int64')
data['PhysActivity']=data['PhysActivity'].astype('int64')
data['Fruits']=data['Fruits'].astype('int64')
data['Veggies']=data['Veggies'].astype('int64')
data['HvyAlcoholConsump']=data['HvyAlcoholConsump'].astype('int64')
data['AnyHealthcare']=data['AnyHealthcare'].astype('int64')
data['NoDocbcCost']=data['NoDocbcCost'].astype('int64')
data['GenHlth']=data['GenHlth'].astype('int64')
data['MentHlth']=data['MentHlth'].astype('int64')
data['PhysHlth']=data['PhysHlth'].astype('int64')
data['DiffWalk']=data['DiffWalk'].astype('int64')
data['Sex']=data['Sex'].astype('int64')
data['Diabetes_binary']=data['Diabetes_binary'].astype('int64')
data['HighBP']=data['HighBP'].astype('int64')
data['HighChol']=data['HighChol'].astype('int64')

## Remoção de duplicatas

In [4]:
data.duplicated().sum()

24206

In [5]:
data.drop_duplicates(inplace=True)

In [6]:
data.shape

(229474, 22)

In [7]:

data.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


## Divisão dos tipos de Variáveis

In [8]:
categorical_columns = ['GenHlth', 'Age', 'Education', 'Income']
numerical_columns = ['BMI', 'MentHlth', 'PhysHlth']
binary_columns = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']

# Divisão de Dados - Treino, Validação e Teste

Optamos por realizar o estudo do optuna com a base sampleada para economizar tempo, uma vez que cada tentativa de execução superava o tempo limite do colab.

In [9]:
data1 = data.sample(frac=0.05, random_state=42) # Base de dados sampleada

x = data1.drop('Diabetes_binary', axis = 1)
y = data1['Diabetes_binary']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.15, random_state=42, stratify=y_train)


print("Treino\n",y_train.value_counts())
print("\nValidação \n",y_validation.value_counts())
print("\nTeste\n",y_test.value_counts())

Treino
 Diabetes_binary
0    6580
1    1222
Name: count, dtype: int64

Validação 
 Diabetes_binary
0    1161
1     216
Name: count, dtype: int64

Teste
 Diabetes_binary
0    1935
1     360
Name: count, dtype: int64


## PCA

In [11]:
pca = PCA(n_components=5)
pca.fit(x_train)
X_train = pca.fit_transform(x_train)
X_test = pca.fit_transform(x_test)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

# Modelos

## KNN

In [14]:
preprocessor_knn = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns),
        ('bin', 'passthrough', binary_columns)
    ])

knn_pipeline = Pipeline([
    ('preprocessor', preprocessor_knn),
    ('classifier', KNeighborsClassifier())
])

In [20]:
def objective_knn(trial):
    pipeline = Pipeline([
        ('preprocessor', preprocessor_knn),
        ('knn', KNeighborsClassifier())
    ])
    k = trial.suggest_int('k', 2, 25)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])
    pipeline.set_params(knn__n_neighbors=k, knn__weights=weights, knn__metric=metric)
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_validation)
    score = f1_score(y_validation, y_pred)
    return score




study_knn = optuna.create_study(direction='maximize')
study_knn.optimize(objective_knn, n_trials=100, n_jobs=-1)

study_knn.best_params

[I 2024-11-23 19:50:31,594] A new study created in memory with name: no-name-7b76c87c-7a4f-414d-8199-5e03647af8cd
[I 2024-11-23 19:50:31,928] Trial 1 finished with value: 0.24489795918367346 and parameters: {'k': 8, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 1 with value: 0.24489795918367346.
[I 2024-11-23 19:50:32,279] Trial 0 finished with value: 0.1532258064516129 and parameters: {'k': 17, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.24489795918367346.
[I 2024-11-23 19:50:32,592] Trial 2 finished with value: 0.1532258064516129 and parameters: {'k': 17, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.24489795918367346.
[I 2024-11-23 19:50:32,859] Trial 3 finished with value: 0.12903225806451613 and parameters: {'k': 8, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.24489795918367346.
[I 2024-11-23 19:50:33,046] Trial 4 finished with value: 0.1532258064516129 and parameters: {'k':

{'k': 2, 'weights': 'distance', 'metric': 'minkowski'}

### Melhores Hiper-parâmetros k=2, weights=distance, metric=minkowski

In [26]:
score = np.zeros(30)
acc = np.zeros(30)
rec = np.zeros(30)
prec = np.zeros(30)
score_train = np.zeros(30)
acc_train = np.zeros(30)
rec_train = np.zeros(30)
prec_train = np.zeros(30)

for i in range(30):
  df = data.sample(frac=1, random_state=42)
  x = df.drop(columns=['Diabetes_binary'])
  y = df['Diabetes_binary']
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10, stratify=y)

  # pca = PCA(n_components=5)
  # pca.fit(x_train)
  # x_train = pca.fit_transform(x_train)
  # x_test = pca.fit_transform(x_test)

  # x_train = pd.DataFrame(X_train)
  # x_test = pd.DataFrame(X_test)


  pipeline = Pipeline([
      ('preprocessor', preprocessor_knn),
      ('pca', PCA(n_components=5)),
      ('knn', KNeighborsClassifier())
  ])

  pipeline.set_params(knn__n_neighbors=2, knn__weights='distance', knn__metric='minkowski')
  pipeline.fit(x_train, y_train)
  y_train_pred = pipeline.predict(x_train)
  y_pred = pipeline.predict(x_test)
  score[i] = f1_score(y_test, y_pred)
  acc[i] = accuracy_score(y_test, y_pred)
  rec[i] = recall_score(y_test, y_pred)
  prec[i] = precision_score(y_test, y_pred)

  score_train[i] = f1_score(y_train, y_train_pred)
  acc_train[i] = accuracy_score(y_train, y_train_pred)
  rec_train[i] = recall_score(y_train, y_train_pred)
  prec_train[i] = precision_score(y_train, y_train_pred)

  print(i)

print("MÉTRICAS DE TREINO")
print("F1_SCORE: \n")
print(f" Média -> {score_train.mean()}")
print(f" Desvio Padrão -> {score.std()}")
print("\nACCURACY: \n")
print(f" Média -> {acc_train.mean()}")
print(f" Desvio Padrão -> {acc.std()}")
print("\nRECALL: \n")
print(f" Média -> {rec_train.mean()}")
print(f" Desvio Padrão -> {rec.std()}")
print("\nPRECISION: \n")
print(f" Média -> {prec_train.mean()}")
print(f" Desvio Padrão -> {prec_train.std()}")
print("\n\n---------------------\n\n")
print("MÉTRICAS DE TESTE")
print("F1_SCORE: \n")
print(f" Média -> {score.mean()}")
print(f" Desvio Padrão -> {score.std()}")
print("\nACCURACY: \n")
print(f" Média -> {acc.mean()}")
print(f" Desvio Padrão -> {acc.std()}")
print("\nRECALL: \n")
print(f" Média -> {rec.mean()}")
print(f" Desvio Padrão -> {rec.std()}")
print("\nPRECISION: \n")
print(f" Média -> {prec.mean()}")
print(f" Desvio Padrão -> {prec.std()}")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
MÉTRICAS DE TREINO
F1_SCORE: 

 Média -> 0.9833229877922619
 Desvio Padrão -> 5.551115123125783e-17

ACCURACY: 

 Média -> 0.9949822885993361
 Desvio Padrão -> 1.1102230246251565e-16

RECALL: 

 Média -> 0.9671930967111694
 Desvio Padrão -> 0.0

PRECISION: 

 Média -> 1.0
 Desvio Padrão -> 0.0


---------------------


MÉTRICAS DE TESTE
F1_SCORE: 

 Média -> 0.27206198880757637
 Desvio Padrão -> 5.551115123125783e-17

ACCURACY: 

 Média -> 0.7789317722934792
 Desvio Padrão -> 1.1102230246251565e-16

RECALL: 

 Média -> 0.27011112166397566
 Desvio Padrão -> 0.0

PRECISION: 

 Média -> 0.2740412410869146
 Desvio Padrão -> 5.551115123125783e-17


## Logistic Regression

In [17]:
preprocessor_lr = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns),
        ('bin', 'passthrough', binary_columns)
    ])

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor_lr),
    ('log_reg', LogisticRegression())
])

In [22]:
def objective_lr(trial):
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'saga'])
    if solver == 'lbfgs':
        penalty = 'l2'  # lbfgs só suporta 'l2'
    else:
        penalty = trial.suggest_categorical('penalty', ['l2', 'l1'])  # liblinear e saga suportam 'l1' e 'l2'
    C = trial.suggest_float('C', 0.01, 10.0)
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    pipeline = Pipeline([
    ('preprocessor', preprocessor_lr),
    ('log_reg', LogisticRegression())
    ])
    pipeline.set_params(log_reg__solver=solver, log_reg__penalty=penalty, log_reg__C=C, log_reg__max_iter=max_iter)
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_validation)

    score = f1_score(y_validation, y_pred)

    return score

study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(objective_lr, n_trials=100, n_jobs=-1)

print(study_lr.best_params)




[I 2024-11-23 19:52:37,154] A new study created in memory with name: no-name-1891cd0c-bedd-4443-9c08-bce3e3a2b063
[I 2024-11-23 19:52:39,098] Trial 0 finished with value: 0.30935251798561153 and parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 1.8144893835258327, 'max_iter': 875}. Best is trial 0 with value: 0.30935251798561153.
[I 2024-11-23 19:52:39,200] Trial 2 finished with value: 0.30824372759856633 and parameters: {'solver': 'lbfgs', 'C': 4.614272775123134, 'max_iter': 464}. Best is trial 0 with value: 0.30935251798561153.
[I 2024-11-23 19:52:39,296] Trial 3 finished with value: 0.3104693140794224 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 1.051624740348695, 'max_iter': 345}. Best is trial 3 with value: 0.3104693140794224.
[I 2024-11-23 19:52:39,386] Trial 4 finished with value: 0.30824372759856633 and parameters: {'solver': 'lbfgs', 'C': 7.280785660826048, 'max_iter': 486}. Best is trial 3 with value: 0.3104693140794224.
[I 2024-11-23 19:52:39,513] Trial 5

{'solver': 'liblinear', 'penalty': 'l2', 'C': 1.051624740348695, 'max_iter': 345}


Melhor Hiper-parâmetros: 'solver': 'liblinear', 'penalty': 'l2', 'C': 1.051624740348695, 'max_iter': 345

In [19]:
score = np.zeros(30)
acc = np.zeros(30)
rec = np.zeros(30)
prec = np.zeros(30)
score_train = np.zeros(30)
acc_train = np.zeros(30)
rec_train = np.zeros(30)
prec_train = np.zeros(30)

for i in range(30):
  df = data.sample(frac=1, random_state=42)
  x = df.drop(columns=['Diabetes_binary'])
  y = df['Diabetes_binary']
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10, stratify=y)


  X_train = pd.DataFrame(X_train)
  X_test = pd.DataFrame(X_test)

  
  pipeline = Pipeline([
    ('preprocessor', preprocessor_lr),
    ('pca', PCA(n_components=5)),
    ('log_reg', LogisticRegression())
  ])

  pipeline.set_params(log_reg__solver='liblinear', log_reg__penalty='l2', log_reg__C=1.051624740348695, log_reg__max_iter=345)
  pipeline.fit(x_train, y_train)
  y_train_pred = pipeline.predict(x_train)
  y_pred = pipeline.predict(x_test)
  score[i] = f1_score(y_test, y_pred)
  acc[i] = accuracy_score(y_test, y_pred)
  rec[i] = recall_score(y_test, y_pred)
  prec[i] = precision_score(y_test, y_pred)

  score_train[i] = f1_score(y_train, y_train_pred)
  acc_train[i] = accuracy_score(y_train, y_train_pred)
  rec_train[i] = recall_score(y_train, y_train_pred)
  prec_train[i] = precision_score(y_train, y_train_pred)
  
  

print("MÉTRICAS DE TREINO")
print("F1_SCORE: \n")
print(f" Média -> {score_train.mean()}")
print(f" Desvio Padrão -> {score.std()}")
print("\nACCURACY: \n")
print(f" Média -> {acc_train.mean()}")
print(f" Desvio Padrão -> {acc.std()}")
print("\nRECALL: \n")
print(f" Média -> {rec_train.mean()}")
print(f" Desvio Padrão -> {rec.std()}")
print("\nPRECISION: \n")
print(f" Média -> {prec_train.mean()}")
print(f" Desvio Padrão -> {prec_train.std()}")
print("\n\n---------------------\n\n")
print("MÉTRICAS DE TESTE")
print("F1_SCORE: \n")
print(f" Média -> {score.mean()}")
print(f" Desvio Padrão -> {score.std()}")
print("\nACCURACY: \n")
print(f" Média -> {acc.mean()}")
print(f" Desvio Padrão -> {acc.std()}")
print("\nRECALL: \n")
print(f" Média -> {rec.mean()}")
print(f" Desvio Padrão -> {rec.std()}")
print("\nPRECISION: \n")
print(f" Média -> {prec.mean()}")
print(f" Desvio Padrão -> {prec.std()}")

Fase: 0
Fase: 1
Fase: 2
Fase: 3
Fase: 4
Fase: 5
Fase: 6
Fase: 7
Fase: 8
Fase: 9
Fase: 10
Fase: 11
Fase: 12
Fase: 13
Fase: 14
Fase: 15
Fase: 16
Fase: 17
Fase: 18
Fase: 19
Fase: 20
Fase: 21
Fase: 22
Fase: 23
Fase: 24
Fase: 25
Fase: 26
Fase: 27
Fase: 28
Fase: 29
MÉTRICAS DE TREINO
F1_SCORE: 

 Média -> 0.23587965192820212
 Desvio Padrão -> 8.326672684688674e-17

ACCURACY: 

 Média -> 0.8518530047126648
 Desvio Padrão -> 2.220446049250313e-16

RECALL: 

 Média -> 0.14950341908173229
 Desvio Padrão -> 2.7755575615628914e-17

PRECISION: 

 Média -> 0.5586311787072245
 Desvio Padrão -> 1.1102230246251565e-16


---------------------


MÉTRICAS DE TESTE
F1_SCORE: 

 Média -> 0.23635952003622376
 Desvio Padrão -> 8.326672684688674e-17

ACCURACY: 

 Média -> 0.8530133782664902
 Desvio Padrão -> 2.220446049250313e-16

RECALL: 

 Média -> 0.14873207332130303
 Desvio Padrão -> 2.7755575615628914e-17

PRECISION: 

 Média -> 0.5753122703894195
 Desvio Padrão -> 1.1102230246251565e-16


## SVM

In [11]:
preprocessor_svm = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns),
        ('bin', 'passthrough', binary_columns)
    ])

svm_pipeline = Pipeline([
        ('preprocessor', preprocessor_svm),
        ('svm', SVC())
    ])


In [None]:
def objective_svm(trial):
    pipeline = Pipeline([
        ('preprocessor', preprocessor_svm),
        ('pca', PCA(n_components=5)),
        ('svm', SVC())
    ])


    C = trial.suggest_float('C', 0.001, 1000, log =True)
    gamma = trial.suggest_float('gamma', 0.0001, 10, log = True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])

    pipeline.set_params(svm__C = C, svm__gamma = gamma, svm__kernel = kernel)
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_validation)
    score = f1_score(y_validation, y_pred)
    return score



study_svm = optuna.create_study(direction='maximize')
study_svm.optimize(objective_svm, n_trials=100, n_jobs=-1)


study_svm.best_params

[I 2024-11-26 22:46:46,538] A new study created in memory with name: no-name-7bbfca8a-f950-4ba4-a8a3-1cb2355b7726


'C': 165.95337664088618, 'gamma': 0.024468806409998854, 'kernel': 'sigmoid'

In [15]:
score = np.zeros(30)
acc = np.zeros(30)
rec = np.zeros(30)
prec = np.zeros(30)
score_train = np.zeros(30)
acc_train = np.zeros(30)
rec_train = np.zeros(30)
prec_train = np.zeros(30)

for i in range(30):
  df = data.sample(frac=1, random_state=42)
  x = df.drop(columns=['Diabetes_binary'])
  y = df['Diabetes_binary']
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10, stratify=y)



  pipeline = Pipeline([
    ('preprocessor', preprocessor_svm),
    ('pca', PCA(n_components=5)),
    ('svm', SVC())
  ])
  C = 165.95337664088618
  gamma = 0.024468806409998854
  kernel = 'sigmoid'
  pipeline.set_params(svm__C = C, svm__gamma = gamma, svm__kernel = kernel)
  pipeline.fit(x_train, y_train)


  y_train_pred = pipeline.predict(x_train)
  y_pred = pipeline.predict(x_test)
  score[i] = f1_score(y_test, y_pred)
  acc[i] = accuracy_score(y_test, y_pred)
  rec[i] = recall_score(y_test, y_pred)
  prec[i] = precision_score(y_test, y_pred)

  score_train[i] = f1_score(y_train, y_train_pred)
  acc_train[i] = accuracy_score(y_train, y_train_pred)
  rec_train[i] = recall_score(y_train, y_train_pred)
  prec_train[i] = precision_score(y_train, y_train_pred)
  
  print("Fase:",i)

print("MÉTRICAS DE TREINO")
print("F1_SCORE: \n")
print(f" Média -> {score_train.mean()}")
print(f" Desvio Padrão -> {score.std()}")
print("\nACCURACY: \n")
print(f" Média -> {acc_train.mean()}")
print(f" Desvio Padrão -> {acc.std()}")
print("\nRECALL: \n")
print(f" Média -> {rec_train.mean()}")
print(f" Desvio Padrão -> {rec.std()}")
print("\nPRECISION: \n")
print(f" Média -> {prec_train.mean()}")
print(f" Desvio Padrão -> {prec_train.std()}")
print("\n\n---------------------\n\n")
print("MÉTRICAS DE TESTE")
print("F1_SCORE: \n")
print(f" Média -> {score.mean()}")
print(f" Desvio Padrão -> {score.std()}")
print("\nACCURACY: \n")
print(f" Média -> {acc.mean()}")
print(f" Desvio Padrão -> {acc.std()}")
print("\nRECALL: \n")
print(f" Média -> {rec.mean()}")
print(f" Desvio Padrão -> {rec.std()}")
print("\nPRECISION: \n")
print(f" Média -> {prec.mean()}")
print(f" Desvio Padrão -> {prec.std()}")

Fase: 0
Fase: 1
Fase: 2
Fase: 3
Fase: 4
Fase: 5
Fase: 6
Fase: 7
Fase: 8
Fase: 9
Fase: 10
Fase: 11
Fase: 12
Fase: 13
Fase: 14
Fase: 15
Fase: 16
Fase: 17
Fase: 18
Fase: 19
Fase: 20
Fase: 21
Fase: 22
Fase: 23
Fase: 24
Fase: 25
Fase: 26
Fase: 27
Fase: 28
Fase: 29
MÉTRICAS DE TREINO
F1_SCORE: 

 Média -> 0.2684242564177914
 Desvio Padrão -> 5.551115123125783e-17

ACCURACY: 

 Média -> 0.7830244473358193
 Desvio Padrão -> 2.220446049250313e-16

RECALL: 

 Média -> 0.2602572451970042
 Desvio Padrão -> 5.551115123125783e-17

PRECISION: 

 Média -> 0.27712044380878076
 Desvio Padrão -> 5.551115123125783e-17


---------------------


MÉTRICAS DE TESTE
F1_SCORE: 

 Média -> 0.27078079414528017
 Desvio Padrão -> 5.551115123125783e-17

ACCURACY: 

 Média -> 0.7857879522972558
 Desvio Padrão -> 2.220446049250313e-16

RECALL: 

 Média -> 0.2600436888593408
 Desvio Padrão -> 5.551115123125783e-17

PRECISION: 

 Média -> 0.28244274809160314
 Desvio Padrão -> 5.551115123125783e-17


In [18]:
import pickle

with open('modelo_svm.pkl', 'wb') as f:
    pickle.dump(pipeline, f)