<a href="https://colab.research.google.com/github/Rodrigo-Perico/Introduction-to-Machine-Learning-on-AWS/blob/main/SVM_teste.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
!pip install optuna
!pip install ucimlrepo



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import optuna
from plotly.io import show
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, FunctionTransformer, RobustScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, f1_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score
from sklearn.compose import ColumnTransformer
from ucimlrepo import fetch_ucirepo
from functools import partial
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

In [2]:
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

df = cdc_diabetes_health_indicators.data.features
aux = cdc_diabetes_health_indicators.data.targets

df["diabetes"] = aux["Diabetes_binary"]

# df = df.sample(frac=0.03, random_state=42)

print(cdc_diabetes_health_indicators.metadata)

print(cdc_diabetes_health_indicators.variables)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [3]:
df.duplicated().sum()

24206

In [4]:
df.drop_duplicates(inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [5]:
X = df.drop(columns=["diabetes"])
y = df["diabetes"]

x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

categorical_columns = ['GenHlth', 'Age', 'Education', 'Income']
numerical_columns = ['BMI', 'MentHlth', 'PhysHlth']
binary_columns = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']


#print(y_train.value_counts())
#print(y_test.value_counts())

x_validation, x_test, y_validation, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=10, stratify=y_temp)

print("Treino\n",y_train.value_counts())
print("\nValidação \n",y_validation.value_counts())
print("\nTeste\n",y_test.value_counts())

Treino
 diabetes
0    155501
1     28078
Name: count, dtype: int64

Validação 
 diabetes
0    19438
1     3509
Name: count, dtype: int64

Teste
 diabetes
0    19438
1     3510
Name: count, dtype: int64


In [6]:
preprocessor_svm = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns),
        ('bin', 'passthrough', binary_columns)
    ])


In [7]:
pca = PCA(n_components=3)
pca.fit(x_train)
X_train = pca.fit_transform(x_train)
X_test = pca.fit_transform(x_test)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [8]:
x_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
147736,0,0,1,19,1,0,0,0,1,1,...,0,1,3,10,3,0,0,3,6,7
238301,0,1,1,33,1,0,0,0,0,0,...,1,0,4,25,10,1,0,5,5,2
75436,0,1,1,26,1,0,0,1,1,1,...,1,0,3,0,14,0,1,11,6,7
192687,0,1,1,27,1,0,0,0,0,1,...,1,0,3,5,2,0,0,12,3,4
94464,1,1,1,28,1,0,0,1,1,1,...,1,0,1,0,0,0,1,11,6,7


In [None]:

def objective(trial):
    pipeline = Pipeline([
        #('outlier_remover', outlier_remover),
        ('preprocessor', preprocessor_svm),
        ('svm', SVC())

    ])


    # C = trial.suggest_float('C', 0.001, 1000, log =True)
    C = trial.suggest_categorical('C', [c for c in range(1, 1000,100)])
    # suggest_float(..., log=True)
    gamma = trial.suggest_float('gamma', 0.0001, 8, log = True)
    kernel = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])

    # print(C)

    pipeline.set_params(svm__C = C, svm__gamma = gamma, svm__kernel = kernel)
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_validation)
    score = f1_score(y_validation, y_pred)
    return score


# #objective_func = partial(objective, pipeline=pipe_knn)
# for i in range(0,1000):
#   X = df.sample(frac=0.01, random_state=42)
#   y = X["diabetes"]
#   X = X.drop(columns=["diabetes"])

#   x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30) # incrementação do parâmetro n_jobs=-1
# study.optimize(objective_func, n_trials=100)
study.best_params

[I 2024-11-19 21:22:07,594] A new study created in memory with name: no-name-92eaf719-d67a-4305-a530-bd37a21586b6


In [9]:
pipeline = Pipeline([
      #('outlier_remover', outlier_remover),
      ('preprocessor_svm', preprocessor_svm),
      ('svm', SVC())
  ])

C = 900
gamma = 0.03215038461268562
kernel=  'sigmoid'

pipeline.set_params(svm__C = C, svm__gamma = gamma, svm__kernel = kernel)
pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_temp)
score = f1_score(y_temp, y_pred)
print(score)
print(classification_report(y_temp, y_pred))


0.2987852525396036
              precision    recall  f1-score   support

           0       0.87      0.87      0.87     38876
           1       0.30      0.30      0.30      7019

    accuracy                           0.78     45895
   macro avg       0.59      0.59      0.59     45895
weighted avg       0.79      0.78      0.79     45895



In [None]:
print(score)