<a href="https://colab.research.google.com/github/OseiasBeu/Cardiovascular-diseases-risk-project/blob/main/Pipeline_Cardiovascular_diseases_risk_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import requests
import zipfile
import io
import os
import numpy as np
import pickle

In [2]:
# URL do arquivo ZIP
url = "https://github.com/OseiasBeu/Cardiovascular-diseases-risk-project/raw/main/dataset.zip"

# Diretório onde deseja extrair o conteúdo do arquivo zip
diretorio_destino = '/content/dataset/'

# Faz o download do arquivo ZIP
response = requests.get(url)
zip_file = io.BytesIO(response.content)

# Verifica se o diretório de destino existe, senão cria
if not os.path.exists(diretorio_destino):
    os.makedirs(diretorio_destino)

# Descompacta o arquivo zip
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(diretorio_destino)

In [3]:
data = pd.read_csv('/content/dataset/dataset/CVD_cleaned.csv')

In [4]:
data.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [6]:
data.columns

Index(['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer',
       'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',
       'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',
       'Alcohol_Consumption', 'Fruit_Consumption',
       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],
      dtype='object')

In [7]:
# primeira_linha_dict = data.iloc[0].to_dict()

In [8]:
# primeira_linha_dict

{'General_Health': 'Poor',
 'Checkup': 'Within the past 2 years',
 'Exercise': 'No',
 'Heart_Disease': 'No',
 'Skin_Cancer': 'No',
 'Other_Cancer': 'No',
 'Depression': 'No',
 'Diabetes': 'No',
 'Arthritis': 'Yes',
 'Sex': 'Female',
 'Age_Category': '70-74',
 'Height_(cm)': 150.0,
 'Weight_(kg)': 32.66,
 'BMI': 14.54,
 'Smoking_History': 'Yes',
 'Alcohol_Consumption': 0.0,
 'Fruit_Consumption': 30.0,
 'Green_Vegetables_Consumption': 16.0,
 'FriedPotato_Consumption': 12.0}

In [10]:
mock_data = {'General_Health': 'Poor',
 'Checkup': 'Within the past 2 years',
 'Exercise': 'No',
 'Heart_Disease': 'No',
 'Skin_Cancer': 'No',
 'Other_Cancer': 'No',
 'Depression': 'No',
 'Diabetes': 'No',
 'Arthritis': 'Yes',
 'Sex': 'Female',
 'Age_Category': '70-74',
 'Height_(cm)': 150.0,
 'Weight_(kg)': 32.66,
 'BMI': 14.54,
 'Smoking_History': 'Yes',
 'Alcohol_Consumption': 0.0,
 'Fruit_Consumption': 30.0,
 'Green_Vegetables_Consumption': 16.0,
 'FriedPotato_Consumption': 12.0}

In [14]:
# Transforme o dicionário em um DataFrame
data = pd.DataFrame([mock_data])

# Agora, 'df' conterá os dados do dicionário como um DataFrame
data.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0


In [15]:
mapper_hd = {
    'No':0,
    'Yes':1,
    np.nan: 1
}

mapper_sex = {
    'Female':0,
    'Male':1,
    np.nan: 1
}

In [16]:
data['Heart_Disease'] = data['Heart_Disease'].map(mapper_hd,na_action=None)
data['Sex'] = data['Sex'].map(mapper_sex,na_action=None)
data['Exercise'] = data['Exercise'].map(mapper_hd,na_action=None)
data['Skin_Cancer'] = data['Skin_Cancer'].map(mapper_hd,na_action=None)
data['Other_Cancer'] = data['Other_Cancer'].map(mapper_hd,na_action=None)
data['Depression'] = data['Depression'].map(mapper_hd,na_action=None)
data['Arthritis'] = data['Arthritis'].map(mapper_hd,na_action=None)
data['Smoking_History'] = data['Smoking_History'].map(mapper_hd,na_action=None)

In [17]:
data.loc[data['Checkup'] == 'Within the past year', 'last_Checkup_years'] = '<1'
data.loc[data['Checkup'] == 'Within the past 2 years', 'last_Checkup_years'] = '<2'
data.loc[data['Checkup'] == 'Within the past 5 years', 'last_Checkup_years'] = '<5'
data.loc[data['Checkup'] == '5 or more years ago', 'last_Checkup_years'] = '>5'
data.loc[data['Checkup'] == 'Never', 'last_Checkup_years'] = 'Never'

In [18]:
data.loc[data['Diabetes'] == 'No', 'diabetes?'] = 'No'
data.loc[data['Diabetes'] == 'Yes', 'diabetes?'] = 'Yes'
data.loc[data['Diabetes'] == 'No, pre-diabetes or borderline diabetes', 'diabetes?'] = 'Pre_diabetes_boderline'
data.loc[data['Diabetes'] == 'Yes, but female told only during pregnancy', 'diabetes?'] = 'Yes_scovered_in_pregnancy'

In [19]:
# Get one hot encoding of columns
one_hot = pd.get_dummies(data[['General_Health','Age_Category','last_Checkup_years', 'diabetes?']])
# Drop columns as it is now encoded
data = data.drop(['General_Health', 'Age_Category','last_Checkup_years', 'diabetes?'],axis = 1)
# Join the encoded df
df = data.join(one_hot)

In [27]:
with open('dtc_model_cdr_r.pkl', 'rb') as file:
    modelo_treinado = pickle.load(file)

In [28]:
print(type(modelo_treinado))

<class 'sklearn.tree._classes.DecisionTreeClassifier'>


In [30]:
previsoes = modelo_treinado.predict(df)

ValueError: ignored