In [6]:
import pandas as pd
import numpy as np

# Carregar o dataset
file_path = 'Ficheiros CSV/CVD_cleaned.csv'  # Substitua pelo caminho do ficheiro
data = pd.read_csv(file_path)

# Copiar o dataset original para evitar alterações diretas
data_missing_10 = data.copy()
data_missing_20 = data.copy()

# Função para remover aleatoriamente uma porcentagem de valores de um dataset
def remove_random_values(data, percentage):
    data_with_nan = data.copy()
    np.random.seed(42)  # Para reprodutibilidade
    mask = np.random.rand(*data_with_nan.shape) < (percentage / 100)
    data_with_nan[mask] = np.nan
    return data_with_nan

# Remover 10% e 20% dos valores
data_missing_10 = remove_random_values(data_missing_10, 10)
data_missing_20 = remove_random_values(data_missing_20, 20)

# Estratégias para lidar com valores faltantes
# 1. Preencher valores numéricos com a média
# 2. Preencher valores categóricos com a moda

def handle_missing_values(data):
    data_filled = data.copy()
    for column in data_filled.columns:
        if data_filled[column].dtype in [np.float64, np.int64]:
            # Preencher valores numéricos com a média
            data_filled[column].fillna(data_filled[column].mean(), inplace=True)
        else:
            # Preencher valores categóricos com a moda
            data_filled[column].fillna(data_filled[column].mode()[0], inplace=True)
    return data_filled

# Aplicar estratégias aos datasets com valores removidos
data_filled_10 = handle_missing_values(data_missing_10)
data_filled_20 = handle_missing_values(data_missing_20)

# Salvar os datasets resultantes
output_file_10 = 'Ficheiros CSV/CVD_missing_10_filled.csv'
output_file_20 = 'Ficheiros CSV/CVD_missing_20_filled.csv'
data_filled_10.to_csv(output_file_10, index=False)
data_filled_20.to_csv(output_file_20, index=False)

print(f"Dataset com 10% de valores faltantes preenchido salvo como: {output_file_10}")
data_filled_10.head()
data_filled_10.isnull().sum()

print(f"Dataset com 20% de valores faltantes preenchido salvo como: {output_file_20}")
data_filled_20.head()
data_filled_20.isnull().sum()



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_filled[column].fillna(data_filled[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_filled[column].fillna(data_filled[column].mean(), inplace=True)


Dataset com 10% de valores faltantes preenchido salvo como: Ficheiros CSV/CVD_missing_10_filled.csv
Dataset com 20% de valores faltantes preenchido salvo como: Ficheiros CSV/CVD_missing_20_filled.csv


General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64