In [7]:
import pandas as pd
import numpy as np

# Carregar o dataset
file_path = 'Ficheiros CSV/CVD_cleaned.csv'  # Substitua pelo caminho do ficheiro
data = pd.read_csv(file_path)

# Copiar o dataset original
data_missing_10 = data.copy()
data_missing_20 = data.copy()

# Função para remover aleatoriamente 
def remove_random_values(data, percentage):
    data_with_nan = data.copy()
    np.random.seed(42)  
    mask = np.random.rand(*data_with_nan.shape) < (percentage / 100)
    data_with_nan[mask] = np.nan
    return data_with_nan

# Remover 10% e 20% dos valores
data_missing_10 = remove_random_values(data_missing_10, 10)
data_missing_20 = remove_random_values(data_missing_20, 20)

output_file_10 = 'Ficheiros CSV/CVD_missing10.csv'
output_file_20 = 'Ficheiros CSV/CVD_missing20.csv'
data_missing_10.to_csv(output_file_10, index=False)
data_missing_20.to_csv(output_file_20, index=False)

data_missing_20.head()


Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,,,,No,Yes,Female,,150.0,32.66,14.54,,,30.0,16.0,12.0
1,Very Good,Within the past year,,Yes,No,No,No,,No,Female,,165.0,,,No,0.0,30.0,0.0,
2,Very Good,Within the past year,,No,,No,No,Yes,No,Female,60-64,,88.45,33.47,No,4.0,12.0,3.0,
3,,,Yes,Yes,No,No,No,Yes,No,,75-79,,93.44,28.73,,,30.0,30.0,8.0
4,Good,,No,,No,No,No,,No,Male,80+,191.0,88.45,24.37,,0.0,8.0,4.0,0.0


In [8]:
data_missing_10.isnull().sum()

General_Health                  30992
Checkup                         31114
Exercise                        30960
Heart_Disease                   31188
Skin_Cancer                     30867
Other_Cancer                    30907
Depression                      30630
Diabetes                        30983
Arthritis                       30946
Sex                             30941
Age_Category                    30743
Height_(cm)                     30972
Weight_(kg)                     30701
BMI                             30875
Smoking_History                 31135
Alcohol_Consumption             30834
Fruit_Consumption               30911
Green_Vegetables_Consumption    30639
FriedPotato_Consumption         30889
dtype: int64

In [9]:
data_missing_20.isnull().sum()

General_Health                  61863
Checkup                         62133
Exercise                        61889
Heart_Disease                   62196
Skin_Cancer                     61728
Other_Cancer                    61728
Depression                      61505
Diabetes                        61877
Arthritis                       61786
Sex                             61891
Age_Category                    61597
Height_(cm)                     61709
Weight_(kg)                     61340
BMI                             61698
Smoking_History                 61880
Alcohol_Consumption             61989
Fruit_Consumption               61510
Green_Vegetables_Consumption    61481
FriedPotato_Consumption         61674
dtype: int64

In [10]:

# Estratégias para lidar com valores faltantes
# 1. Preencher valores numéricos com a média
# 2. Preencher valores categóricos com a moda

def handle_missing_values(data):
    data_filled = data.copy()
    for column in data_filled.columns:
        if data_filled[column].dtype in [np.float64, np.int64]:
            # Preencher valores numéricos com a média
            data_filled[column].fillna(data_filled[column].mean(), inplace=True)
        else:
            # Preencher valores categóricos com a moda
            data_filled[column].fillna(data_filled[column].mode()[0], inplace=True)
    return data_filled

# Aplicar estratégias aos datasets com valores removidos
data_filled_10 = handle_missing_values(data_missing_10)
data_filled_20 = handle_missing_values(data_missing_20)

# Salvar os datasets resultantes
output_file_10 = 'Ficheiros CSV/CVD_missing_10_filled.csv'
output_file_20 = 'Ficheiros CSV/CVD_missing_20_filled.csv'
data_filled_10.to_csv(output_file_10, index=False)
data_filled_20.to_csv(output_file_20, index=False)

print(f"Dataset com 10% de valores faltantes preenchido salvo como: {output_file_10}")
data_filled_10.head()
data_filled_10.isnull().sum()



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_filled[column].fillna(data_filled[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_filled[column].fillna(data_filled[column].mean(), inplace=True)


Dataset com 10% de valores faltantes preenchido salvo como: Ficheiros CSV/CVD_missing_10_filled.csv
Dataset com 20% de valores faltantes preenchido salvo como: Ficheiros CSV/CVD_missing_20_filled.csv


General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

In [21]:
print(f"Dataset com 20% de valores missing preenchido salvo como: {output_file_20}")
data_filled_20.head()
data_filled_20.isnull().sum()

Dataset com 20% de valores missing preenchido salvo como: Ficheiros CSV/CVD_missing_20_filled.csv


General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

In [22]:

# Aplicar dropna() para remover linhas com valores ausentes
data_missing_10_dropped = data_missing_10.dropna()
data_missing_20_dropped = data_missing_20.dropna()

# Salvar os dados com valores ausentes removidos
output_file_10_dropped = 'Ficheiros CSV/CVD_missing10_dropped.csv'
output_file_20_dropped = 'Ficheiros CSV/CVD_missing20_dropped.csv'
data_missing_10_dropped.to_csv(output_file_10_dropped, index=False)
data_missing_20_dropped.to_csv(output_file_20_dropped, index=False)

# Mostrar as primeiras linhas dos dados com valores ausentes removidos

data_missing_10_dropped.isnull().sum()


General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

In [25]:
data_missing_20_dropped.isnull().sum()


General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64