In [6]:
# we import the necessary libraries
import torch 
import matplotlib.pyplot as plt
import pandas as pd
import numpy
import random 



In [7]:
# Load the data:
raw_data = pd.read_csv('diabetes.csv')
raw_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [11]:
def dirtify_dataset(df:pd.DataFrame, missing_values:list, dirtyrate: float):
    
    dirty_data = df.copy()
    
    # Step 1: Add redundant rows
    redundant_sample_size = int(len(df) * dirtyrate)  # Calculate number of rows to duplicate
    if redundant_sample_size > 0:
        redundant_rows = df.sample(n=redundant_sample_size, random_state=42)
        dirty_data = pd.concat([dirty_data, redundant_rows])
    
    # Step 2: Add new rows with missing values
    missing_sample_size = int(len(df) * dirtyrate)  # Calculate number of rows to add with missing values
    if missing_sample_size > 0:
        missing_rows = df.sample(n=missing_sample_size, random_state=42)
        
        # Introduce missing values in the sampled rows
        for i in range(len(missing_rows)):
            row = missing_rows.iloc[i]
            
            # Randomly select a number of columns to make 'dirty'
            num_missing_cols = random.randint(1, max(1, len(row) // 2))  # Select 1 to half of the columns
            columns_to_dirty = random.sample(list(row.index), num_missing_cols)
            
            # Replace values in the selected columns with missing values
            for col in columns_to_dirty:
                row[col] = random.choice(missing_values)
            
            missing_rows.iloc[i] = row
        
        dirty_data = pd.concat([dirty_data, missing_rows])
    
    # Step 3: Shuffle the dirty data to mix redundant and missing data rows
    dirty_data = dirty_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print(f"Original Data Length: {len(df)}")
    print(f"Dirty Data Length: {len(dirty_data)}")
    
    return dirty_data

missing_values = ["", "nan", "Null", "null"]
dirty_data = dirtify_dataset(raw_data, missing_values, 0.1)
    



Original Data Length: 768
Dirty Data Length: 920


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row[col] = random.choice(missing_values)
  row[col] = random.choice(missing_values)
  missing_rows.iloc[i] = row
  missing_rows.iloc[i] = row
  missing_rows.iloc[i] = row
  missing_rows.iloc[i] = row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row[col] = random.choice(missing_values)
  missing_rows.iloc[i] = row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row[col] = random.choice(missing_values)
  missing_rows.iloc[i] = row
  missing_rows.iloc[i] = row
A value

In [12]:
print(f"original dataset: we have {raw_data.shape[0]} rows of data and {raw_data.shape[1]} columns of data")
print(f"new dataset: we have {dirty_data.shape[0]} rows of data and {dirty_data.shape[1]} columns of data")


original dataset: we have 768 rows of data and 9 columns of data
new dataset: we have 920 rows of data and 9 columns of data


In [14]:
dirty_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,194,78,0,0,23.5,0.129,59,1
1,1,87,60,37,75,37.2,0.509,22,0
2,0,127,80,37,210,36.3,0.804,23,0
3,2,146,70,38,360,28.0,0.337,29,1
4,0,107,76,0,0,45.3,0.686,24,0
...,...,...,...,...,...,...,...,...,...
915,1,96,122,0,0,22.4,0.207,27,0
916,10,101,86,37,0,45.6,1.136,38,1
917,3,,,32,0,37.2,0.267,28,0
918,0,141,0,0,0,42.4,0.205,29,1


In [32]:
# Data Cleaning:
# we check for missing values
missing_values = ["", "nan", "Null", "null"]
filtered_dirty_data = dirty_data.drop_duplicates()
filtered_dirty_data = filtered_dirty_data.replace(missing_values, numpy.nan)
filtered_dirty_data = filtered_dirty_data.dropna()
filtered_dirty_data.shape 


  filtered_dirty_data = filtered_dirty_data.replace(missing_values, numpy.nan)


(768, 9)

In [79]:
10%1

0

In [36]:
filtered_dirty_data[filtered_dirty_data == 0].drop("Outcome", axis=1).count()


Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

<p>Pregnancies	Number of times pregnant	❌ No	It is possible to have 0 pregnancies.</p>
<p>Glucose	Plasma glucose concentration	✅ Yes	Physiologically impossible to have 0 glucose.</p>
<p>BloodPressure	Diastolic blood pressure (mm Hg)	✅ Yes	Impossible for diastolic BP to be 0 in a living person.</p>
<p>SkinThickness	Triceps skin fold thickness (mm)	✅ Yes	Impossible for human skin thickness to be 0 mm.</p>
<p>Insulin	2-Hour serum insulin (mu U/ml)	✅ Yes (likely)	Unlikely, unless the person has severe type 1 diabetes.</p>
<p>BMI	Body mass index (weight in kg/(height in m)^2)	✅ Yes	BMI of 0 is physiologically impossible.</p>
<p>DiabetesPedigreeFunction	Genetic risk measure (not physiological)	❌ No	0 is a valid value for a risk function.</p>
<p>Age	Age in years	❌ No	Age can be 0 (for infants).</p>

In [37]:
# now we need to replace the 0 values by using KNN imputer
# first we seperate the patients with diabetes and those without diabetes
diabetes = filtered_dirty_data[filtered_dirty_data["Outcome"] == 1]
print(diabetes.shape)
no_diabetes = filtered_dirty_data[filtered_dirty_data["Outcome"] == 0]
print(no_diabetes.shape)


(268, 9)
(500, 9)


In [51]:
# use knn to impute the missing values in the diabetes dataset
from sklearn.impute import KNNImputer
diabetes_col = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
imputer = KNNImputer(n_neighbors=5)
diabetes_imputed = imputer.fit_transform(diabetes)
diabetes_imputed = pd.DataFrame(diabetes_imputed, columns=diabetes.columns)
diabetes_imputed


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,194.0,78.0,0.0,0.0,23.5,0.129,59.0,1.0
1,2.0,146.0,70.0,38.0,360.0,28.0,0.337,29.0,1.0
2,2.0,100.0,66.0,20.0,90.0,32.9,0.867,28.0,1.0
3,4.0,125.0,70.0,18.0,122.0,28.9,1.144,45.0,1.0
4,6.0,115.0,60.0,39.0,0.0,33.7,0.245,40.0,1.0
...,...,...,...,...,...,...,...,...,...
263,10.0,148.0,84.0,48.0,237.0,37.6,1.001,51.0,1.0
264,9.0,112.0,82.0,32.0,175.0,34.2,0.260,36.0,1.0
265,11.0,138.0,74.0,26.0,144.0,36.1,0.557,50.0,1.0
266,10.0,101.0,86.0,37.0,0.0,45.6,1.136,38.0,1.0
