# Data Issue Generation Notebook

### Setup: Import Libraries and Load Dataset

In [13]:

import pandas as pd
import numpy as np
import random

# Seed Python's RNG so the random.sample / random.choices calls in the cells below
# pick the same rows on every Restart & Run All
SEED = 42
random.seed(SEED)

# Load the dataset
dataset_path = '../data/diabetes_dataset.csv'  # Replace with the actual path to your dataset
df = pd.read_csv(dataset_path)

# Take a quick look at the first few rows to understand the data
df.head()


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,1,0,Never,28.27,6.6,90,0
1,Male,65.0,0,0,Current,37.54,6.1,158,0
2,Female,28.0,0,0,Never,27.32,4.5,85,0
3,Male,39.0,0,0,No Info,27.98,6.1,100,0
4,Male,63.0,1,0,Former,29.97,6.8,130,1


### 1. Introduce Missing Values (in 'age' and 'blood_glucose_level')

In [14]:

# Introduce missing values into the dataset
missing_percentage = 0.1  # Fraction of rows that lose their value in each target column
n_missing = int(len(df) * missing_percentage)

# Randomly select row labels for each column. The two draws are independent,
# so a row can end up missing 'age', 'blood_glucose_level', or both.
# NOTE: range(len(df)) is used as .loc labels, which assumes the default 0..n-1 index.
missing_age_indices = random.sample(range(len(df)), n_missing)
missing_blood_glucose_indices = random.sample(range(len(df)), n_missing)

# NaN cannot be stored in an integer column. Cast integer columns to float explicitly
# instead of relying on implicit upcasting during assignment (deprecated in recent pandas).
# The dtype guard keeps the cell re-runnable after later cells have changed these columns.
for column in ('age', 'blood_glucose_level'):
    if pd.api.types.is_integer_dtype(df[column]):
        df[column] = df[column].astype(float)

# Set the selected rows to NaN, one column at a time
df.loc[missing_age_indices, 'age'] = np.nan
df.loc[missing_blood_glucose_indices, 'blood_glucose_level'] = np.nan

# Extract rows where 'age' or 'blood_glucose_level' has missing values
missing_values_df = df[df['age'].isna() | df['blood_glucose_level'].isna()]

# Display a preview of the rows with missing values
missing_values_df.head()


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
10,Female,66.0,1,0,No Info,26.08,6.1,,0
14,Female,58.0,0,0,Never,22.4,5.0,,0
16,Male,0.88,0,0,No Info,20.07,4.8,,0
20,Male,,0,0,Never,45.22,6.0,240.0,1
28,Female,25.0,0,0,No Info,25.86,6.6,,0


### 2. Introduce Missing Columns (drop the 'heart_disease' column)

In [15]:
# Simulate a missing feature: build a frame without the 'heart_disease' column.
# drop() returns a new frame, so df itself keeps the column.
df_no_heart_disease = df.drop('heart_disease', axis='columns')

# Show the first rows of the reduced frame
df_no_heart_disease.head()

Unnamed: 0,gender,age,hypertension,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,1,Never,28.27,6.6,90.0,0
1,Male,65.0,0,Current,37.54,6.1,158.0,0
2,Female,28.0,0,Never,27.32,4.5,85.0,0
3,Male,39.0,0,No Info,27.98,6.1,100.0,0
4,Male,63.0,1,Former,29.97,6.8,130.0,1


### 3. Introduce Unknown Values in 'Gender'

In [17]:
# Pick 5 distinct random rows and overwrite their gender with the placeholder 'Unknown'
unknown_gender_indices = random.sample(range(len(df)), 5)
df.loc[unknown_gender_indices, 'gender'] = 'Unknown'

# Collect every row whose gender is 'Unknown' and preview the first few
unknown_gender_df = df.loc[df['gender'].eq('Unknown')]
unknown_gender_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
11558,Unknown,18.0,0,0,Never,28.36,6.0,155.0,0
20527,Unknown,,0,0,No Info,27.32,6.2,140.0,0
31560,Unknown,54.0,0,0,Current,24.23,6.0,200.0,0
32748,Unknown,48.0,0,0,Current,23.55,5.7,126.0,0
34852,Unknown,54.0,0,0,Current,24.23,6.0,,0


### 4. Introduce Wrong Data Values (in 'age' and 'gender')

In [19]:
# Corrupt 'age' and 'gender' with values of the right type but the wrong meaning.
# Two independent draws of 5 rows each: first for 'age', then for 'gender'.
wrong_age_indices = random.sample(range(len(df)), 5)
wrong_gender_indices = random.sample(range(len(df)), 5)

# A negative age is impossible; 'child' is not a gender category
df.loc[wrong_age_indices, 'age'] = -12
df.loc[wrong_gender_indices, 'gender'] = 'child'

# Gather every row carrying either of the injected values and preview it
wrong_values_df = df.loc[df['age'].eq(-12) | df['gender'].eq('child')]
wrong_values_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
555,Female,-12.0,0,0,No Info,21.96,4.8,158.0,0
6294,child,,0,0,Never,28.36,6.0,155.0,0
7628,Female,-12.0,0,0,No Info,25.86,6.6,100.0,0
10224,Male,-12.0,0,0,No Info,20.07,4.8,158.0,0
13842,child,56.0,0,0,No Info,27.32,6.2,140.0,0


### 5. Introduce Strings in Numerical Columns ('age' and 'blood_glucose_level')

In [22]:
# Introduce string values in numerical columns ('age' and 'blood_glucose_level')

# Select random rows to insert incorrect values (independent draws per column)
string_in_blood_glucose_indices = random.sample(range(len(df)), 25)
string_in_age_indices = random.sample(range(len(df)), 35)

# Incorrect string labels for each column
blood_glucose_values = ['High', 'Low']
age_values = ['Young', 'Old', 'Teen']

# Writing strings into a numeric column relies on implicit upcasting, which recent
# pandas deprecates. Cast both columns to object explicitly first; the resulting
# dtype is the same one the implicit upcast would have produced.
df['blood_glucose_level'] = df['blood_glucose_level'].astype(object)
df['age'] = df['age'].astype(object)

# One vectorised write per column instead of a row-by-row loop.
# The labels are drawn with replacement, blood glucose first and then age.
df.loc[string_in_blood_glucose_indices, 'blood_glucose_level'] = random.choices(
    blood_glucose_values, k=len(string_in_blood_glucose_indices)
)
df.loc[string_in_age_indices, 'age'] = random.choices(
    age_values, k=len(string_in_age_indices)
)

# Extract rows where 'blood_glucose_level' or 'age' contains one of the string labels.
# NOTE: this rebinds wrong_values_df, replacing the frame built in the previous section.
wrong_values_df = df[df['blood_glucose_level'].isin(blood_glucose_values) | df['age'].isin(age_values)]

# Preview the rows with incorrect values
wrong_values_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
1165,Female,18.0,0,0,Never,28.36,6.0,Low,0
3293,Female,Teen,0,0,No Info,25.86,6.6,100.0,0
4378,Female,Old,1,1,Former,27.32,5.7,160.0,0
4631,Male,Old,0,0,No Info,27.32,6.0,145.0,0
5796,Female,Teen,0,0,No Info,27.32,6.2,140.0,0


### 6. Introduce Outliers (e.g., extreme BMI values)

In [23]:
# Plant two implausibly large BMI values to act as outliers

# Two distinct random rows: the first receives 100, the second 200
outlier_bmi_indices = random.sample(range(len(df)), 2)
df.loc[outlier_bmi_indices, 'bmi'] = [100, 200]

# Preview every row whose BMI equals one of the planted values.
# NOTE: the match is by value, so rows altered by an earlier run of this cell
# (or real records with exactly these BMIs) are included as well.
outliers_df = df.loc[df['bmi'].eq(100) | df['bmi'].eq(200)]
outliers_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
7606,Male,41.0,0,0,No Info,100.0,5.7,160.0,0
45439,Female,33.0,0,0,No Info,200.0,6.0,200.0,0
55860,Female,44.0,0,0,No Info,100.0,4.8,158.0,0
80852,Female,56.0,0,0,No Info,200.0,6.2,140.0,0


### 7. Special Characters or Corrupted Data

In [24]:
# Introduce special character corruption in the 'hbA1c_level' column

# Single definition of the corrupt tokens, shared by the insertion and the preview
# filter so the two can never drift apart
special_char_values = ['@#$', '*&%', '!@', '###', '$$$']

# Select 5 random rows to insert special characters
special_char_indices = random.sample(range(len(df)), 5)

# Draw one token per selected row (with replacement, so tokens may repeat)
corrupt_values = random.choices(special_char_values, k=len(special_char_indices))

# Writing strings into a numeric column relies on implicit upcasting, which recent
# pandas deprecates. Cast to object explicitly before the assignment.
df['hbA1c_level'] = df['hbA1c_level'].astype(object)
df.loc[special_char_indices, 'hbA1c_level'] = corrupt_values

# Extract and preview rows where 'hbA1c_level' contains one of the special-character tokens
corrupt_data_df = df[df['hbA1c_level'].isin(special_char_values)]
corrupt_data_df.head()



Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
14159,Female,50.0,1,0,Never,28.17,@#$,126.0,1
30385,Female,26.0,0,0,No Info,27.32,$$$,126.0,0
46931,Female,25.0,0,0,Never,22.82,*&%,145.0,0
62720,Female,48.0,0,0,Never,36.53,*&%,85.0,0
87104,Female,40.0,0,0,Never,22.05,$$$,90.0,0


### 8. Swap Values Between Columns ('age' and 'bmi')

In [25]:
# Choose 5 distinct random rows whose 'age' and 'bmi' values will trade places
swap_indices = random.sample(range(len(df)), 5)

# Keep an independent copy of the untouched values for later comparison
swapped_rows_before = df.loc[swap_indices, ['age', 'bmi']].copy()

# Read the two columns in reversed order as a plain array, then write it back in the
# normal order. Going through an array bypasses column-label alignment, so the values
# really do change columns.
reversed_values = df.loc[swap_indices, ['bmi', 'age']].to_numpy()
df.loc[swap_indices, ['age', 'bmi']] = reversed_values

# Show the same rows after the swap
swapped_rows_after = df.loc[swap_indices, ['age', 'bmi']]
swapped_rows_after.head()

Unnamed: 0,age,bmi
96114,38.96,41.0
18947,27.32,25.0
74480,21.06,17.0
58063,27.6,80.0
47134,34.28,68.0
