# Data Issue Generation Notebook

### 1. Import Libraries and Load Dataset

In [None]:

import pandas as pd
import numpy as np
import random

# Load the dataset 
dataset_path = '../airflow/data/diabetes_dataset.csv'  
df = pd.read_csv(dataset_path)
df_errors = df.copy()
df_errors.rename(columns={'diabetes': 'actual_label'}, inplace=True)
df_errors.head()


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,1,0,Never,28.27,6.6,90,0
1,Male,65.0,0,0,Current,37.54,6.1,158,0
2,Female,28.0,0,0,Never,27.32,4.5,85,0
3,Male,39.0,0,0,No Info,27.98,6.1,100,0
4,Male,63.0,1,0,Former,29.97,6.8,130,1


### 1. Introduce Missing Values (in 'Age' and 'Blood Glucose Level')

In [None]:
# --- 1. Missing Values in 'age' and 'blood_glucose_level' ---
missing_percentage = 0.1
n_missing = int(len(df_errors) * missing_percentage)

missing_age_indices = random.sample(range(len(df_errors)), n_missing)
missing_bgl_indices = random.sample(range(len(df_errors)), n_missing)

df_errors.loc[missing_age_indices, 'age'] = np.nan
df_errors.loc[missing_bgl_indices, 'blood_glucose_level'] = np.nan
df_errors.loc[missing_age_indices, 'hbA1c_level'] = np.nan  
df_errors.loc[missing_bgl_indices, 'gender'] = np.nan 

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
2,Female,28.0,0,0,Never,27.32,4.5,,0
3,Male,39.0,0,0,No Info,27.98,6.1,,0
8,Male,,0,0,No Info,20.07,4.8,158.0,0
20,Male,48.0,0,0,Never,45.22,6.0,,1
23,Female,56.0,0,0,No Info,27.32,6.2,,0


### 2. Introduce Unknown Values in 'Gender'

In [None]:
# --- 2. Unknown/Invalid Categories in 'gender' ---
unknown_gender_indices = random.sample(range(len(df_errors)), 25)
df_errors.loc[unknown_gender_indices, 'gender'] = 'Unknown'

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
6252,Unknown,54.0,0,0,Current,24.23,6.0,200.0,0
14274,Unknown,,0,0,Never,22.82,5.0,145.0,0
63503,Unknown,61.0,0,0,Former,29.39,5.8,200.0,0
90548,Unknown,36.0,0,0,No Info,19.85,6.6,85.0,0
92887,Unknown,54.0,0,0,Current,24.23,6.0,200.0,0


### 4. Introduce Wrong data Values for Features

In [7]:
# Introduce incorrect values in the 'Age' and 'Gender' columns
wrong_age_indices = random.sample(range(len(df)), 5)  # Pick 5 random rows for wrong 'Age'
wrong_gender_indices = random.sample(range(len(df)), 5)  # Pick 5 random rows for wrong 'Gender'

df.loc[wrong_age_indices, 'age'] = -12  # Set negative age (invalid value) for selected rows 
df.loc[wrong_gender_indices, 'gender'] = 'child'  # Assign an incorrect category to the 'gender' column

# Extract rows where 'age' is negative or 'gender' has an incorrect value
wrong_values_df = df[(df['age'] == -12) | (df['gender'] == 'child')]

# Preview the rows with incorrect values
wrong_values_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
51966,Female,-12.0,0,0,Never,22.82,5.0,145.0,0
55873,child,80.0,1,0,Never,28.27,6.6,,0
59658,child,48.0,0,0,Current,23.55,5.7,126.0,0
61061,Female,-12.0,0,0,No Info,27.32,6.2,140.0,0
67342,Female,-12.0,0,0,No Info,27.32,4.0,,0


### 5. Introduce String in Numerical Columns (e.g., 'Blood Glucose Level')

In [8]:
# Introduce string values in numerical columns ('age' and 'blood_glucose_level')  

# Select random rows to insert incorrect values  
string_in_blood_glucose_indices = random.sample(range(len(df)), 25)  
string_in_age_indices = random.sample(range(len(df)), 35)

# Define incorrect values for 'blood_glucose_level'
blood_glucose_values = ['High', 'Low']

# Assign random incorrect string values to 'blood_glucose_level'
for idx, value in zip(string_in_blood_glucose_indices, random.choices(blood_glucose_values, k=len(string_in_blood_glucose_indices))):
    df.loc[idx, 'blood_glucose_level'] = value

# Define incorrect values for 'age'
age_values = ['Young', 'Old', 'Teen']

# Assign random incorrect string values to 'age'
for idx, value in zip(string_in_age_indices, random.choices(age_values, k=len(string_in_age_indices))):
    df.loc[idx, 'age'] = value

# Extract rows where 'blood_glucose_level' or 'age' contains incorrect string values  
wrong_values_df = df[df['blood_glucose_level'].isin(blood_glucose_values) | df['age'].isin(age_values)]

# Preview the rows with incorrect values
wrong_values_df.head()

  df.loc[idx, 'blood_glucose_level'] = value
  df.loc[idx, 'age'] = value


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
1290,Female,Young,0,0,Never,30.51,4.5,159.0,0
5110,Male,Young,1,0,Former,55.33,4.5,200.0,0
5355,Male,59.0,0,0,Current,34.07,5.8,Low,0
5560,Male,26.0,0,0,Never,23.8,6.5,High,0
6115,Male,0.88,0,0,No Info,20.07,4.8,Low,0


### 6. Introduce Outliers (e.g., extreme BMI values)

In [9]:
# Introduce extreme outliers in the 'bmi' column  

# Select 2 random rows to assign outlier values 
outlier_bmi_indices = random.sample(range(len(df)), 2)
df.loc[outlier_bmi_indices, 'bmi'] = [100, 200]  # Set extremely high BMI values to simulate outliers 

# Extract and preview the rows containing BMI outliers
outliers_df = df[df['bmi'].isin([100, 200])]
outliers_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
73729,Male,16.0,0,0,Former,200.0,6.2,126.0,0
98203,Female,7.0,0,0,No Info,100.0,3.5,126.0,0


### 7. Special Characters or Corrupted Data

In [10]:
# Introduce special character corruption in the 'hbA1c_level' column  

# Select 5 random rows to insert special characters  
special_char_indices = random.sample(range(len(df)), 5)

# Define a list of special character values 
corrupt_values = random.choices(['@#$', '*&%', '!@', '###', '$$$'], k=len(special_char_indices))

# Assign these corrupt values to the selected rows in the 'hbA1c_level' column 
df.loc[special_char_indices, 'hbA1c_level'] = corrupt_values 

# Extract and preview rows where 'hbA1c_level' contains special characters 
corrupt_data_df = df[df['hbA1c_level'].isin(['@#$', '*&%', '!@', '###', '$$$'])]
corrupt_data_df.head()



  df.loc[special_char_indices, 'hbA1c_level'] = corrupt_values


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
22211,Female,31.0,0,0,Never,45.13,@#$,100.0,0
44837,Male,,0,0,Current,23.55,###,,0
80311,Male,0.88,0,0,No Info,20.07,@#$,158.0,0
80647,Male,15.0,0,0,No Info,27.32,###,,0
86522,Female,25.0,0,0,Never,22.82,$$$,145.0,0


### 8. Random Duplicate rows

In [11]:
# Randomly select 5 rows to swap 'age' and 'bmi' values 
swap_indices = random.sample(range(len(df)), 5)

# Save the original values before swapping (for reference)  
swapped_rows_before = df.loc[swap_indices, ['age', 'bmi']].copy()

# Swap the values between 'age' and 'bmi' columns 
df.loc[swap_indices, ['age', 'bmi']] = df.loc[swap_indices, ['bmi', 'age']].values  

# Preview the rows after swapping
swapped_rows_after = df.loc[swap_indices, ['age', 'bmi']]
swapped_rows_after.head()

Unnamed: 0,age,bmi
11759,19.85,36.0
21384,22.72,41.0
13891,31.68,51.0
42750,38.96,41.0
43386,21.96,44.0
