# Data Issue Generation Notebook

### 1. Import Libraries and Load Dataset

In [1]:

import pandas as pd
import numpy as np
import random

# Load the dataset 
dataset_path = '../airflow/data/diabetes_dataset.csv'  
df = pd.read_csv(dataset_path)
df_errors = df.copy()
df_errors.rename(columns={'diabetes': 'actual_label'}, inplace=True)
df_errors.head()


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,actual_label
0,Female,80.0,0,0,No Info,10.01,6.0,100,0
1,Male,8.0,0,0,No Info,10.01,6.0,140,0
2,Male,38.0,0,0,Never,10.08,6.1,130,0
3,Male,11.0,0,0,No Info,10.14,6.1,160,0
4,Male,39.0,0,0,Never,10.19,6.5,85,0


### 1. Introduce Missing Values (in 'Age' and 'Blood Glucose Level')

In [None]:
missing_percentage = 0.1
n_missing = int(len(df_errors) * missing_percentage)

missing_age_indices = random.sample(range(len(df_errors)), n_missing)
missing_bgl_indices = random.sample(range(len(df_errors)), n_missing)

df_errors.loc[missing_age_indices, 'age'] = np.nan
df_errors.loc[missing_bgl_indices, 'blood_glucose_level'] = np.nan
df_errors.loc[missing_age_indices, 'hbA1c_level'] = np.nan  
df_errors.loc[missing_bgl_indices, 'gender'] = np.nan 

### 2. Introduce Unknown Values in 'Gender'

In [None]:
unknown_gender_indices = random.sample(range(len(df_errors)), 25)
df_errors.loc[unknown_gender_indices, 'gender'] = 'Unknown'

### 3. Introduce Wrong data Values for Features

In [None]:
wrong_age_indices = random.sample(range(len(df_errors)), 25)
wrong_gender_indices = random.sample(range(len(df_errors)), 25)

df_errors.loc[wrong_age_indices, 'age'] = -12
df_errors.loc[wrong_gender_indices, 'gender'] = 'child'

### 4. Introduce String in Numerical Columns (e.g., 'Blood Glucose Level')

In [6]:
string_in_bgl_indices = random.sample(range(len(df_errors)), 25)
string_in_age_indices = random.sample(range(len(df_errors)), 35)

bgl_strings = ['High', 'Low']
age_strings = ['Young', 'Old', 'Teen']

for idx, val in zip(string_in_bgl_indices, random.choices(bgl_strings, k=len(string_in_bgl_indices))):
    df_errors.loc[idx, 'blood_glucose_level'] = val

for idx, val in zip(string_in_age_indices, random.choices(age_strings, k=len(string_in_age_indices))):
    df_errors.loc[idx, 'age'] = val

### 5. Introduce Outliers (e.g., extreme BMI values)

In [7]:
outlier_bmi_indices = random.sample(range(len(df_errors)), 20)
df_errors.loc[outlier_bmi_indices, 'bmi'] = random.choices([100, 200], k=len(outlier_bmi_indices))

### 6. Special Characters or Corrupted Data

In [None]:
special_char_indices = random.sample(range(len(df_errors)), 25)
corrupt_values = random.choices(['@#$', '*&%', '!@', '###', '$$$'], k=len(special_char_indices))

df_errors.loc[special_char_indices, 'hbA1c_level'] = corrupt_values

  df.loc[special_char_indices, 'hbA1c_level'] = corrupt_values


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
22211,Female,31.0,0,0,Never,45.13,@#$,100.0,0
44837,Male,,0,0,Current,23.55,###,,0
80311,Male,0.88,0,0,No Info,20.07,@#$,158.0,0
80647,Male,15.0,0,0,No Info,27.32,###,,0
86522,Female,25.0,0,0,Never,22.82,$$$,145.0,0


### 8. Random Duplicate rows

In [11]:
# Randomly select 5 rows to swap 'age' and 'bmi' values 
swap_indices = random.sample(range(len(df)), 5)

# Save the original values before swapping (for reference)  
swapped_rows_before = df.loc[swap_indices, ['age', 'bmi']].copy()

# Swap the values between 'age' and 'bmi' columns 
df.loc[swap_indices, ['age', 'bmi']] = df.loc[swap_indices, ['bmi', 'age']].values  

# Preview the rows after swapping
swapped_rows_after = df.loc[swap_indices, ['age', 'bmi']]
swapped_rows_after.head()

Unnamed: 0,age,bmi
11759,19.85,36.0
21384,22.72,41.0
13891,31.68,51.0
42750,38.96,41.0
43386,21.96,44.0
