In [41]:
import pandas as pd
import numpy as np
import random
import os
import glob

In [42]:
# Define file paths
# Input dataset, given as a path relative to the working directory
input_file_path = 'data_from_kaggle/kaggle_data.csv'
# Folder that receives the generated file, and the full path of that file inside it
output_folder = r'D:\DSP FILES\airflow_docker_one\data'
output_file_path = os.path.join(output_folder, 'raw_data.csv')

In [43]:
# Create output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Clear the destination folder by removing any existing files.
# glob.escape() stops characters such as '[' or '*' in the folder path from being
# interpreted as wildcards.
files = glob.glob(os.path.join(glob.escape(output_folder), '*'))
for f in files:
    # os.remove() raises on a directory, so real sub-directories are left in place
    # instead of aborting the cell; files and symlinks are still removed.
    if os.path.isdir(f) and not os.path.islink(f):
        continue
    os.remove(f)

In [44]:
# Load the input CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer=input_file_path)

In [45]:
# 1. Remove the 'hypertension' column, then preview the first rows
df = df.drop(columns=['hypertension'])
df.head()

Unnamed: 0,gender,age,heart_disease,glucose_level,bmi,stroke
0,Female,14,0,139.67,14.1,0
1,Male,15,0,70.33,16.9,0
2,Female,16,0,139.67,14.1,0
3,Male,17,0,70.33,16.9,0
4,Male,16,0,114.71,17.4,0


In [46]:
# Store 'age' with object dtype so the column can hold mixed types
# (numbers, NaN and text are written into it by later cells)
df = df.astype({'age': object})

In [47]:
# 2. Delete 5 rows from 'age' column
def delete_rows(df, column_name, num_rows_to_delete):
    """Blank out randomly chosen cells of a single column.

    Despite the name, no row is dropped: ``num_rows_to_delete`` distinct index
    labels are drawn with ``np.random.choice`` and ``column_name`` is set to NaN
    on those rows.

    Args:
        df: DataFrame to modify; it is changed in place.
        column_name: Column whose values are replaced by NaN.
        num_rows_to_delete: Number of distinct rows to blank out.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If ``df`` has fewer rows than ``num_rows_to_delete``.
    """
    row_count = len(df)
    if num_rows_to_delete > row_count:
        raise ValueError(f"Not enough rows to delete {num_rows_to_delete} from column {column_name}")
    # Draw without replacement so exactly num_rows_to_delete different rows are hit
    picked_labels = np.random.choice(df.index, size=num_rows_to_delete, replace=False)
    df.loc[picked_labels, column_name] = np.nan
    return df

# Blank out 'age' on 5 randomly chosen rows, then preview the first rows
df = delete_rows(df=df, column_name='age', num_rows_to_delete=5)
df.head()

Unnamed: 0,gender,age,heart_disease,glucose_level,bmi,stroke
0,Female,14,0,139.67,14.1,0
1,Male,15,0,70.33,16.9,0
2,Female,16,0,139.67,14.1,0
3,Male,17,0,70.33,16.9,0
4,Male,16,0,114.71,17.4,0


In [48]:
# 3. Put 'France', 'Germany', or 'USA' in 5 rows of 'gender'
rows_missing_age = df.index[df['age'].isna()]
gender_indices = df.index.difference(rows_missing_age)  # candidates: rows whose 'age' is not NaN
random_countries = random.choices(['France', 'Germany', 'USA'], k=5)
country_rows = random.sample(gender_indices.tolist(), 5)  # 5 distinct rows
df.loc[country_rows, 'gender'] = random_countries
df.head()

Unnamed: 0,gender,age,heart_disease,glucose_level,bmi,stroke
0,Female,14,0,139.67,14.1,0
1,Male,15,0,70.33,16.9,0
2,Female,16,0,139.67,14.1,0
3,Male,17,0,70.33,16.9,0
4,Male,16,0,114.71,17.4,0


In [49]:
# 4. Put negative values in 'age' for 5 rows (excluding existing NaNs)
rows_missing_age = df.index[df['age'].isna()]
negative_age_indices = df.index.difference(rows_missing_age)
# Values are drawn before the rows are sampled, which is the order the
# one-line assignment evaluates them in (right-hand side first)
negative_ages = [-random.randint(1, 100) for _ in range(5)]
negative_age_rows = random.sample(negative_age_indices.tolist(), 5)
df.loc[negative_age_rows, 'age'] = negative_ages

In [50]:
# 5. Put 'bmi' values above 100 (uniform in 101..150) in 5 rows
rows_missing_age = df.index[df['age'].isna()]
bmi_indices = df.index.difference(rows_missing_age)  # candidates: rows whose 'age' is not NaN
# Values first, rows second: same evaluation order as a single assignment statement
oversized_bmis = [random.uniform(101, 150) for _ in range(5)]
oversized_bmi_rows = random.sample(bmi_indices.tolist(), 5)
df.loc[oversized_bmi_rows, 'bmi'] = oversized_bmis

In [51]:
# 6. Put a value greater than 2 (3, 4 or 5) in 'heart_disease' for 10 rows
rows_missing_age = df.index[df['age'].isna()]
heart_disease_indices = df.index.difference(rows_missing_age)
# Values first, rows second: same evaluation order as a single assignment statement
invalid_flags = random.choices([3, 4, 5], k=10)
invalid_flag_rows = random.sample(heart_disease_indices.tolist(), 10)
df.loc[invalid_flag_rows, 'heart_disease'] = invalid_flags

In [52]:
# 7. Simulate two columns with the same name for 'heart_disease'
# Copy the column under a temporary name, append that copy relabelled as
# 'heart_disease', then drop the temporary column again.
df['heart_disease_duplicate'] = df['heart_disease']
relabelled_copy = df[['heart_disease_duplicate']].rename(
    columns={'heart_disease_duplicate': 'heart_disease'}
)
df = pd.concat([df, relabelled_copy], axis=1).drop(columns='heart_disease_duplicate')

In [53]:
# 8. Add random numbers to 'gender' column for 5 rows
rows_missing_age = df.index[df['age'].isna()]
gender_indices = df.index.difference(rows_missing_age)  # candidates: rows whose 'age' is not NaN
# Three-digit numbers, stored as strings
random_numbers = [str(random.randint(100, 999)) for _ in range(5)]
numeric_gender_rows = random.sample(gender_indices.tolist(), 5)
df.loc[numeric_gender_rows, 'gender'] = random_numbers

In [54]:
# 9. Insert dates into 'age' for 5 randomly selected rows (ensure column is treated as object dtype)
rows_missing_age = df.index[df['age'].isna()]
date_indices = df.index.difference(rows_missing_age)
selected_date_indices = random.sample(date_indices.tolist(), 5)
dates = pd.to_datetime(random.choices(['2023-01-01', '2024-07-15', '2022-12-25'], k=5))
# Write the dates back as 'YYYY-MM-DD' text
df.loc[selected_date_indices, 'age'] = dates.strftime('%Y-%m-%d').tolist()

In [55]:
# 10. Put incorrect decimal values in 'bmi' for 5 rows
rows_missing_age = df.index[df['age'].isna()]
incorrect_bmi_indices = df.index.difference(rows_missing_age)
# Values in 20..40 rounded to 7 decimal places; drawn before the rows are sampled,
# matching the evaluation order of a single assignment statement
overprecise_bmis = [round(random.uniform(20, 40), 7) for _ in range(5)]
overprecise_bmi_rows = random.sample(incorrect_bmi_indices.tolist(), 5)
df.loc[overprecise_bmi_rows, 'bmi'] = overprecise_bmis

In [56]:
# 11. Insert food names into 'glucose_level' for 10 rows
# Cast to str first so the column can hold text next to the stringified numbers
df['glucose_level'] = df['glucose_level'].astype(str)
food_names = random.choices(['apple', 'banana', 'pizza', 'burger'], k=10)
# Candidate rows (those whose 'age' is not NaN) are computed in this cell rather
# than reusing a variable left over from the 'bmi' cell.
glucose_indices = df.index.difference(df.index[df['age'].isna()])
# Write into the existing 'glucose_level' column. The label 'avg_glucose_level'
# is not a column of this frame, so assigning to it through .loc would add a new,
# mostly-NaN column and leave 'glucose_level' unchanged.
df.loc[random.sample(glucose_indices.tolist(), 10), 'glucose_level'] = food_names

In [57]:
# 12. Replace values in 'age' with number words for 5 rows
age_strings = ['twenty five', 'thirty', 'forty five', 'sixty', 'eighty']
# Rows are sampled from the whole index, so rows whose 'age' is NaN can be picked too
all_row_labels = df.index.tolist()
df.loc[random.sample(all_row_labels, 5), 'age'] = age_strings

In [58]:
# Write the DataFrame with the injected errors to the output CSV (index omitted)
# and report where it was saved
df.to_csv(path_or_buf=output_file_path, index=False)
print(f"Dataset with errors has been saved to {output_file_path}")

Dataset with errors has been saved to D:\DSP FILES\airflow_docker_one\data\raw_data.csv
