In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

Explanation:
- Height Ranges: The height is assigned based on gender-specific ranges.
- BMI Calculation: The BMI is generated within a realistic range (17 to 33).
- Weight Calculation: Weight is calculated using the formula BMI = Weight / (Height in meters)^2, ensuring that it is consistent with the height.
- This setup ensures that the weight corresponds logically to the height and falls within realistic limits.

Uncleaned dataset

In [24]:
# Define the number of samples for the uncleaned synthetic dataset
num_samples = 5000

# Create the data with gender-specific height ranges and integer heights
gender = np.random.choice(['Male', 'Female', 'm', 'f', 'woman'], num_samples, p = [0.40, 0.53, 0.05, 0.01, 0.01]) # Introduce empty and inconsistent values
data_uncleaned = {
    'Age': np.random.randint(18, 70, num_samples),
    'Gender': gender,
    
    'Height cm': np.where(
        gender == 'Male',
        np.random.randint(160, 191, num_samples),  
        np.random.randint(150, 181, num_samples)  
    ),

    # Inconsistent category and missing values
    'Diet Type': np.random.choice(
        ['Vegetarian', 'Vegan', 'Omnivorous', 'Pescatarian', 'Flexitarian', 'all', ''], 
        num_samples, 
        p = [0.10, 0.03, 0.61, 0.05, 0.15, 0.05, 0.01]
    ),
    
    'Daily Fruit Intake': np.random.randint(0, 5, num_samples),
    'Daily Vegetable Intake': np.random.randint(0, 5, num_samples),
    'Daily Meal Count': np.random.randint(1, 6, num_samples),
    
    'Snack Frequency': np.random.choice(
        ['Never', 'Rarely', 'Sometimes', 'Often', 'Always'], num_samples,
        p = [0.12, 0.18, 0.33, 0.20, 0.17]
    ),
    
    'Water Intake L': np.round(np.random.uniform(0.5, 3, num_samples), 2),
    'Soft Drinks Intake L': np.round(np.random.uniform(0, 1, num_samples), 2),
    
    'Daily Coffee Consumption': np.random.choice(['0', '1-2', '3-4', ''], num_samples, p = [0.20, 0.50, 0.20, 0.1]),  
    
    'Alcohol Consumption': np.random.choice(
        ['Never', 'Occasional', 'Moderate', 'High'], num_samples, p = [0.25, 0.28, 0.30, 0.17]                                    
    ),
    
    'Smoking Status': np.random.choice(
        ['Non-Smoker', 'Former Smoker', 'Current Smoker'], 
        num_samples, 
        p = [0.30, 0.40, 0.30]
    ),
    
    'Weekly Exercise Frequency': np.random.choice(
        ['0', '1-2', '3-4', '5-6', 'N/A'], 
        num_samples,
        p = [0.19, 0.25, 0.30, 0.21, 0.05]
    ),  
    
    'Health Condition': np.random.choice(
        ['Healthy', 'Diabetes', 'Hypertension', 'Heart Disease'],
        num_samples,
        p = [0.60, 0.10, 0.25, 0.05]
    ),
    
    'Food Allergies': np.random.choice(
        ['No', 'Milk', 'Eggs', 'Fish', 'Shellfish', 'Peanuts', 'Tree Nuts', 'Wheat', 'Soybeans', 'Sesame'], 
        num_samples, 
        p = [0.85, 0.02, 0.02, 0.01, 0.03, 0.03, 0.02, 0.01, 0.005, 0.005]
    ),
    
    'Food Intolerances': np.random.choice(
        ['No', 'Lactose', 'Gluten', 'Fructose', ''], 
        num_samples,
        p = [0.63, 0.20, 0.10, 0.05, 0.02]
    ),
    
    'Sleep Quality': np.random.choice(
        ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
        num_samples,
        p = [0.15, 0.25, 0.35, 0.15, 0.10]
    )
}

# Convert the unclean dictionary into a DataFrame
dietry_habits_messy = pd.DataFrame(data_uncleaned)

# Generate BMI within a realistic range
dietry_habits_messy['BMI'] = np.round(np.random.uniform(17, 33, num_samples), 1)

# Calculate Weight based on Height and BMI
dietry_habits_messy['Weight kg'] = np.round(dietry_habits_messy['BMI'] * ((dietry_habits_messy['Height cm'] / 100) ** 2), 1)

# Introduce some duplicate rows
dietry_habits_messy = pd.concat([dietry_habits_messy, dietry_habits_messy.sample(100, random_state = 42)], ignore_index = True)

# Introduce some missing values randomly
for col in ['Age', 'Height cm', 'Weight kg', 'Diet Type']:
    dietry_habits_messy.loc[np.random.choice(dietry_habits_messy.index, 50, replace = False), col] = np.nan

# Introduce inconsistencies in BMI calculations
dietry_habits_messy.loc[np.random.choice(dietry_habits_messy.index, 30), 'BMI'] = np.nan  # Introduce missing BMI

# Save the uncleaned and untidy synthetic dataset to a CSV file
dietry_habits_messy.to_csv('dietary_habits_messy.csv', index = False)

In [43]:
dietry_habits_messy = pd.read_csv('data/dietary_habits_messy.csv')

In [45]:
dietry_habits_messy

Unnamed: 0,Age,Gender,Height cm,Diet Type,Daily Fruit Intake,Daily Vegetable Intake,Daily Meal Count,Snack Frequency,Water Intake L,Soft Drinks Intake L,Daily Coffee Consumption,Alcohol Consumption,Smoking Status,Weekly Exercise Frequency,Health Condition,Food Allergies,Food Intolerances,Sleep Quality,BMI,Weight kg
0,69.0,Female,159.0,Vegetarian,2,4,1,Never,0.69,0.76,0,Moderate,Former Smoker,5-6,Hypertension,No,No,Fair,24.3,61.4
1,25.0,Male,172.0,Flexitarian,0,0,4,Sometimes,2.60,0.24,1-2,Never,Current Smoker,1-2,Healthy,No,No,Very Good,26.9,79.6
2,46.0,Male,180.0,Omnivorous,1,2,5,Sometimes,1.13,0.18,1-2,Occasional,Current Smoker,3-4,Heart Disease,No,No,Good,26.7,86.5
3,49.0,Female,163.0,Flexitarian,0,3,1,Rarely,0.97,0.21,0,Moderate,Current Smoker,1-2,Healthy,No,No,Poor,26.5,70.4
4,45.0,f,178.0,Omnivorous,0,1,4,Never,0.91,0.47,3-4,Moderate,Former Smoker,1-2,Healthy,No,No,Fair,22.7,71.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,66.0,Male,183.0,Vegetarian,2,4,2,Sometimes,1.42,0.89,1-2,Moderate,Former Smoker,3-4,Healthy,No,Lactose,Fair,22.5,75.4
5096,68.0,Female,172.0,Omnivorous,0,4,4,Often,1.62,0.94,1-2,Moderate,Non-Smoker,0,Hypertension,Tree Nuts,No,Fair,22.2,65.7
5097,29.0,Female,167.0,Pescatarian,2,4,4,Rarely,1.29,0.60,3-4,Occasional,Non-Smoker,3-4,Hypertension,No,No,Good,19.0,53.0
5098,44.0,Female,179.0,Omnivorous,2,1,1,Sometimes,1.08,0.07,3-4,Occasional,Former Smoker,3-4,Diabetes,No,Lactose,Good,26.8,85.9


In [28]:
dietry_habits_messy.shape

(5100, 20)

In [30]:
dietry_habits_messy.duplicated().sum()

90

In [32]:
dietry_habits_messy.isna().sum()

Age                          50
Gender                        0
Height cm                    50
Diet Type                    50
Daily Fruit Intake            0
Daily Vegetable Intake        0
Daily Meal Count              0
Snack Frequency               0
Water Intake L                0
Soft Drinks Intake L          0
Daily Coffee Consumption      0
Alcohol Consumption           0
Smoking Status                0
Weekly Exercise Frequency     0
Health Condition              0
Food Allergies                0
Food Intolerances             0
Sleep Quality                 0
BMI                          30
Weight kg                    50
dtype: int64

In [34]:
dietry_habits_messy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        5050 non-null   float64
 1   Gender                     5100 non-null   object 
 2   Height cm                  5050 non-null   float64
 3   Diet Type                  5050 non-null   object 
 4   Daily Fruit Intake         5100 non-null   int32  
 5   Daily Vegetable Intake     5100 non-null   int32  
 6   Daily Meal Count           5100 non-null   int32  
 7   Snack Frequency            5100 non-null   object 
 8   Water Intake L             5100 non-null   float64
 9   Soft Drinks Intake L       5100 non-null   float64
 10  Daily Coffee Consumption   5100 non-null   object 
 11  Alcohol Consumption        5100 non-null   object 
 12  Smoking Status             5100 non-null   object 
 13  Weekly Exercise Frequency  5100 non-null   objec

In [36]:
# # Rearrange the columns 
# dietry_habits_messy = dietry_habits_messy[['Age', 'Gender', 'Height cm', 'Weight kg', 'BMI', 'Diet Type', 
#                                            'Daily Fruit Intake', 'Daily Vegetable Intake', 'Daily Meal Count',
#                                            'Snack Frequency', 'Daily Water Intake L', 'Daily Soft Drinks Intake L',
#                                            'Daily Coffee Consumption', 'Alcohol Consumption', 'Food Allergies', 
#                                            'Food Intolerances','Smoking Status', 'Weekly Exercise Frequency', 
#                                            'Health Condition', 'Sleep Quality', 'Extra_Column']]


In [38]:
# # Generate the clean synthetic dataset again
# # Define the number of samples for synthetic data
# num_samples = 5000

# # Create the data with gender-specific height ranges
# data = {
#     'Age': np.random.randint(18, 70, num_samples),
#     'Gender': np.random.choice(['Male', 'Female'], num_samples),
#     'Height_cm': np.where(
#         np.random.choice(['Male', 'Female'], num_samples) == 'Male',
#         np.random.randint(160, 191, num_samples),  # Integer height for males
#         np.random.randint(150, 181, num_samples)   # Integer height for females
#     ),
# }

# # Convert the dictionary into a DataFrame
# dietry_habits_synthetic_cleaned = pd.DataFrame(data)

# # Generate BMI within a realistic range
# dietry_habits_synthetic_cleaned['BMI'] = np.round(np.random.uniform(18.5, 35, num_samples), 1)

# # Calculate Weight based on Height and BMI
# dietry_habits_synthetic_cleaned['Weight_kg'] = np.round(dietry_habits_synthetic_clean['BMI'] * ((dietry_habits_synthetic_clean['Height_cm'] / 100) ** 2), 1)

# # Rearrange the columns to have BMI after Weight_kg
# dietry_habits_synthetic_clean = dietry_habits_synthetic_clean[['Age', 'Gender', 'Height_cm', 'Weight_kg', 'BMI']]

# # Add other dietary habit columns
# dietry_habits_synthetic_clean['Diet_Type'] = np.random.choice(['Vegetarian', 'Vegan', 'Non-Vegetarian', 'Pescatarian'], num_samples)
# dietry_habits_synthetic_clean['Daily_Fruit_Intake'] = np.random.randint(0, 5, num_samples)
# dietry_habits_synthetic_clean['Daily_Vegetable_Intake'] = np.random.randint(0, 5, num_samples)
# dietry_habits_synthetic_clean['Daily_Meal_Count'] = np.random.randint(1, 6, num_samples)
# dietry_habits_synthetic_clean['Snack_Frequency'] = np.random.choice(['Never', 'Rarely', 'Sometimes', 'Often', 'Always'], num_samples)
# dietry_habits_synthetic_clean['Water_Intake_Liters'] = np.round(np.random.uniform(0.5, 3, num_samples), 2)
# dietry_habits_synthetic_clean['Soft_Drinks_Intake_Liters'] = np.round(np.random.uniform(0, 2, num_samples), 2)
# dietry_habits_synthetic_clean['Coffee_Consumption_Per_Day'] = np.random.choice(['None', '1-2', '3-4'], num_samples)
# dietry_habits_synthetic_clean['Alcohol_Consumption'] = np.random.choice(['None', 'Occasional', 'Moderate', 'High'], num_samples)
# dietry_habits_synthetic_clean['Smoking_Status'] = np.random.choice(['Non-Smoker', 'Former Smoker', 'Current Smoker'], num_samples)
# dietry_habits_synthetic_clean['Exercise_Frequency_Per_Week'] = np.random.choice(['None', '1-2', '3-4', '5+'], num_samples)
# dietry_habits_synthetic_clean['Health_Condition'] = np.random.choice(['None', 'Diabetes', 'Hypertension', 'Heart Disease'], num_samples)
# dietry_habits_synthetic_clean['Food_Allergies'] = np.random.choice(['None', 'Milk', 'Eggs', 'Fish', 'Shellfish', 'Peanuts', 'Tree Nuts', 'Wheat', 'Soybeans', 'Sesame'], num_samples)
# dietry_habits_synthetic_clean['Food_Intolerance'] = np.random.choice(['None', 'Lactose', 'Histamine', 'Gluten'], num_samples)

# # Save the synthetic dataset to a CSV file
# dietry_habits_synthetic_clean.to_csv('dietary_habits_synthetic.csv', index=False)

# # # Display the first few rows of the synthetic dataset
# # print(synthetic_df.head())