In [14]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [44]:
import pandas as pd
import numpy as np

In [84]:
user_profiles = pd.read_csv("../data/user_Profiles.csv")
recent_activity = pd.read_csv("../data/recent_activity.csv")
synthetic_recent_activity = pd.read_csv("../data/Synthetic_Recent_Activity_for_Exercise.csv")
synthetic_user_profiles = pd.read_csv("../data/Synthetic_User_Profiles_for_Exercise.csv")
diet = pd.read_csv("../data/diet.csv")
exercise = pd.read_csv("../data/exercise.csv")

In [85]:
# Check structure and missing values in all datasets
datasets = {
    "User Profiles": user_profiles,
    "Recent Activity": recent_activity,
    "Synthetic Recent Activity (Exercise)": synthetic_recent_activity,
    "Synthetic User Profiles (Exercise)": synthetic_user_profiles,
    "Synthetic User Profiles"
    "Diet": diet
}

In [86]:
# Summarize datasets
summary = {name: df.info() for name, df in datasets.items()}

# Display first few rows of each dataset for analysis
user_profiles.head(), recent_activity.head(), synthetic_recent_activity.head(), synthetic_user_profiles.head(), diet.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   User_Id   98 non-null     object
 1   Veg_Non   98 non-null     object
 2   Nutrient  98 non-null     object
 3   Disease   98 non-null     object
 4   Diet      98 non-null     object
dtypes: object(5)
memory usage: 4.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 983 entries, 0 to 982
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   User_Id    983 non-null    object
 1   Meal_Id    983 non-null    object
 2   Rated      983 non-null    int64 
 3   Liked      983 non-null    int64 
 4   Searched   983 non-null    int64 
 5   Purchased  983 non-null    int64 
 6   Timestamp  983 non-null    object
dtypes: int64(4), object(3)
memory usage: 53.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 963 entries, 0 to 962
Data columns (

(  User_Id  Veg_Non   Nutrient  \
 0  User_1  non-veg   chloride   
 1  User_2      veg   chloride   
 2  User_3      veg  magnesium   
 3  User_4      veg  vitamin_e   
 4  User_5  non-veg  vitamin_c   
 
                                              Disease  \
 0     anemia kidney_disease goitre rickets pregnancy   
 1                                             goitre   
 2    cancer hypertension goitre heart_disease scurvy   
 3   cancer kidney_disease obesity anemia heart_di...   
 4     anemia kidney_disease goitre rickets pregnancy   
 
                                                 Diet  
 0   type_a_diet high_protien_diet low_carb_diet l...  
 1                         high_fiber_diet vegan_diet  
 2   high_fiber_diet ketogenic_diet high_protien_diet  
 3   high_fiber_diet ketogenic_diet gluten_free_di...  
 4   type_a_diet high_protien_diet low_carb_diet l...  ,
    User_Id    Meal_Id  Rated  Liked  Searched  Purchased            Timestamp
 0  User_19   meal_id3      1     

In [87]:
# Standardizing User Profiles Data
user_profiles["User_Id"] = user_profiles["User_Id"].str.lower()
user_profiles["User_Id"] = user_profiles["User_Id"].str.replace("user_", "", regex= True)
user_profiles['Veg_Non'] = user_profiles['Veg_Non'].apply(lambda x: 0 if x == "veg" else 1)

synthetic_user_profiles["User_Id"] = synthetic_user_profiles["User_Id"].str.lower()
synthetic_user_profiles["User_Id"] = synthetic_user_profiles["User_Id"].str.replace("user_", "", regex= True)

diet["Meal_Id"] = diet["Meal_Id"].str.lower()
diet["Meal_Id"] = diet["Meal_Id"].str.replace("meal_id", "", regex= True)
diet["Disease"] = diet["Disease"].apply(lambda x: x.split() if isinstance(x, str) else [])
diet["Diet"] = diet["Diet"].apply(lambda x: x.split() if isinstance(x, str) else [])
diet["Veg_Non"] = diet["Veg_Non"].apply(lambda x: 0 if x == "veg" else 1)
synthetic_recent_activity["User_Id"] = synthetic_recent_activity["User_Id"].str.lower()
synthetic_recent_activity["User_Id"] = synthetic_recent_activity["User_Id"].str.replace("user_", "", regex= True)
synthetic_recent_activity["Timestamp"] = pd.to_datetime(synthetic_recent_activity["Timestamp"])

recent_activity["User_Id"] = recent_activity["User_Id"].str.lower()
recent_activity["User_Id"] = recent_activity["User_Id"].str.replace("user_", "", regex= True)
recent_activity["Meal_Id"] = recent_activity["Meal_Id"].str.replace("meal_id", "", regex= True)
recent_activity["Timestamp"] = pd.to_datetime(recent_activity["Timestamp"])

synthetic_recent_activity["Duration"] = synthetic_recent_activity["Duration"].astype(int)
synthetic_recent_activity["Performed"] = synthetic_recent_activity["Performed"].astype(int)

diet["Meal_Id"] = diet["Meal_Id"].astype(int)
user_profiles["User_Id"] = user_profiles["User_Id"].astype(int)
synthetic_user_profiles["User_Id"] = synthetic_user_profiles["User_Id"].astype(int)
recent_activity["User_Id"] = recent_activity["User_Id"].astype(int)
synthetic_recent_activity["User_Id"] = synthetic_recent_activity["User_Id"].astype(int)

exercise.columns = (
    exercise.columns.str.strip().str.lower().str.replace(" ", "_")
)

# Convert appropriate columns to correct data types
exercise["id"] = exercise["id"].astype(int)
exercise["duration"] = exercise["duration"].astype(int)
exercise["heart_rate"] = exercise["heart_rate"].astype(int)
exercise["exercise_intensity"] = exercise["exercise_intensity"].astype(int)

exercise["calories_burn"] = exercise["calories_burn"].astype(float)
exercise["dream_weight"] = exercise["dream_weight"].astype(float)
exercise["actual_weight"] = exercise["actual_weight"].astype(float)
exercise["bmi"] = exercise["bmi"].astype(float)

# Standardizing categorical fields
exercise["gender"] = exercise["gender"].str.lower().str.strip()
exercise["weather_conditions"] = exercise["weather_conditions"].str.lower().str.strip()
exercise["exercise"] = exercise["exercise"].str.lower().str.strip()

# Standardizing Prices and Dropping Duplicates in Diet Data
diet["Price"] = diet.groupby(["Meal_Id"])["Price"].transform("mean")  # Standardize price
diet = diet.drop_duplicates(subset=["Meal_Id"], keep="first")  # Remove duplicate meals

cleaned_exercise_path = "../data/cleaned/cleaned_exercise.csv"
cleaned_user_profiles_path = "../data/cleaned/cleaned_user_profiles.csv"
cleaned_meals_path = "../data/cleaned/cleaned_meals.csv"
cleaned_exercise_activity_path = "../data/cleaned/cleaned_exercise_activity.csv"
cleaned_recent_activity_path = "../data/cleaned/cleaned_recent_activity.csv"
cleaned_user_profiles_exercise_path = "../data/cleaned/cleaned_exercise_user_profiles.csv"

user_profiles.to_csv(cleaned_user_profiles_path, index=False)
diet.to_csv(cleaned_meals_path, index=False)
synthetic_recent_activity.to_csv(cleaned_exercise_activity_path, index=False)
recent_activity.to_csv(cleaned_recent_activity_path, index=False)
exercise.to_csv(cleaned_exercise_path, index=False)
synthetic_user_profiles.to_csv(cleaned_user_profiles_exercise_path, index = False)

In [88]:
# Cleaning Meals Data
diet["Meal_Id"] = diet["Meal_Id"].str.lower()
diet["Disease"] = diet["Disease"].apply(lambda x: x.split() if isinstance(x, str) else [])
diet["Diet"] = diet["Diet"].apply(lambda x: x.split() if isinstance(x, str) else [])
diet["Price"] = diet["Price"].astype(float)

# Cleaning Exercise Data
synthetic_recent_activity["User_Id"] = synthetic_recent_activity["User_Id"].str.lower()
synthetic_recent_activity["Timestamp"] = pd.to_datetime(synthetic_recent_activity["Timestamp"])
synthetic_recent_activity["Duration"] = synthetic_recent_activity["Duration"].astype(int)

# Cleaning Recent Activity Data
recent_activity["User_Id"] = recent_activity["User_Id"].str.lower()
recent_activity["Timestamp"] = pd.to_datetime(recent_activity["Timestamp"])



AttributeError: Can only use .str accessor with string values!

In [39]:
# Save cleaned datasets to CSV for download
cleaned_user_profiles_path = "../data/cleaned/cleaned_user_profiles.csv"
cleaned_meals_path = "../data/cleaned/cleaned_meals.csv"
cleaned_exercise_activity_path = "../data/cleaned/cleaned_exercise_activity.csv"
cleaned_recent_activity_path = "../data/cleaned/cleaned_recent_activity.csv"

user_profiles.to_csv(cleaned_user_profiles_path, index=False)
diet.to_csv(cleaned_meals_path, index=False)
synthetic_recent_activity.to_csv(cleaned_exercise_activity_path, index=False)
recent_activity.to_csv(cleaned_recent_activity_path, index=False)

# Provide download links
cleaned_user_profiles_path, cleaned_meals_path, cleaned_exercise_activity_path, cleaned_recent_activity_path


('../data/cleaned/cleaned_user_profiles.csv',
 '../data/cleaned/cleaned_meals.csv',
 '../data/cleaned/cleaned_exercise_activity.csv',
 '../data/cleaned/cleaned_recent_activity.csv')

In [40]:
# Load dataset.csv for cleaning
dataset_path = "../data/exercise.csv"
dataset = pd.read_csv(dataset_path)

# Display basic info and first few rows
dataset.info(), dataset.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3864 entries, 0 to 3863
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  3864 non-null   int64  
 1   Exercise            3864 non-null   object 
 2   Calories Burn       3864 non-null   float64
 3   Dream Weight        3864 non-null   float64
 4   Actual Weight       3864 non-null   float64
 5   Age                 3864 non-null   int64  
 6   Gender              3864 non-null   object 
 7   Duration            3864 non-null   int64  
 8   Heart Rate          3864 non-null   int64  
 9   BMI                 3864 non-null   float64
 10  Weather Conditions  3864 non-null   object 
 11  Exercise Intensity  3864 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 362.4+ KB


(None,
    ID     Exercise  Calories Burn  Dream Weight  Actual Weight  Age  Gender  \
 0   1   Exercise 2     286.959851     91.892531      96.301115   45    Male   
 1   2   Exercise 7     343.453036     64.165097      61.104668   25    Male   
 2   3   Exercise 4     261.223465     70.846224      71.766724   20    Male   
 3   4   Exercise 5     127.183858     79.477008      82.984456   33    Male   
 4   5  Exercise 10     416.318374     89.960226      85.643174   29  Female   
 
    Duration  Heart Rate        BMI Weather Conditions  Exercise Intensity  
 0        37         170  29.426275              Rainy                   5  
 1        43         142  21.286346              Rainy                   5  
 2        20         148  27.899592             Cloudy                   4  
 3        39         170  33.729552              Sunny                  10  
 4        34         118  23.286113             Cloudy                   3  )

In [41]:
# Standardize column names (remove spaces and lowercase)
dataset.columns = (
    dataset.columns.str.strip().str.lower().str.replace(" ", "_")
)

# Convert appropriate columns to correct data types
dataset["id"] = dataset["id"].astype(int)
dataset["duration"] = dataset["duration"].astype(int)
dataset["heart_rate"] = dataset["heart_rate"].astype(int)
dataset["exercise_intensity"] = dataset["exercise_intensity"].astype(int)

dataset["calories_burn"] = dataset["calories_burn"].astype(float)
dataset["dream_weight"] = dataset["dream_weight"].astype(float)
dataset["actual_weight"] = dataset["actual_weight"].astype(float)
dataset["bmi"] = dataset["bmi"].astype(float)

# Standardizing categorical fields
dataset["gender"] = dataset["gender"].str.lower().str.strip()
dataset["weather_conditions"] = dataset["weather_conditions"].str.lower().str.strip()
dataset["exercise"] = dataset["exercise"].str.lower().str.strip()

cleaned_exercise_path = "../data/cleaned/cleaned_exercise.csv"
dataset.to_csv(cleaned_exercise_path, index=False)

In [43]:
# Define paths to cleaned CSV files
cleaned_user_profiles_path = "../data/cleaned/cleaned_user_profiles.csv"
cleaned_meals_path = "../data/cleaned/cleaned_meals.csv"
cleaned_exercise_activity_path = "../data/cleaned/cleaned_exercise_activity.csv"
cleaned_recent_activity_path = "../data/cleaned/cleaned_recent_activity.csv"
cleaned_exercise_path = "../data/cleaned/cleaned_exercise.csv"

# Load cleaned datasets
cleaned_user_profiles = pd.read_csv(cleaned_user_profiles_path)
cleaned_meals = pd.read_csv(cleaned_meals_path)
cleaned_exercise_activity = pd.read_csv(cleaned_exercise_activity_path)
cleaned_recent_activity = pd.read_csv(cleaned_recent_activity_path)
cleaned_exercise = pd.read_csv(cleaned_exercise_path)

# Display summary to confirm data loading
datasets_summary = {
    "User Profiles": cleaned_user_profiles.info(),
    "Meals": cleaned_meals.info(),
    "Exercise Activity": cleaned_exercise_activity.info(),
    "Recent Activity": cleaned_recent_activity.info(),
    "Exercise": cleaned_exercise.info(),
}

# Show first few rows of each dataset
cleaned_user_profiles.head(), cleaned_meals.head(), cleaned_exercise_activity.head(), cleaned_recent_activity.head(), cleaned_exercise.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   User_Id   98 non-null     object
 1   Veg_Non   98 non-null     object
 2   Nutrient  98 non-null     object
 3   Disease   98 non-null     object
 4   Diet      98 non-null     object
dtypes: object(5)
memory usage: 4.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512 entries, 0 to 511
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Meal_Id      512 non-null    object 
 1   Name         512 non-null    object 
 2   catagory     512 non-null    object 
 3   description  511 non-null    object 
 4   Veg_Non      512 non-null    object 
 5   Nutrient     512 non-null    object 
 6   Disease      512 non-null    object 
 7   Diet         512 non-null    object 
 8   Price        512 non-null    float64
dtypes: float64(1), object

(  User_Id  Veg_Non   Nutrient  \
 0  user_1  non-veg   chloride   
 1  user_2      veg   chloride   
 2  user_3      veg  magnesium   
 3  user_4      veg  vitamin_e   
 4  user_5  non-veg  vitamin_c   
 
                                              Disease  \
 0     anemia kidney_disease goitre rickets pregnancy   
 1                                             goitre   
 2    cancer hypertension goitre heart_disease scurvy   
 3   cancer kidney_disease obesity anemia heart_di...   
 4     anemia kidney_disease goitre rickets pregnancy   
 
                                                 Diet  
 0   type_a_diet high_protien_diet low_carb_diet l...  
 1                         high_fiber_diet vegan_diet  
 2   high_fiber_diet ketogenic_diet high_protien_diet  
 3   high_fiber_diet ketogenic_diet gluten_free_di...  
 4   type_a_diet high_protien_diet low_carb_diet l...  ,
     Meal_Id                        Name catagory  \
 0  meal_id1         summer squash salad    salad   
 1  mea