In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load datasets and rename columns to lowercase
df1 = pd.read_csv("dataset/dataset1.csv")[['user_id', 'date', 'steps']].rename(columns=str.lower)
df2 = pd.read_csv("dataset/dataset2.csv")[['Gender', 'Height', 'Weight', 'Index']].rename(columns=str.lower)
df3 = pd.read_csv("dataset/dataset3.csv")[['Person ID', 'Gender', 'Age']].rename(columns=lambda x: x.lower().replace(" ", "_"))

# Ensure consistency in column types
df2['gender'] = df2['gender'].astype(str)
df3['gender'] = df3['gender'].astype(str)

In [3]:
# Calculate daily average steps per user from Dataset 1
df1_daily_steps = df1.groupby('user_id')['steps'].mean().reset_index()
df1_daily_steps.rename(columns={'steps': 'daily_steps'}, inplace=True)

In [4]:
# Add underweight entries to Dataset 2
underweight_entries = pd.DataFrame({
    'gender': ['female', 'male'],
    'height': [160, 180],
    'weight': [45, 55],
    'index': [999, 998]
})
df2 = pd.concat([df2, underweight_entries], ignore_index=True)

In [5]:
# Calculate BMI, target weight, and weight adjustment for all entries in Dataset 2
df2['bmi'] = df2['weight'] / ((df2['height'] / 100) ** 2)
df2['target_weight'] = 22 * ((df2['height'] / 100) ** 2)
df2['weight_adjustment'] = df2['target_weight'] - df2['weight']

In [6]:
# Merge datasets on gender and user identifiers
merged_df = df2.merge(df3, on='gender', how='inner')
final_df = merged_df.merge(df1_daily_steps, left_on='person_id', right_on='user_id', how='inner')

In [7]:
# Calculate BMI category and adjust steps based on factors
final_df['bmi_category'] = pd.cut(
    final_df['bmi'],
    bins=[0, 18.5, 24.9, 29.9, float('inf')],
    labels=['underweight', 'normal weight', 'overweight', 'obesity']
)

def calculate_adjusted_steps(row):
    # Set base steps based on BMI category
    if row['bmi_category'] == 'underweight':
        base_steps = 2500
    elif row['bmi_category'] == 'normal weight':
        base_steps = 4500
    elif row['bmi_category'] == 'overweight':
        base_steps = 3500
    else:
        base_steps = 3000

    # Adjust steps based on age and gender
    if row['age'] < 30:
        base_steps += 500
    elif 30 <= row['age'] <= 50:
        base_steps += 250
    else:
        base_steps -= 500

    base_steps += 500 if row['gender'] == 'male' else -200

    # Adjust steps based on weight adjustment for weight goals
    if row['weight_adjustment'] > 0:
        base_steps += 1000
    elif row['weight_adjustment'] < 0:
        base_steps -= 500

    return base_steps

In [8]:
# Apply the step adjustment function and clean up columns
final_df['daily_steps'] = final_df.apply(calculate_adjusted_steps, axis=1).astype(int)
final_df['bmi'] = final_df['bmi'].round(1)
final_df['target_weight'] = final_df['target_weight'].round(1)
final_df['weight_adjustment'] = (final_df['target_weight'] - final_df['weight']).round(1)  # Round to 1 decimal place

# Drop unnecessary columns
final_df = final_df.drop(columns=['user_id', 'person_id'])

In [9]:
print(final_df[['gender', 'age', 'height', 'weight', 'bmi', 'bmi_category', 
                'daily_steps', 'target_weight', 'weight_adjustment']].head())

  gender  age  height  weight   bmi bmi_category  daily_steps  target_weight  \
0   Male   27     174      96  31.7      obesity         2800           66.6   
1   Male   28     174      96  31.7      obesity         2800           66.6   
2   Male   28     174      96  31.7      obesity         2800           66.6   
3   Male   28     174      96  31.7      obesity         2800           66.6   
4   Male   28     174      96  31.7      obesity         2800           66.6   

   weight_adjustment  
0              -29.4  
1              -29.4  
2              -29.4  
3              -29.4  
4              -29.4  


In [10]:
final_df.to_csv("dataset/final_df.csv", index=False)