In [6]:
import pandas as pd
import os

def _normalize_program_key(frame, label):
    """Return a copy of ``frame`` whose join key is a clean ``program`` column.

    The key column is located case-insensitively (``Program``, ``program``,
    ``PROGRAM`` ...), renamed to ``program``, converted to text and stripped
    of surrounding whitespace. Missing keys are kept as missing rather than
    being turned into the literal string "nan". The input frame is not
    modified.

    Parameters:
        frame: DataFrame that should contain a program column.
        label: Short name used in error messages (e.g. "df1").

    Raises:
        KeyError: if no program column is present.
        ValueError: if more than one column normalises to "program".
    """
    matches = [col for col in frame.columns if str(col).strip().lower() == "program"]
    if not matches:
        raise KeyError(
            f"{label} has no 'program' column; found columns: {list(frame.columns)}"
        )
    if len(matches) > 1:
        raise ValueError(f"{label} has several candidate 'program' columns: {matches}")

    renamed = frame.rename(columns={matches[0]: "program"})
    key = renamed["program"]
    # astype(str) would turn NaN/None into the text "nan"/"None"; mask those
    # positions back to missing so they cannot act as a real join key.
    cleaned_key = key.astype(str).str.strip().where(key.notna())
    return renamed.assign(program=cleaned_key)


def merge_on_program(df1, df2):
    """Inner-join two frames on their normalised ``program`` column.

    Rows whose program is missing are dropped from both sides before the
    join: pandas matches null keys against each other, so leaving them in
    would pair unrelated "unknown program" rows from the two files.

    Returns:
        A new DataFrame holding only rows whose program appears in both inputs.
    """
    left = _normalize_program_key(df1, "df1").dropna(subset=["program"])
    right = _normalize_program_key(df2, "df2").dropna(subset=["program"])
    return pd.merge(left, right, on="program", how="inner")


# Define relative file paths (moving up one level from "Scripts/")
file1 = os.path.join("..", "Cleaned Data", "Samuel_2025-02-23.csv")
file2 = os.path.join("..", "Data", "BCIT.csv")
output_file = os.path.join("..", "Data", "Merged_Data.csv")  # Save in Data folder

# Check if files exist before proceeding, and report exactly which are missing
missing_files = [path for path in (file1, file2) if not os.path.exists(path)]
if missing_files:
    print("Error: One or both files not found. Please check the file paths.")
    print("Missing:", ", ".join(missing_files))
else:
    # Read the CSV files
    df1 = pd.read_csv(file1, encoding="utf-8")
    df2 = pd.read_csv(file2, encoding="utf-8")

    # Debug: Print column names
    print("Columns in df1:", df1.columns)
    print("Columns in df2:", df2.columns)

    # Rebind (rather than mutate in place) so df1/df2 still expose a stripped
    # "program" column afterwards and re-running the cell stays idempotent.
    df1 = _normalize_program_key(df1, "df1")
    df2 = _normalize_program_key(df2, "df2")

    # Merge the dataframes on 'program' (inner join, missing keys excluded)
    merged_df = merge_on_program(df1, df2)

    # Display merged dataframe
    print("\nMerged Data Preview:")
    print(merged_df.head())  # Show first few rows

    # Ensure Data folder exists before saving
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Save the merged file in the "Data" folder
    merged_df.to_csv(output_file, index=False)

    print(f"Merge successful! Saved as '{output_file}'")


Columns in df1: Index(['Timestamp', 'program', 'gender', 'AGE', 'HEIGHT', 'WEIGHT',
       'SALARY_OLD', 'SALARY_GRAD', 'DESIRE_WORK_ORG', 'PARENTS', 'EXPENSES',
       'TRANSPORT', 'COST_TRANSPORTATION', 'COST_ENTERTAINMENT', 'HAND',
       'CELL_PHONE', 'SMOKE', 'foot_size', 'alcohol', 'tattoo', 'sleep',
       'social', 'homework', 'work', 'travel', 'tuition', 'coffee', 'BMI'],
      dtype='object')
Columns in df2: Index(['program', 'place', 'hours'], dtype='object')

Merged Data Preview:
                      Timestamp program  gender   AGE  HEIGHT  WEIGHT  \
0  2019/02/04 11:26:35 am GMT-8    HRMG  female  22.0  177.80   110.0   
1  2019/02/04 11:27:45 am GMT-8    HRMG  female  24.0  175.00    63.0   
2  2019/02/04 11:27:46 am GMT-8    HRMG    male  24.0  182.88   100.0   
3  2019/02/04 11:29:06 am GMT-8    HRMG  female  34.0    5.00    61.3   
4  2019/02/04 11:29:12 am GMT-8    HRMG  female  24.0  160.00    54.0   

   SALARY_OLD  SALARY_GRAD                     DESIRE_WORK_ORG  

## I chose an inner join so that only rows whose `program` value appears in both datasets are kept; as a result, the join itself does not introduce any new NA values.