In [5]:
import pandas as pd
import os
import yaml

In [6]:
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

In [7]:
def csv_to_parquet(csv_path, save_dir, engine="pyarrow"):
    """
    Convert a CSV file to Parquet and save in a given directory.

    Parameters:
        csv_path (str): Path to the input CSV file
        save_dir (str): Folder where the parquet file will be saved
        engine (str): Parquet engine, "pyarrow" (default) or "fastparquet"
    """

    # Read CSV
    df = pd.read_csv(csv_path)

    # Fix mixed types: convert all object columns to string
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].astype(str)

    # Make sure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Build output path (same name as CSV, but .parquet extension)
    filename = os.path.splitext(os.path.basename(csv_path))[0] + ".parquet"
    save_path = os.path.join(save_dir, filename)

    # Save as Parquet
    df.to_parquet(save_path, engine=engine, index=False)

    print(f"✅ Converted CSV → Parquet: {save_path}")


# ===============================
# Example usage
# ===============================
csv_file = config["paths"]["LINK_TO_ONSIDE_DATASET"]
output_folder = r"Data"

csv_to_parquet(csv_file, output_folder)



  df = pd.read_csv(csv_path)


✅ Converted CSV → Parquet: Data\final_rxnorm_meddra.parquet
