In [8]:
import pandas as pd
import json
import os

# Input CSVs: the five numbered FOOD-DATA group files, then the standalone datasets
file_paths = [f"FOOD-DATA-GROUP{group}.csv" for group in range(1, 6)] + [
    "indian_food.csv",
    "pred_food.csv",
    "weight_change_dataset.csv",
]


In [9]:
# Step 1: Load datasets
# Walk `file_paths` in order and collect one DataFrame per readable CSV.
# A path that is missing, or a file that fails to load, is reported on stdout
# and skipped, so `dfs` may end up shorter than `file_paths`.
dfs = []
for file in file_paths:
    if not os.path.exists(file):
        print(f"File not found: {file}")
        continue

    try:
        df = pd.read_csv(file)
        dfs.append(df)
        print(f"Loaded {file} with shape {df.shape}")
    except Exception as e:
        # Best-effort: one bad file must not stop the remaining loads
        print(f"Error loading {file}: {e}")

Loaded FOOD-DATA-GROUP1.csv with shape (551, 37)
Loaded FOOD-DATA-GROUP2.csv with shape (319, 37)
Loaded FOOD-DATA-GROUP3.csv with shape (571, 37)
Loaded FOOD-DATA-GROUP4.csv with shape (232, 37)
Loaded FOOD-DATA-GROUP5.csv with shape (722, 37)
Loaded indian_food.csv with shape (255, 9)
Loaded pred_food.csv with shape (502, 13)
Loaded weight_change_dataset.csv with shape (100, 13)


In [10]:
# Step 1 (Continued): Merge all datasets into a single DataFrame
# Rows from every loaded frame are stacked under a fresh 0..n-1 index. The
# result carries the union of the inputs' columns, so a cell whose source file
# lacks that column comes out as NaN.
if dfs:
    data = pd.concat(dfs, ignore_index=True)
    print(f"Merged dataset size: {data.shape}")
else:
    # Always bind `data`, even when nothing was loaded: otherwise the following
    # cells hit a NameError on a fresh kernel, or silently reuse a stale `data`
    # left in the namespace by an earlier run.
    data = pd.DataFrame()
    print("No data to merge.")


Merged dataset size: (3252, 69)


In [13]:
# Step 2: Handle missing values and text conversion
# Every missing cell, in every column, receives the placeholder string "Unknown".
data = data.fillna("Unknown")


In [14]:
# Convert categorical columns (strings) to lowercase and strip extra spaces.
# Object-dtype columns can mix text with numbers here: a numeric column that had
# gaps becomes object dtype once the gaps are filled with the "Unknown" string.
# Only genuine `str` values are normalised; any other value (int, float, bool)
# is passed through untouched so it is not turned into text such as "1.5".
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].map(
        lambda value: value.lower().strip() if isinstance(value, str) else value
    )


In [15]:
# Step 3: Convert to JSONL format for Ollama
# One JSON object per line, one line per row of `data`.
jsonl_path = "processed_food_data1.jsonl"
with open(jsonl_path, "w") as f:
    f.writelines(
        json.dumps(record) + "\n"
        for record in data.to_dict(orient="records")
    )

print(f"Preprocessed data saved at {jsonl_path}")  # Report the output location

Preprocessed data saved at processed_food_data1.jsonl
