# Real Data Pipeline (AS7263 + MQ3)

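This notebook loads the real sensor dataset (falling back to the synthetic labeled dataset when the real one has no label columns), creates train/validation/test splits, visualizes FTIR-like profiles, and trains and evaluates the taste, dilution, effectiveness, and medicine models on whichever labels are available.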

Source dataset: [GitHub dataset link](https://github.com/Twarita11/ayurvedic-etongue-ml/blob/main/sensor%20code/data.csv)


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

from src.data.real_loader import load_real_dataset, train_val_test_split as real_split
from src.data.synthetic_loader import load_synthetic_dataset, train_val_test_split as synth_split
from src.visualization.plots import Visualizer

# Load the real dataset. If it carries none of the known label columns, swap in
# the synthetic labeled dataset and use its matching split function instead.
label_columns = {'Dilution_Percent', 'Effectiveness_Score', 'Medicine_Name', 'taste_sweet'}

real_df = load_real_dataset()
has_labels = not label_columns.isdisjoint(real_df.columns)

# Default to the real-data splitter; override only on the fallback path.
split_fn = real_split
if not has_labels:
    print('Real dataset lacks labels; falling back to synthetic labeled dataset.')
    real_df = load_synthetic_dataset()
    split_fn = synth_split

print('Loaded data:', real_df.shape)
real_df.head()


In [None]:
# Partition the data into train / validation / test subsets.
# NOTE(review): the intended ratio is 70/15/15, but the actual proportions are
# decided inside split_fn -- confirm there.
train_df, val_df, test_df = split_fn(real_df)
[subset.shape for subset in (train_df, val_df, test_df)]


In [None]:
# Draw the FTIR-like profile over the six spectral channel columns.
spectral_channels = list('RSTUVW')
viz = Visualizer(sensor_columns=spectral_channels)
fig = viz.plot_ftir_like(real_df)
fig.show()


In [None]:
# Hook up the existing predictor with any pre-trained model artifacts on disk
# and build one demo input from the test split.
from app.utils.predictor import ModelPredictor
import joblib

predictor = ModelPredictor()

# Model artifacts to look for, keyed by the slot they fill in predictor.models.
# NOTE(review): the training cell in this notebook saves the taste model as
# 'taste_multireg_model.pkl'; this placeholder name may need adjusting.
models_dir = Path('ayurvedic-ml-pipeline/models')
model_files = {
    'taste': models_dir / 'taste_sweet_model.pkl',  # placeholder name; adjust if different
}

# Register only the artifacts that actually exist; missing files are skipped.
for slot, artifact in model_files.items():
    if not artifact.exists():
        continue
    predictor.models[slot] = joblib.load(artifact)

# Demo input: first test row's sensor readings plus a fixed temperature.
sample = test_df.iloc[0]
input_data = {
    'temperature': 25.0,
    'mq3_ppm': float(sample['mq3_ppm']),
}
input_data.update(
    {f'as7263_{channel.lower()}': float(sample[channel]) for channel in 'RSTUVW'}
)

# Report what was loaded. This only checks that at least one model is present;
# the actual prediction call stays disabled until every expected model exists.
if not predictor.models:
    print('No pre-trained models found to run predictions yet.')
else:
    print('Predictor ready with models:', list(predictor.models.keys()))
    # Commented out if not all expected models are present
    # result = predictor.predict(input_data)
    # result


In [None]:
# Train, validate, and persist one model per label that is present in the data.
# Results land in `models` (fitted estimators) and `metrics` (validation scores).
from src.models.training import (
    train_dilution_regressor, train_effectiveness_regressor, train_medicine_classifier,
    train_taste_multiregressor, evaluate_regression, evaluate_classification, save_model,
    SENSOR_FEATURES
)

models = {}
metrics = {}

# Detect available targets
available_cols = set(real_df.columns)

# Optional taste columns from synthetic analysis
taste_cols = [c for c in ['taste_sweet','taste_sour','taste_salty','taste_bitter','taste_pungent','taste_astringent'] if c in available_cols]

# Augment with engineered features? For now we keep baseline sensors; can extend later.

# Single directory for every saved artifact, so the path is spelled once.
model_output_dir = Path('ayurvedic-ml-pipeline/models')

# One row per single-target model:
# (key in models/metrics, label column, trainer, evaluator, artifact file name)
single_target_specs = [
    ('dilution', 'Dilution_Percent', train_dilution_regressor,
     evaluate_regression, 'dilution_model.pkl'),
    ('effectiveness', 'Effectiveness_Score', train_effectiveness_regressor,
     evaluate_regression, 'effectiveness_model.pkl'),
    ('medicine', 'Medicine_Name', train_medicine_classifier,
     evaluate_classification, 'medicine_model.pkl'),
]

for model_key, label_col, train_fn, eval_fn, artifact_name in single_target_specs:
    # Skip labels that are absent or entirely NaN in the full dataset.
    if label_col not in available_cols or real_df[label_col].isna().all():
        continue
    models[model_key] = train_fn(train_df)
    metrics[model_key] = eval_fn(models[model_key], val_df, label_col)
    save_model(models[model_key], model_output_dir / artifact_name)

if taste_cols:
    models['taste'] = train_taste_multiregressor(train_df, taste_cols)
    # Record validation metrics separately for each taste column (no averaging).
    # NOTE(review): the same multi-output model is passed for every column;
    # confirm evaluate_regression handles a multi-output estimator per target.
    metrics['taste'] = {
        tcol: evaluate_regression(models['taste'], val_df, tcol)
        for tcol in taste_cols
    }
    save_model(models['taste'], model_output_dir / 'taste_multireg_model.pkl')

metrics


In [None]:
# Evaluation plots similar to prior notebooks
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

# Regression parity/residual plots

def plot_regression_parity(model, df, target):
    """Show a parity plot of predicted versus true values for one target.

    Predictions come from ``model.predict`` on the sensor columns
    R, S, T, U, V, W and mq3_ppm of ``df``. A dashed red diagonal spanning
    the combined range of true and predicted values marks perfect agreement.

    Args:
        model: Fitted estimator exposing ``predict``.
        df: DataFrame holding the sensor columns and the ``target`` column.
        target: Name of the column with the true values.
    """
    feature_cols = ['R', 'S', 'T', 'U', 'V', 'W', 'mq3_ppm']
    actual = df[target]
    predicted = model.predict(df[feature_cols])

    # Shared bounds so the reference diagonal covers both axes' data.
    lower = min(actual.min(), predicted.min())
    upper = max(actual.max(), predicted.max())

    _, ax = plt.subplots(figsize=(6, 6))
    sns.scatterplot(x=actual, y=predicted, ax=ax)
    ax.plot([lower, upper], [lower, upper], 'r--')
    ax.set_xlabel('True')
    ax.set_ylabel('Predicted')
    ax.set_title(f'Parity: {target}')
    plt.show()

# Parity plots for whichever regressors were trained earlier in the notebook.
if 'dilution' in models:
    plot_regression_parity(models['dilution'], val_df, 'Dilution_Percent')
if 'effectiveness' in models:
    plot_regression_parity(models['effectiveness'], val_df, 'Effectiveness_Score')

# Classification confusion matrix
if 'medicine' in models:
    X = val_df[['R','S','T','U','V','W','mq3_ppm']]
    y_true = val_df['Medicine_Name']
    y_pred = models['medicine'].predict(X)
    # from_predictions creates its own figure unless it is handed an Axes, so a
    # preceding plt.figure(...) would be left blank and its figsize ignored.
    # Create the 8x6 canvas explicitly and draw the matrix onto it.
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred, xticks_rotation=45, ax=cm_ax)
    cm_ax.set_title('Medicine Confusion Matrix (Validation)')
    cm_fig.tight_layout()
    plt.show()
