# E-tongue Feature Analysis and Model Optimization

This notebook analyzes different feature combinations and preprocessing techniques for the E-tongue system with the new hardware configuration:

1. Original Features (7):
   - alcohol_ppm
   - as7263_r through as7263_w (6 NIR wavelengths)

2. Feature Engineering Methods:
   - Original features only
   - Original + Ratio features
   - Original + Ratio + Derivatives
   - PCA components

3. Scaling Comparison:
   - StandardScaler
   - MinMaxScaler

4. Data Processing:
   - Weighted averaging for 10 readings
   - Handling missing values
   - Removing duplicates
   - Feature correlation analysis

## Setup and Dependencies

In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings

# Configure settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
plt.style.use('default')  # Using default style instead of seaborn
sns.set_theme()  # This will set up seaborn's default theme
np.random.seed(42)

# Define constants
SENSOR_COLUMNS = [
    'alcohol_ppm',
    'as7263_r', 'as7263_s', 'as7263_t', 
    'as7263_u', 'as7263_v', 'as7263_w'
]

TASTE_COLUMNS = [
    'sweet',     # Madhura
    'sour',      # Amla
    'salty',     # Lavana
    'bitter',    # Tikta
    'pungent',   # Katu
    'astringent' # Kashaya
]

print("Libraries and configurations loaded successfully!")

Libraries and configurations loaded successfully!


## Data Preprocessing Functions

Implementing preprocessing pipeline for the new hardware configuration:
1. Weighted averaging for 10 readings
2. Feature engineering
3. Scaling methods
4. Duplicate detection and removal

In [3]:
# Data preprocessing functions
def weighted_average_readings(readings_list, num_samples=10):
    """
    Calculate weighted average of sensor readings with increasing weights.
    Last reading has highest weight (1.0).
    """
    if len(readings_list) < num_samples:
        raise ValueError(f"Need at least {num_samples} readings. Got {len(readings_list)}")
    
    # Use last num_samples readings
    readings = readings_list[-num_samples:]
    
    # Generate weights with increasing values
    weights = np.linspace(0.1, 1.0, num_samples)
    weights = weights / np.sum(weights)  # Normalize weights
    
    # Initialize result dictionary
    result = {}
    
    # Calculate weighted average for each sensor
    for column in SENSOR_COLUMNS:
        values = [reading[column] for reading in readings]
        result[column] = np.average(values, weights=weights)
    
    return result

def calculate_ratios(X):
    """Calculate ratios between sensor channels."""
    ratios = pd.DataFrame(index=X.index)
    
    # Calculate ratios between NIR channels
    nir_channels = ['as7263_r', 'as7263_s', 'as7263_t', 
                   'as7263_u', 'as7263_v', 'as7263_w']
    
    for i in range(len(nir_channels)):
        for j in range(i+1, len(nir_channels)):
            col1 = nir_channels[i]
            col2 = nir_channels[j]
            ratio_name = f'ratio_{col1[-1]}_{col2[-1]}'
            ratios[ratio_name] = X[col1] / X[col2]
    
    # Calculate alcohol/NIR ratios
    for channel in nir_channels:
        ratio_name = f'ratio_alcohol_{channel[-1]}'
        ratios[ratio_name] = X['alcohol_ppm'] / X[channel]
    
    return ratios

def calculate_derivatives(X):
    """Calculate first-order differences between adjacent wavelengths."""
    derivatives = pd.DataFrame(index=X.index)
    
    channels = ['as7263_r', 'as7263_s', 'as7263_t', 
               'as7263_u', 'as7263_v', 'as7263_w']
    
    for i in range(len(channels)-1):
        col1 = channels[i]
        col2 = channels[i+1]
        deriv_name = f'deriv_{col1[-1]}_{col2[-1]}'
        derivatives[deriv_name] = X[col2] - X[col1]
    
    # Add alcohol rate of change
    derivatives['alcohol_rate'] = np.gradient(X['alcohol_ppm'])
    
    return derivatives

def remove_duplicates(X, threshold=1e-6):
    """Remove duplicate readings within a threshold."""
    # Calculate pairwise differences
    diff_matrix = np.abs(X.values[:, np.newaxis] - X.values) < threshold
    diff_matrix = diff_matrix.all(axis=2)
    np.fill_diagonal(diff_matrix, False)
    
    # Find unique indices
    unique_idx = ~diff_matrix.any(axis=1)
    
    return X[unique_idx]

def compare_scalers(X, scaler1=StandardScaler(), scaler2=MinMaxScaler()):
    """Compare different scaling methods."""
    X_standard = pd.DataFrame(
        scaler1.fit_transform(X),
        columns=X.columns,
        index=X.index
    )
    
    X_minmax = pd.DataFrame(
        scaler2.fit_transform(X),
        columns=X.columns,
        index=X.index
    )
    
    # Calculate correlation preservation
    corr_orig = X.corr()
    corr_standard = X_standard.corr()
    corr_minmax = X_minmax.corr()
    
    corr_diff_standard = np.abs(corr_orig - corr_standard).mean().mean()
    corr_diff_minmax = np.abs(corr_orig - corr_minmax).mean().mean()
    
    print("Correlation differences:")
    print(f"StandardScaler: {corr_diff_standard:.4f}")
    print(f"MinMaxScaler: {corr_diff_minmax:.4f}")
    
    return X_standard, X_minmax

# Test preprocessing functions with sample data
def generate_sample_data(n_samples=100, n_readings=10):
    """Generate sample data for testing."""
    samples = []
    for _ in range(n_samples):
        readings = []
        base_values = {
            'alcohol_ppm': np.random.uniform(30, 70),
            'as7263_r': np.random.uniform(0.3, 0.7),
            'as7263_s': np.random.uniform(0.3, 0.7),
            'as7263_t': np.random.uniform(0.3, 0.7),
            'as7263_u': np.random.uniform(0.3, 0.7),
            'as7263_v': np.random.uniform(0.3, 0.7),
            'as7263_w': np.random.uniform(0.3, 0.7)
        }
        
        for _ in range(n_readings):
            reading = {
                key: value + np.random.normal(0, 0.05) 
                for key, value in base_values.items()
            }
            readings.append(reading)
        
        # Get weighted average
        avg_reading = weighted_average_readings(readings)
        samples.append(avg_reading)
    
    return pd.DataFrame(samples)

# Generate and process sample data
print("Generating sample data...")
X_raw = generate_sample_data()
print("\nShape of raw data:", X_raw.shape)

# Calculate derived features
X_ratios = calculate_ratios(X_raw)
X_derivatives = calculate_derivatives(X_raw)

print("\nNumber of features:")
print(f"Original features: {X_raw.shape[1]}")
print(f"Ratio features: {X_ratios.shape[1]}")
print(f"Derivative features: {X_derivatives.shape[1]}")

# Compare scalers
print("\nComparing scalers on original features:")
X_standard, X_minmax = compare_scalers(X_raw)

# Visualize preprocessing effects
fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=('Original Data', 'StandardScaler', 'MinMaxScaler')
)

for i, data in enumerate([X_raw, X_standard, X_minmax], 1):
    for col in data.columns:
        fig.add_trace(
            go.Box(y=data[col], name=col, showlegend=i==1),
            row=i, col=1
        )

fig.update_layout(height=800, title_text="Feature Distributions After Scaling")
fig.show()

Generating sample data...

Shape of raw data: (100, 7)

Number of features:
Original features: 7
Ratio features: 21
Derivative features: 6

Comparing scalers on original features:
Correlation differences:
StandardScaler: 0.0000
MinMaxScaler: 0.0000


## Feature Set Comparison

Let's compare the performance of different feature combinations:
1. Original features only (7 features)
2. Original + Ratio features
3. Original + Ratio + Derivatives
4. PCA components

We'll evaluate each combination using:
- Cross-validation
- Feature importance analysis
- Correlation analysis
- Performance metrics (R², MSE, confidence)

In [4]:
# Feature set comparison functions
def evaluate_feature_set(X, y, scaler=None, pca=None, cv=5):
    """Evaluate a feature set using cross-validation."""
    if scaler:
        X = scaler.fit_transform(X)
    if pca:
        X = pca.fit_transform(X)
        
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    
    # Calculate cross-validation scores
    r2_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
    mse_scores = -cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
    
    # Fit model on full data for feature importance
    model.fit(X, y)
    
    # Calculate tree variance for confidence
    if hasattr(model, 'estimators_'):
        tree_predictions = np.array([tree.predict(X) for tree in model.estimators_])
        variance = np.var(tree_predictions, axis=0)
        confidence = 1 / (1 + variance)
        confidence = np.clip(confidence, 0.1, 0.99)
    else:
        confidence = None
    
    return {
        'r2_mean': r2_scores.mean(),
        'r2_std': r2_scores.std(),
        'mse_mean': mse_scores.mean(),
        'mse_std': mse_scores.std(),
        'model': model,
        'confidence': confidence
    }

def compare_feature_sets(X_raw, y, cv=5):
    """Compare different feature combinations."""
    # Prepare feature sets
    X_with_ratios = pd.concat([X_raw, calculate_ratios(X_raw)], axis=1)
    X_all_features = pd.concat([X_with_ratios, calculate_derivatives(X_raw)], axis=1)
    
    # Prepare scalers and PCA
    standard_scaler = StandardScaler()
    minmax_scaler = MinMaxScaler()
    pca = PCA(n_components=0.95)  # Keep 95% of variance
    
    # Evaluate each combination
    results = {
        'Original (Standard)': evaluate_feature_set(X_raw, y, standard_scaler),
        'Original (MinMax)': evaluate_feature_set(X_raw, y, minmax_scaler),
        'Original + Ratios (Standard)': evaluate_feature_set(X_with_ratios, y, standard_scaler),
        'Original + Ratios (MinMax)': evaluate_feature_set(X_with_ratios, y, minmax_scaler),
        'All Features (Standard)': evaluate_feature_set(X_all_features, y, standard_scaler),
        'All Features (MinMax)': evaluate_feature_set(X_all_features, y, minmax_scaler),
        'PCA (Standard)': evaluate_feature_set(X_all_features, y, standard_scaler, pca),
        'PCA (MinMax)': evaluate_feature_set(X_all_features, y, minmax_scaler, pca)
    }
    
    return results

# Generate target variables for testing
np.random.seed(42)
y = np.random.uniform(0, 1, len(X_raw))

# Compare feature sets
print("Comparing feature sets...")
results = compare_feature_sets(X_raw, y)

# Create performance comparison plot
performance_data = pd.DataFrame({
    name: {
        'R² Score': metrics['r2_mean'],
        'R² Std': metrics['r2_std'],
        'MSE': metrics['mse_mean'],
        'MSE Std': metrics['mse_std']
    }
    for name, metrics in results.items()
}).T

# Plot R² scores
fig = go.Figure()
fig.add_trace(go.Bar(
    name='R² Score',
    x=performance_data.index,
    y=performance_data['R² Score'],
    error_y=dict(type='data', array=performance_data['R² Std'])
))

fig.update_layout(
    title='Model Performance Comparison',
    xaxis_title='Feature Set',
    yaxis_title='R² Score',
    barmode='group'
)
fig.show()

# Plot MSE scores
fig = go.Figure()
fig.add_trace(go.Bar(
    name='MSE',
    x=performance_data.index,
    y=performance_data['MSE'],
    error_y=dict(type='data', array=performance_data['MSE Std'])
))

fig.update_layout(
    title='Model Error Comparison',
    xaxis_title='Feature Set',
    yaxis_title='Mean Squared Error',
    barmode='group'
)
fig.show()

# Display numerical results
print("\nDetailed Performance Metrics:")
display(performance_data)

Comparing feature sets...



Detailed Performance Metrics:


Unnamed: 0,R² Score,R² Std,MSE,MSE Std
Original (Standard),-0.098157,0.037025,0.095812,0.012548
Original (MinMax),-0.098157,0.037025,0.095812,0.012548
Original + Ratios (Standard),-0.115228,0.10601,0.096321,0.0071
Original + Ratios (MinMax),-0.115228,0.10601,0.096321,0.0071
All Features (Standard),-0.091431,0.089503,0.094454,0.007898
All Features (MinMax),-0.091431,0.089503,0.094454,0.007898
PCA (Standard),-0.147964,0.113321,0.10024,0.016501
PCA (MinMax),-0.005169,0.081097,0.088087,0.015334


## Dilution Analysis and Taste Profile Verification

Let's analyze how different dilution levels affect our sensor readings and taste profiles. We'll:
1. Generate synthetic data with known dilution ratios
2. Use our best feature set (Original + Ratios with StandardScaler)
3. Test effectiveness across all 6 taste profiles
4. Analyze sensitivity at different dilution levels

In [5]:
# Generate synthetic data for different dilution levels
def generate_dilution_data(base_concentrations, dilution_factors, samples_per_dilution=10):
    """
    Generate synthetic data for different dilution levels of medicines.
    
    Parameters:
    - base_concentrations: Dict with baseline sensor values for each taste
    - dilution_factors: List of dilution ratios (e.g., [1.0, 0.5, 0.25])
    - samples_per_dilution: Number of samples to generate per dilution level
    """
    all_samples = []
    
    for taste, base_values in base_concentrations.items():
        for dilution in dilution_factors:
            for _ in range(samples_per_dilution):
                readings = []
                # Scale base values by dilution factor
                diluted_values = {
                    key: value * dilution + np.random.normal(0, 0.02 * value) 
                    for key, value in base_values.items()
                }
                
                # Generate 10 readings with noise
                for _ in range(10):
                    reading = {
                        key: value + np.random.normal(0, 0.05 * value)
                        for key, value in diluted_values.items()
                    }
                    readings.append(reading)
                
                # Get weighted average
                avg_reading = weighted_average_readings(readings)
                avg_reading['taste'] = taste
                avg_reading['dilution'] = dilution
                all_samples.append(avg_reading)
    
    return pd.DataFrame(all_samples)

# Define baseline concentrations for each taste profile
base_concentrations = {
    'sweet': {  # Madhura
        'alcohol_ppm': 45.0,
        'as7263_r': 0.65, 'as7263_s': 0.55, 'as7263_t': 0.45,
        'as7263_u': 0.40, 'as7263_v': 0.35, 'as7263_w': 0.30
    },
    'sour': {   # Amla
        'alcohol_ppm': 60.0,
        'as7263_r': 0.40, 'as7263_s': 0.50, 'as7263_t': 0.60,
        'as7263_u': 0.55, 'as7263_v': 0.45, 'as7263_w': 0.35
    },
    'salty': {  # Lavana
        'alcohol_ppm': 50.0,
        'as7263_r': 0.45, 'as7263_s': 0.55, 'as7263_t': 0.50,
        'as7263_u': 0.45, 'as7263_v': 0.40, 'as7263_w': 0.45
    },
    'bitter': { # Tikta
        'alcohol_ppm': 55.0,
        'as7263_r': 0.35, 'as7263_s': 0.40, 'as7263_t': 0.55,
        'as7263_u': 0.60, 'as7263_v': 0.50, 'as7263_w': 0.40
    },
    'pungent': { # Katu
        'alcohol_ppm': 65.0,
        'as7263_r': 0.50, 'as7263_s': 0.45, 'as7263_t': 0.40,
        'as7263_u': 0.50, 'as7263_v': 0.55, 'as7263_w': 0.50
    },
    'astringent': { # Kashaya
        'alcohol_ppm': 40.0,
        'as7263_r': 0.30, 'as7263_s': 0.35, 'as7263_t': 0.45,
        'as7263_u': 0.50, 'as7263_v': 0.60, 'as7263_w': 0.55
    }
}

# Generate data for different dilution levels
dilution_factors = [1.0, 0.75, 0.5, 0.25, 0.1]
dilution_data = generate_dilution_data(base_concentrations, dilution_factors)

# Prepare features using our best method (Original + Ratios with StandardScaler)
X_features = dilution_data[SENSOR_COLUMNS].copy()
X_with_ratios = pd.concat([X_features, calculate_ratios(X_features)], axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_with_ratios)
X_scaled = pd.DataFrame(X_scaled, columns=X_with_ratios.columns)

# Analyze separability of taste profiles at different dilutions
plt.figure(figsize=(15, 8))
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create scatter plot with dilution levels
fig = go.Figure()

for taste in base_concentrations.keys():
    for dilution in dilution_factors:
        mask = (dilution_data['taste'] == taste) & (dilution_data['dilution'] == dilution)
        fig.add_trace(go.Scatter(
            x=X_pca[mask, 0],
            y=X_pca[mask, 1],
            mode='markers',
            name=f'{taste} ({dilution:.2f})',
            marker=dict(size=10),
        ))

fig.update_layout(
    title='Taste Profile Separation by Dilution Level (PCA)',
    xaxis_title='First Principal Component',
    yaxis_title='Second Principal Component',
    height=600
)
fig.show()

# Calculate separation metrics
def calculate_separation_metrics(X, taste_labels, dilution_levels):
    """Calculate metrics for taste profile separation at each dilution level."""
    metrics = []
    
    for dilution in dilution_levels:
        dilution_mask = dilution_data['dilution'] == dilution
        X_dilution = X[dilution_mask]
        tastes_dilution = taste_labels[dilution_mask]
        
        # Calculate average distance between different taste clusters
        distances = []
        for taste1 in base_concentrations.keys():
            for taste2 in base_concentrations.keys():
                if taste1 >= taste2:
                    continue
                    
                mask1 = tastes_dilution == taste1
                mask2 = tastes_dilution == taste2
                
                if not any(mask1) or not any(mask2):
                    continue
                
                centroid1 = X_dilution[mask1].mean(axis=0)
                centroid2 = X_dilution[mask2].mean(axis=0)
                distance = np.linalg.norm(centroid1 - centroid2)
                distances.append(distance)
        
        metrics.append({
            'dilution': dilution,
            'avg_separation': np.mean(distances),
            'min_separation': np.min(distances),
            'std_separation': np.std(distances)
        })
    
    return pd.DataFrame(metrics)

separation_metrics = calculate_separation_metrics(
    X_scaled, 
    dilution_data['taste'],
    dilution_factors
)

# Plot separation metrics
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=separation_metrics['dilution'],
    y=separation_metrics['avg_separation'],
    mode='lines+markers',
    name='Average Separation',
    error_y=dict(
        type='data',
        array=separation_metrics['std_separation'],
        visible=True
    )
))

fig.add_trace(go.Scatter(
    x=separation_metrics['dilution'],
    y=separation_metrics['min_separation'],
    mode='lines+markers',
    name='Minimum Separation'
))

fig.update_layout(
    title='Taste Profile Separation vs Dilution Level',
    xaxis_title='Dilution Factor',
    yaxis_title='Separation Distance (Standardized Units)',
    height=500
)
fig.show()

print("\nSeparation Metrics by Dilution Level:")
display(separation_metrics.round(4))

# Find optimal dilution range
optimal_dilution = separation_metrics.loc[
    separation_metrics['avg_separation'].idxmax(),
    'dilution'
]
min_effective_dilution = separation_metrics[
    separation_metrics['min_separation'] > 
    0.5 * separation_metrics['min_separation'].max()
]['dilution'].min()

print(f"\nOptimal dilution factor: {optimal_dilution:.2f}")
print(f"Minimum effective dilution: {min_effective_dilution:.2f}")


Separation Metrics by Dilution Level:


Unnamed: 0,dilution,avg_separation,min_separation,std_separation
0,1.0,5.9526,2.9561,1.6072
1,0.75,5.937,2.9215,1.5081
2,0.5,5.7639,2.6863,1.5144
3,0.25,5.906,3.12,1.4667
4,0.1,6.0428,3.2251,1.7232



Optimal dilution factor: 0.10
Minimum effective dilution: 0.10


<Figure size 1500x800 with 0 Axes>