In [12]:
import pandas as pd
import numpy as np
from scipy.stats import norm

In [13]:
X_df = pd.read_csv("X.csv", names=["weight", "height", "label"])
y_df = pd.read_csv("y.csv", names=["weight", 'height'])

X1_df, X2_df = np.array_split(X_df, 2)

  return bound(*args, **kwds)


In [14]:
X = pd.read_csv('X.csv', header=None).values
y_test = pd.read_csv('y.csv', header=None).values

# Separate data into weight, height, and labels
weights = X[:, 0]
heights = X[:, 1]

# Categories: 1 = small, 2 = average, 3 = large
labels = X[:, 2]

# Split data by category
small = X[labels == 1]
average = X[labels == 2]
large = X[labels == 3]

In [15]:
# Calculating mean and standard deviation of weight and height for each category.
params = {
    'small': {
        'weight_mean': np.mean(small[:, 0]),
        'weight_std': np.std(small[:, 0]),
        'height_mean': np.mean(small[:, 1]),
        'height_std': np.std(small[:, 1])
    },
    'average': {
        'weight_mean': np.mean(average[:, 0]),
        'weight_std': np.std(average[:, 0]),
        'height_mean': np.mean(average[:, 1]),
        'height_std': np.std(average[:, 1])
    },
    'large': {
        'weight_mean': np.mean(large[:, 0]),
        'weight_std': np.std(large[:, 0]),
        'height_mean': np.mean(large[:, 1]),
        'height_std': np.std(large[:, 1])
    }
}

In [16]:
def calculate_similarity(test_point, train_features, train_labels, alpha_weight=2, alpha_height=1, beta=1):
    # Define attention weights for each training point
    alpha = np.array([alpha_weight, alpha_height])
    
    # Calculating similarity for each training point
    similarities = []
    for i, exemplar in enumerate(train_features):
        # Computing distance
        distance = np.sum(alpha * np.abs(exemplar - test_point))
        
        # Computing similarity
        similarity = np.exp(-beta * distance)
        
        # Appending similarity along with its label
        similarities.append((similarity, train_labels[i]))

    # Aggregating similarities by category
    small_sim = sum(similar for similar, label in similarities if label == 1)
    average_sim = sum(similar for similar, label in similarities if label == 2)
    large_sim = sum(similar for similar, label in similarities if label == 3)

    # Apply politeness bias: reduce the similarity weight for "large"
    large_sim *= 0.8  # This factor represents the politeness adjustment

    # Choose category with highest similarity score
    similarities_dict = {1: small_sim, 2: average_sim, 3: large_sim}
    return max(similarities_dict, key=similarities_dict.get)


In [17]:
def likelihood(weight, height, category, params):
    """Calculate likelihood for weight and height using Gaussian distributions"""
    weight_likelihood = norm.pdf(weight, params[category]['weight_mean'], params[category]['weight_std'])
    height_likelihood = norm.pdf(height, params[category]['height_mean'], params[category]['height_std'])
    
    # Adjust likelihood based on weight's higher importance 
    likelihood = (weight_likelihood ** 0.7) * (height_likelihood ** 0.3)
    return likelihood

def posterior_prob(weight, height, params):
    """Calculate posterior probability for each category"""
    posterior_probs = {}
    categories = ['small', 'average', 'large']
    cat_labels = {category: i + 1 for i, category in enumerate(categories)}
    
    for category in categories:
        cat_label = cat_labels[category]
        posterior_probs[cat_label] = likelihood(weight, height, category, params)
    
    # Normalize the probabilities so that they sum to 1
    total_posterior = sum(posterior_probs.values())
    for cat_label in posterior_probs:
        posterior_probs[cat_label] /= total_posterior
    
    return posterior_probs

def predict_cat_label(posterior_probs):
    """Find the category with the highest posterior probability"""
    return max(posterior_probs, key=posterior_probs.get)

In [18]:
def gcm_train_predict(X_train, y_test):
    predictions = []
    for test_point in y_test.values:
        similarity = calculate_similarity(test_point, X_train.iloc[:, :2].values, X_train['label'].values)
        predictions.append(similarity)
    return predictions

GCM_X1_predictions = gcm_train_predict(X1_df, y_df)
GCM_X2_predictions = gcm_train_predict(X2_df, y_df)


In [19]:
def rmc_train_predict(X_train, y_test):
    predictions = []
    for test_point in y_test.values:
        posterior_probs = posterior_prob(test_point[0], test_point[1], params)
        predicted_label = predict_cat_label(posterior_probs)
        predictions.append(predicted_label)
    return predictions

RMC_X1_predictions = rmc_train_predict(X1_df, y_df)
RMC_X2_predictions = rmc_train_predict(X2_df, y_df)


In [20]:
print("GCM Predictions:")
print(np.array_equal(GCM_X1_predictions, GCM_X2_predictions))

print("RMC Predictions:")
print(np.array_equal(RMC_X1_predictions, RMC_X2_predictions))

GCM Predictions:
False
RMC Predictions:
True


### GCM Predictions: Not Exchangeable
The GCM model's predictions differ when trained on X1.csv versus X2.csv, indicating it does not treat the data as exchangeable.

### Factors Contributing to GCM's Non-Exchangeability
1. Similarity Calculation: The GCM model calculates similarity based on comparisons between each test point and every training point, making its predictions sensitive to the order and specifics of the data.
2. Politeness Bias: A politeness bias factor (0.8) is applied to the "large" category, which could magnify minor variations in similarity calculations and further affect the model's consistency across datasets.

### RMC Predictions: Exchangeable
Unlike GCM, the RMC model’s predictions remain consistent across both X1.csv and X2.csv, demonstrating that it assumes exchangeability of data.

### Reasons Behind RMC's Exchangeability
1. Parametric Approach: RMC is built on a parametric model using Gaussian distributions for each category, making it more robust to variations in data order.
2. Sufficient Statistics: The RMC model relies on summary statistics—mean and standard deviation—allowing it to capture the essential characteristics of the training data, thereby reducing sensitivity to the specific dataset arrangement.