In [None]:
import random
import numpy as np
from sklearn.utils import shuffle
import time
from langchain_groq import ChatGroq  # Assuming you are using the langchain_groq package
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Define the model mapping
groq_models = {
    "llama3-70b": "llama3-70b-8192",
    "mixtral": "mixtral-8x7b-32768",
    "gemma-7b": "gemma-7b-it",
    "llama3.1-70b": "llama-3.1-70b-versatile",
    "llama3-8b": "llama3-8b-8192",
    "llama3.1-8b": "llama-3.1-8b-instant",
    "gemma-9b": "gemma2-9b-it"
}

activity_labels = {
    1: "WALKING",
    2: "WALKING_UPSTAIRS",
    3: "WALKING_DOWNSTAIRS",
    4: "SITTING",
    5: "STANDING",
    6: "LAYING"
}
reverse_activity_labels = {v: k for k, v in activity_labels.items()}

print("Activity Labels Dictionary: ", reverse_activity_labels)

# Load datasets
X_train = np.load('../FinalDataset/X_train.npy')
X_test = np.load('../FinalDataset/X_test.npy')
y_train = np.load('../FinalDataset/y_train.npy')
y_test = np.load('../FinalDataset/y_test.npy')

#############################################################################################
import tsfel

# Extract features using TSFEL
cfg = tsfel.get_features_by_domain()  # Get all features by default
X_train_features = tsfel.time_series_features_extractor(cfg, X_train, verbose=1, fs=50)
X_test_features = tsfel.time_series_features_extractor(cfg, X_test, verbose=1, fs=50)
print("Shape of train data after feature extraction using TSFEL:", X_train_features.shape)
print("Shape of test data after feature extraction using TSFEL:", X_test_features.shape)

# Remove highly correlated features
correlated_features = tsfel.correlated_features(X_train_features)
print("Highly correlated features (sample):", correlated_features[:5])
X_train_filtered = X_train_features.drop(correlated_features, axis=1)
X_test_filtered = X_test_features.drop(correlated_features, axis=1)
print("Shape of data after removing correlated features:", X_train_filtered.shape, X_test_filtered.shape)

# Remove low variance features
variance_selector = VarianceThreshold(threshold=0)  # Default is 0
X_train_reduced = variance_selector.fit_transform(X_train_filtered)
X_test_reduced = variance_selector.transform(X_test_filtered)
print("Shape of data after variance thresholding:", X_train_reduced.shape, X_test_reduced.shape)

# Normalize features
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train_reduced)
X_test_normalized = scaler.transform(X_test_reduced)
print("Shape of data after normalization:", X_train_normalized.shape, X_test_normalized.shape)

# Apply PCA
pca = PCA(n_components=6)
X_train_pca = pca.fit_transform(X_train_normalized)
X_test_pca = pca.transform(X_test_normalized)

print("Shape of data after PCA:", X_train_pca.shape)
print("Shape of test data after PCA:", X_test_pca.shape)

##################################################################################################

# Constants
NUM_EXAMPLES_PER_CLASS = 4
NUM_SAMPLES = 20
MAX_RETRIES = 3
RETRY_DELAY = 5

# API Keys
API_KEYS = [
    "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"
]
api_key_index = 0

def get_next_api_key():
    global api_key_index
    key = API_KEYS[api_key_index]
    api_key_index = (api_key_index + 1) % len(API_KEYS)
    return key

def format_data_as_string(data):
    return str(data.tolist())

def create_few_shot_examples(X, y, activity_labels, num_examples_per_class, num_samples_per_class):
    examples = []
    for activity, label in activity_labels.items():
        indices = np.where(y == activity)[0]
        selected_indices = np.random.choice(indices, num_examples_per_class + num_samples_per_class, replace=False)
        for idx in selected_indices:
            example_data = X[idx]
            example_str = format_data_as_string(example_data)
            examples.append((example_str, label))
    return shuffle(examples)

def create_prompt(examples, data_str):
    example_strs = "\n".join([f"    - Example {i+1}: {ex[0]} -> {ex[1]}" for i, ex in enumerate(examples)])
    
    prompt = f"""
    You are a highly trained human activity classification model.

    Your task is to analyze the given accelerometer data and classify the human activity into one of the following categories:
    - WALKING
    - WALKING_UPSTAIRS
    - WALKING_DOWNSTAIRS
    - SITTING
    - STANDING
    - LAYING

    Here is the accelerometer data provided:
    - You have 500 readings, each containing three accelerometer values: (acceleration_x, acceleration_y, acceleration_z).
    - The data is collected over a 10-second period at a sampling rate of 50 Hz, which gives 500 readings.
    - I have used the TSFEL library to reduce the dataset to 116 features.
    Data Format:
    - The data is provided as a nested list. Each inner list represents a single reading: (acceleration_x, acceleration_y, acceleration_z).
    - The x component represents depth direction, measured in terms of g-force.
    - The y component represents sideways direction, measured in terms of g-force.
    - The z component represents forward direction, measured in terms of g-force.

    Here are a few examples:{example_strs}

    Please analyze the examples extensively and provide the most likely activity label for the below data from the list above.

    Provide ONLY the classification label (from the given options above) as output.

    Data: {data_str}
    """
    return prompt

def predict_activity_labels(X_test, examples, activity_labels, reverse_activity_labels, model_name):
    predictions = []
    for i in range(NUM_SAMPLES):
        test_example = X_test[i]
        test_data_str = format_data_as_string(test_example)
        prompt = create_prompt(examples, test_data_str)
        
        api_key = get_next_api_key()
        llm = ChatGroq(model=groq_models[model_name], api_key=api_key, temperature=0)

        for attempt in range(MAX_RETRIES):
            try:
                response = llm.invoke(prompt)
                print(response.usage_metadata)
                predicted_label = response.content.strip()
                activity_number = reverse_activity_labels.get(predicted_label, -1)
                actual_activity = activity_labels[y_test[i]]
                print(f"Predicted Activity: {predicted_label} | Actual Activity: {actual_activity}")
                predictions.append(activity_number)
                break
            except Exception as e:
                print(f"Error: {e}. Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
    
    return predictions

# Create balanced few-shot examples
few_shot_examples = create_few_shot_examples(X_train_pca, y_train, activity_labels, NUM_EXAMPLES_PER_CLASS, NUM_SAMPLES)

# Predict activities
model_name = "llama3.1-70b"
predicted_labels = predict_activity_labels(X_test_pca, few_shot_examples, activity_labels, reverse_activity_labels, model_name)

# Calculate accuracy
correct_predictions = sum(predicted_labels[i] == y_test[i] for i in range(NUM_SAMPLES))
accuracy = correct_predictions / NUM_SAMPLES
print(f"Accuracy: {accuracy:.2f}")
print(f"Accuracy Percentage: {accuracy * 100:.2f}%")


In [None]:
import random
import numpy as np
from sklearn.utils import shuffle
import time
from langchain_groq import ChatGroq  # Assuming you are using the langchain_groq package
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Define the model mapping
groq_models = {
    "llama3-70b": "llama3-70b-8192",
    "mixtral": "mixtral-8x7b-32768",
    "gemma-7b": "gemma-7b-it",
    "llama3.1-70b": "llama-3.1-70b-versatile",
    "llama3-8b": "llama3-8b-8192",
    "llama3.1-8b": "llama-3.1-8b-instant",
    "gemma-9b": "gemma2-9b-it"
}

activity_labels = {
    1: "WALKING",
    2: "WALKING_UPSTAIRS",
    3: "WALKING_DOWNSTAIRS",
    4: "SITTING",
    5: "STANDING",
    6: "LAYING"
}
reverse_activity_labels = {v: k for k, v in activity_labels.items()}

print("Activity Labels Dictionary: ", reverse_activity_labels)

# Load datasets
X_train = np.load('../FinalDataset/X_train.npy')
X_test = np.load('../FinalDataset/X_test.npy')
y_train = np.load('../FinalDataset/y_train.npy')
y_test = np.load('../FinalDataset/y_test.npy')

# Feature extraction and processing steps...

import tsfel

# Extract features using TSFEL
cfg = tsfel.get_features_by_domain()  # Get all features by default
X_train_features = tsfel.time_series_features_extractor(cfg, X_train, verbose=1, fs=50)
X_test_features = tsfel.time_series_features_extractor(cfg, X_test, verbose=1, fs=50)

# Remove highly correlated features
correlated_features = tsfel.correlated_features(X_train_features)
X_train_filtered = X_train_features.drop(correlated_features, axis=1)
X_test_filtered = X_test_features.drop(correlated_features, axis=1)

# Remove low variance features
variance_selector = VarianceThreshold(threshold=0)
X_train_reduced = variance_selector.fit_transform(X_train_filtered)
X_test_reduced = variance_selector.transform(X_test_filtered)

# Normalize features
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train_reduced)
X_test_normalized = scaler.transform(X_test_reduced)

# Apply PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_normalized)
X_test_pca = pca.transform(X_test_normalized)

# Constants
NUM_SAMPLES_PER_ACTIVITY = 3
NUM_TRAINING_EXAMPLES = 18  # Total number of examples for few-shot learning
MAX_RETRIES = 3
RETRY_DELAY = 5

# API Keys
API_KEYS = [
    "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"
]
api_key_index = 0

def get_next_api_key():
    global api_key_index
    key = API_KEYS[api_key_index]
    api_key_index = (api_key_index + 1) % len(API_KEYS)
    return key

def format_data_as_string(data):
    return str(data.tolist())

def create_few_shot_examples(X_train_pca, y_train, activity_labels, num_samples_per_activity):
    few_shot_examples = []
    for activity, label in activity_labels.items():
        indices = np.where(y_train == activity)[0]
        selected_indices = np.random.choice(indices, num_samples_per_activity, replace=False)
        for idx in selected_indices:
            example_data = X_train_pca[idx]
            example_str = format_data_as_string(example_data)
            few_shot_examples.append((example_str, label))
    return shuffle(few_shot_examples)

def create_prompt(few_shot_examples, data_str):
    example_strs = "\n".join([f"    - Example {i+1}: {ex[0]} -> {ex[1]}" for i, ex in enumerate(few_shot_examples)])
    
    prompt = f"""
    You are a highly trained human activity classification model.

    Your task is to analyze the given accelerometer data and classify the human activity into one of the following categories:
    - WALKING
    - WALKING_UPSTAIRS
    - WALKING_DOWNSTAIRS
    - SITTING
    - STANDING
    - LAYING

    Here is the accelerometer data provided:
    - You have 500 readings, each containing three accelerometer values: (acceleration_x, acceleration_y, acceleration_z).
    - The data is collected over a 10-second period at a sampling rate of 50 Hz, which gives 500 readings.
    - I have used the TSFEL library to reduce the dataset to 116 features.
    Data Format:
    - The data is provided as a nested list. Each inner list represents a single reading: (acceleration_x, acceleration_y, acceleration_z).
    - The x component represents depth direction, measured in terms of g-force.
    - The y component represents sideways direction, measured in terms of g-force.
    - The z component represents forward direction, measured in terms of g-force.

    Here are a few examples:{example_strs}

    Please analyze the examples extensively and provide the most likely activity label for the below data from the list above.

    Provide ONLY the classification label (from the given options above) as output.

    Data: {data_str}
    """
    return prompt

def create_balanced_subset(X_test_pca, y_test, activity_labels):
    min_samples_per_class = min(np.sum(y_test == activity) for activity in activity_labels.keys())
    balanced_X_test = []
    balanced_y_test = []
    
    for activity in activity_labels.keys():
        indices = np.where(y_test == activity)[0]
        selected_indices = np.random.choice(indices, min_samples_per_class, replace=False)
        balanced_X_test.extend(X_test_pca[selected_indices])
        balanced_y_test.extend(y_test[selected_indices])
    
    return np.array(balanced_X_test), np.array(balanced_y_test)

def predict_activity_labels(X_test_pca, y_test, few_shot_examples, activity_labels, reverse_activity_labels, model_name):
    predictions = []
    balanced_X_test, balanced_y_test = create_balanced_subset(X_test_pca, y_test, activity_labels)
    
    # Use exactly 3 samples from each activity for prediction
    test_subset_indices = []
    for activity in activity_labels.keys():
        indices = np.where(balanced_y_test == activity)[0]
        selected_indices = np.random.choice(indices, NUM_SAMPLES_PER_ACTIVITY, replace=False)
        test_subset_indices.extend(selected_indices)
    
    test_subset = balanced_X_test[test_subset_indices]
    test_subset_labels = balanced_y_test[test_subset_indices]
    
    for i in range(len(test_subset)):
        test_example = test_subset[i]
        test_data_str = format_data_as_string(test_example)
        prompt = create_prompt(few_shot_examples, test_data_str)
        
        api_key = get_next_api_key()
        llm = ChatGroq(model=groq_models[model_name], api_key=api_key, temperature=0)

        for attempt in range(MAX_RETRIES):
            try:
                response = llm.invoke(prompt)
                print(response.usage_metadata)
                predicted_label = response.content.strip()
                activity_number = reverse_activity_labels.get(predicted_label, -1)
                actual_activity = activity_labels[test_subset_labels[i]]
                print(f"Predicted Activity: {predicted_label} | Actual Activity: {actual_activity}")
                predictions.append(activity_number)
                break
            except Exception as e:
                print(f"Error: {e}. Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
    
    return predictions

# Create balanced few-shot examples
few_shot_examples = create_few_shot_examples(X_train_pca, y_train, activity_labels, NUM_SAMPLES_PER_ACTIVITY)

# Predict activities
model_name = "llama3.1-70b"
predicted_labels = predict_activity_labels(X_test_pca, y_test, few_shot_examples, activity_labels, reverse_activity_labels, model_name)

# Calculate accuracy
correct_predictions = sum(predicted_labels[i] == y_test[i] for i in range(len(predicted_labels)))
accuracy = correct_predictions / len(predicted_labels)
print(f"Accuracy: {accuracy:.2f}")
print(f"Accuracy Percentage: {accuracy * 100:.2f}%")


In [3]:
import random
import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import time
from langchain_groq import ChatGroq  # Assuming you are using the langchain_groq package
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Define the model mapping
groq_models = {
    "llama3-70b": "llama3-70b-8192",
    "mixtral": "mixtral-8x7b-32768",
    "gemma-7b": "gemma-7b-it",
    "llama3.1-70b": "llama-3.1-70b-versatile",
    "llama3-8b": "llama3-8b-8192",
    "llama3.1-8b": "llama-3.1-8b-instant",
    "gemma-9b": "gemma2-9b-it"
}

activity_labels = {
    1: "WALKING",
    2: "WALKING_UPSTAIRS",
    3: "WALKING_DOWNSTAIRS",
    4: "SITTING",
    5: "STANDING",
    6: "LAYING"
}
reverse_activity_labels = {v: k for k, v in activity_labels.items()}

print("Activity Labels Dictionary: ", reverse_activity_labels)

# Load datasets
X_train = np.load('../FinalDataset/X_train.npy')
X_test = np.load('../FinalDataset/X_test.npy')
y_train = np.load('../FinalDataset/y_train.npy')
y_test = np.load('../FinalDataset/y_test.npy')

# Feature extraction and processing steps...

import tsfel

# Extract features using TSFEL
cfg = tsfel.get_features_by_domain()  # Get all features by default
X_train_features = tsfel.time_series_features_extractor(cfg, X_train, verbose=1, fs=50)
X_test_features = tsfel.time_series_features_extractor(cfg, X_test, verbose=1, fs=50)

# Remove highly correlated features
correlated_features = tsfel.correlated_features(X_train_features)
X_train_filtered = X_train_features.drop(correlated_features, axis=1)
X_test_filtered = X_test_features.drop(correlated_features, axis=1)

# Remove low variance features
variance_selector = VarianceThreshold(threshold=0)
X_train_reduced = variance_selector.fit_transform(X_train_filtered)
X_test_reduced = variance_selector.transform(X_test_filtered)

# Normalize features
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train_reduced)
X_test_normalized = scaler.transform(X_test_reduced)

# Apply PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_normalized)
X_test_pca = pca.transform(X_test_normalized)

# Constants
NUM_SAMPLES_PER_ACTIVITY = 3
NUM_TRAINING_EXAMPLES = 18 # Total number of examples for few-shot learning
MAX_RETRIES = 3
RETRY_DELAY = 5

# # API Keys
# API_KEYS = [
#     "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"
# ]

API_KEYS = ["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",
               "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",
               "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",
                "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"
]
api_key_index = 0

def get_next_api_key():
    global api_key_index
    key = API_KEYS[api_key_index]
    api_key_index = (api_key_index + 1) % len(API_KEYS)
    return key

def format_data_as_string(data):
    return str(data.tolist())

def create_few_shot_examples(X_train_pca, y_train, activity_labels, num_samples_per_activity):
    few_shot_examples = []
    for activity, label in activity_labels.items():
        indices = np.where(y_train == activity)[0]
        selected_indices = np.random.choice(indices, num_samples_per_activity, replace=False)
        for idx in selected_indices:
            example_data = X_train_pca[idx]
            example_str = format_data_as_string(example_data)
            few_shot_examples.append((example_str, label))
    return shuffle(few_shot_examples)

def create_prompt(few_shot_examples, data_str):
    example_strs = "\n".join([f"    - Example {i+1}: {ex[0]} -> {ex[1]}" for i, ex in enumerate(few_shot_examples)])
    
    prompt = f"""
    You are a highly trained human activity classification model.

    Your task is to analyze the given accelerometer data and classify the human activity into one of the following categories:
    - WALKING
    - WALKING_UPSTAIRS
    - WALKING_DOWNSTAIRS
    - SITTING
    - STANDING
    - LAYING

    Here is the accelerometer data provided:
    - You have 500 readings, each containing three accelerometer values: (acceleration_x, acceleration_y, acceleration_z).
    - The data is collected over a 10-second period at a sampling rate of 50 Hz, which gives 500 readings.
    - I have used the TSFEL library to reduce the dataset to 116 features.
    Data Format:
    - The data is provided as a nested list. Each inner list represents a single reading: (acceleration_x, acceleration_y, acceleration_z).
    - The x component represents depth direction, measured in terms of g-force.
    - The y component represents sideways direction, measured in terms of g-force.
    - The z component represents forward direction, measured in terms of g-force.

    Here are a few examples:{example_strs}

    Please analyze the examples extensively and provide the most likely activity label for the below data from the list above.

    Provide ONLY the classification label (from the given options above) as output.

    Data: {data_str}
    """
    return prompt

def create_balanced_subset(X_test_pca, y_test, activity_labels):
    min_samples_per_class = min(np.sum(y_test == activity) for activity in activity_labels.keys())
    balanced_X_test = []
    balanced_y_test = []
    
    for activity in activity_labels.keys():
        indices = np.where(y_test == activity)[0]
        selected_indices = np.random.choice(indices, min_samples_per_class, replace=False)
        balanced_X_test.extend(X_test_pca[selected_indices])
        balanced_y_test.extend(y_test[selected_indices])
    
    return np.array(balanced_X_test), np.array(balanced_y_test)

def predict_activity_labels(X_test_pca, y_test, few_shot_examples, activity_labels, reverse_activity_labels, model_name):
    predictions = []
    balanced_X_test, balanced_y_test = create_balanced_subset(X_test_pca, y_test, activity_labels)
    
    # Use exactly 3 samples from each activity for prediction
    test_subset_indices = []
    for activity in activity_labels.keys():
        indices = np.where(balanced_y_test == activity)[0]
        selected_indices = np.random.choice(indices, NUM_SAMPLES_PER_ACTIVITY, replace=False)
        test_subset_indices.extend(selected_indices)
    
    test_subset = balanced_X_test[test_subset_indices]
    test_subset_labels = balanced_y_test[test_subset_indices]
    
    for i in range(len(test_subset)):
        test_example = test_subset[i]
        test_data_str = format_data_as_string(test_example)
        prompt = create_prompt(few_shot_examples, test_data_str)
        
        api_key = get_next_api_key()
        llm = ChatGroq(model=groq_models[model_name], api_key=api_key, temperature=0)

        for attempt in range(MAX_RETRIES):
            try:
                response = llm.invoke(prompt)
                print(response.usage_metadata)
                predicted_label = response.content.strip()
                activity_number = reverse_activity_labels.get(predicted_label, -1)
                actual_activity = activity_labels[test_subset_labels[i]]
                print(f"Predicted Activity: {predicted_label} | Actual Activity: {actual_activity}")
                predictions.append(activity_number)
                break
            except Exception as e:
                print(f"Error: {e}. Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
    
    return np.array(predictions), balanced_y_test[test_subset_indices]

def plot_confusion_matrix(y_true, y_pred, activity_labels):
    # Create confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=list(activity_labels.keys()))
    
    # Create a heatmap for the confusion matrix
    plt.figure(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(activity_labels.values()))
    disp.plot(cmap=plt.cm.Blues, values_format='d')
    plt.title('Confusion Matrix')
    plt.show()

# Create balanced few-shot examples
few_shot_examples = create_few_shot_examples(X_train_pca, y_train, activity_labels, NUM_SAMPLES_PER_ACTIVITY)

# Predict activities
model_name = "llama3.1-70b"
predicted_labels, true_labels = predict_activity_labels(X_test_pca, y_test, few_shot_examples, activity_labels, reverse_activity_labels, model_name)




Activity Labels Dictionary:  {'WALKING': 1, 'WALKING_UPSTAIRS': 2, 'WALKING_DOWNSTAIRS': 3, 'SITTING': 4, 'STANDING': 5, 'LAYING': 6}
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
{'input_tokens': 895, 'output_tokens': 4, 'total_tokens': 899}
Predicted Activity: WALKING | Actual Activity: WALKING
{'input_tokens': 896, 'output_tokens': 4, 'total_tokens': 900}
Predicted Activity: WALKING | Actual Activity: WALKING
{'input_tokens': 896, 'output_tokens': 8, 'total_tokens': 904}
Predicted Activity: WALKING_UPSTAIRS | Actual Activity: WALKING
{'input_tokens': 895, 'output_tokens': 4, 'total_tokens': 899}
Predicted Activity: WALKING | Actual Activity: WALKING_UPSTAIRS
{'input_tokens': 896, 'output_tokens': 4, 'total_tokens': 900}
Predicted Activity: WALKING | Actual Activity: WALKING_UPSTAIRS
{'input_tokens': 896, 'output_tokens': 4, 'total_tokens': 900}
Predicted Activity: WALKING | Actual Activity: WALKING_UPSTAIRS
{'input_tokens': 896, 'output_tokens': 8, 'total_tokens': 904}
Predicted Activity: WALKING_DOWNSTAIRS | Actual Activity: WALKING_DOWNSTAIRS
{'input_tokens': 896, 'output_tokens': 8, 'total_tokens': 904}
Predicted Activi

In [2]:
# Calculate accuracy
correct_predictions = sum(predicted_labels[i] == true_labels[i] for i in range(len(predicted_labels)))
accuracy = correct_predictions / len(predicted_labels)
print(f"Accuracy: {accuracy:.2f}")
print(f"Accuracy Percentage: {accuracy * 100:.2f}%")
# plot_confusion_matrix(true_labels, predicted_labels, activity_labels)

Accuracy: 0.47
Accuracy Percentage: 47.06%


ValueError: Found input variables with inconsistent numbers of samples: [18, 17]

In [None]:
import random
import numpy as np
from sklearn.utils import shuffle
import time
from langchain_groq import ChatGroq  # Assuming you are using the langchain_groq package


# Define the model mapping
groq_models = {
    "llama3-70b": "llama3-70b-8192",
    "mixtral": "mixtral-8x7b-32768",
    "gemma-7b": "gemma-7b-it",
    "llama3.1-70b": "llama-3.1-70b-versatile",
    "llama3-8b": "llama3-8b-8192",
    "llama3.1-8b": "llama-3.1-8b-instant",
    "gemma-9b": "gemma2-9b-it"
}

activity_labels = {
    1: "WALKING",
    2: "WALKING_UPSTAIRS",
    3: "WALKING_DOWNSTAIRS",
    4: "SITTING",
    5: "STANDING",
    6: "LAYING"
}
reverse_activity_labels = {v: k for k, v in activity_labels.items()}

print("Activity Labels Dictionary: ", reverse_activity_labels)

# Load datasets
X_train = np.load('../FinalDataset/X_train.npy')
X_test = np.load('../FinalDataset/X_test.npy')
y_train = np.load('../FinalDataset/y_train.npy')
y_test = np.load('../FinalDataset/y_test.npy')
# Load data
# X_train_tsfel_reduced = np.load('../FinalDataset/X_train_tsfel_reduced.npy')
# X_test_tsfel_reduced = np.load('../FinalDataset/X_test_tsfel_reduced.npy')

print("Training data shape: ", X_train_tsfel_pca.shape)
print("Testing data shape: ", X_test_tsfel_pca.shape)

# Constants
num_examples_per_class = 5
num_samples = 20
max_retries = 3
retry_delay = 5

def shuffle_data(X_test, y_test):
    # Generate a random permutation of the indices
    permutation = np.random.permutation(len(X_test))
    
    # Apply the permutation to shuffle both X_test and y_test
    X_test_shuffled = X_test[permutation]
    y_test_shuffled = y_test[permutation]
    
    return X_test_shuffled, y_test_shuffled

# Example usage:
X_test_tsfel_pca_shuffled, y_test_shuffled = shuffle_data(X_test_tsfel_pca, y_test)

# API Keys and Index
Groq_Tokens = ["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"]
current_key_index = 0

def get_next_api_key():
    global current_key_index
    api_key = Groq_Tokens[current_key_index]
    current_key_index = (current_key_index + 1) % len(Groq_Tokens)
    return api_key

def format_data_for_prompt(data):
    # Example formatting function; adjust as needed
    return str(data.tolist())

# def create_few_shot_examples(X_train, y_train, activity_dict, num_examples_per_class):
#     examples = []
#     for activity, label in activity_dict.items():
#         class_indices = np.where(y_train == activity)[0]
#         selected_indices = np.random.choice(class_indices, num_examples_per_class, replace=False)
#         for idx in selected_indices:
#             data_example = X_train[idx]
#             data_str = format_data_for_prompt(data_example)
#             examples.append((data_str, label))
#     return shuffle(examples)

def add_class_examples(X_train, y_train, activity_dict, num_samples_per_class=4):
    examples = []
    for activity, label in activity_dict.items():
        class_indices = np.where(y_train == activity)[0]
        class_samples = np.random.choice(class_indices, num_samples_per_class, replace=False)
        for idx in class_samples:
            data_example = X_train[idx]
            data_str = format_data_for_prompt(data_example)
            examples.append((data_str, label))
    return shuffle(examples)

def generate_prompt(examples, data_str):
    output = "\n"
    for j, (example_input, example_output) in enumerate(examples):
        output += f"    - Example {j+1}: {example_input} -> {example_output}\n"
    # print(output)
    prompt = f"""
    You are a highly trained human activity classification model.

    Your task is to analyze the given accelerometer data and classify the human activity into one of the following categories:
    - WALKING
    - WALKING_UPSTAIRS
    - WALKING_DOWNSTAIRS
    - SITTING
    - STANDING
    - LAYING

    Here is the processed data provided:
    - You have two principal components: (principal_component_1, principal_component_2).
    - The data was originally derived from accelerometer readings collected over a 10-second period at a sampling rate of 50 Hz.
    - The principal components capture the most significant variance in the accelerometer data after dimensionality reduction using PCA.

    Here are a few examples:{output}
    
    Please analyze the examples extensively and provide the most likely activity label for the below data from the list above.
    
    Provide ONLY the classification label (from the given options above) as output.

    Data: {data_str}
    """
    return prompt

def predict_activity(X_test, examples, activity_dict, activity_reverse_dict, model_name):
    predictions = []
    count = 0
    for i in range(num_samples):
        X_i = X_test[i]
        data_str = format_data_for_prompt(X_i)
        prompt = generate_prompt(examples, data_str)
        
        api_key = get_next_api_key()
        llm = ChatGroq(model=groq_models[model_name], api_key=api_key, temperature=0)

        for attempt in range(max_retries):
            try:
                answer = llm.invoke(prompt)
                count+=answer.usage_metadata['total_tokens']
                predicted_activity = answer.content.strip()
                activity_number = activity_reverse_dict.get(predicted_activity, -1)
                # Get actual label
                actual_activity = activity_dict[y_test_shuffled[i]]
                # Print predicted and actual activity
                print(f"Predicted Activity: {predicted_activity} | Actual Activity: {actual_activity}")
                predictions.append(activity_number)
                break
            except Exception as e:
                print(f"Error: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
    print("Total Tokens Used: ",count)
    return predictions

# Create balanced few-shot examples
# examples = create_few_shot_examples(X_train_tsfel_pca, y_train, activity_dict, num_examples_per_class)
examples = add_class_examples(X_train_tsfel_pca, y_train, activity_dict, num_samples_per_class=10)

model_name = "llama3.1-70b"
# Predict activities
predictions = predict_activity(X_test_tsfel_pca_shuffled, examples, activity_dict, activity_reverse_dict, model_name)

# Calculate accuracy
correct = sum(pred == y_test[i] for i, pred in enumerate(predictions))
accuracy = correct / len(predictions)
print(f"Accuracy: {accuracy:.2f}")
print(f"Accuracy Percentage: {accuracy * 100:.2f}%")