## Task 4 : Data Collection in the Wild [4 marks]

In [1]:
import pandas as pd
import os

# Define subjects
subjects = ['LAKSH', 'RUDRA', 'PARTHIV']

# Define activities

activities = ['LAYING', 'SITTING', 'STANDING', 'WALKING', 'WALKING_DOWNSTAIRS', 'WALKING_UPSTAIRS']

# Function to load the first 500 rows of each dataset
def load_data(subjects, activities):
    data_list = []
    for subject in subjects:
        for activity in activities:
            file_name = f"{subject}_{activity}.csv"
            file_path = os.path.join(os.getcwd(), file_name)
            print(f"Checking file: {file_path}")  # Debugging line
            if os.path.exists(file_path):
                # Read the first 500 rows
                df = pd.read_csv(file_path, nrows=500)
                df['Activity'] = activity
                df['Subject'] = subject
                data_list.append(df)
            else:
                print(f"File not found: {file_path}")
    return pd.concat(data_list, ignore_index=True)

# Load and process data
combined_data_df = load_data(subjects, activities)

# Save combined DataFrame to CSV if needed
combined_data_df.to_csv('combined_taken_data.csv', index=False)

print("Data merging complete. The combined dataset (first 500 rows) has been created and saved.")


Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/LAKSH_LAYING.csv
Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/LAKSH_SITTING.csv
Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/LAKSH_STANDING.csv
Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/LAKSH_WALKING.csv
Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/LAKSH_WALKING_DOWNSTAIRS.csv
Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/LAKSH_WALKING_UPSTAIRS.csv
Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/RUDRA_LAYING.csv
Checking file: /Users/na/Machine Learning Assignment/Assignment-1_Task_2/ES335-assignment-1/HAR/task4/RUDRA_SITTING.csv
Checking file: /Users

In [2]:
combined_data_df

Unnamed: 0,time,gFx,gFy,gFz,TgF,Activity,Subject
0,0.010562,0.3288,-0.3991,-0.8522,1.00,LAYING,LAKSH
1,0.036925,0.3302,-0.4017,-0.8520,1.00,LAYING,LAKSH
2,0.042296,0.3319,-0.4014,-0.8506,1.00,LAYING,LAKSH
3,0.058767,0.3346,-0.4027,-0.8482,1.00,LAYING,LAKSH
4,0.092472,0.3361,-0.4061,-0.8498,1.00,LAYING,LAKSH
...,...,...,...,...,...,...,...
8995,10.036258,0.6290,0.1049,0.2126,0.67,WALKING_UPSTAIRS,PARTHIV
8996,10.062572,0.6412,0.0711,0.2192,0.68,WALKING_UPSTAIRS,PARTHIV
8997,10.079894,0.6825,0.1017,0.2325,0.73,WALKING_UPSTAIRS,PARTHIV
8998,10.097549,0.7275,0.1801,0.2133,0.78,WALKING_UPSTAIRS,PARTHIV


In [3]:
# Get the unique subjects and activities
subjects = combined_data_df['Subject'].unique()
activities = combined_data_df['Activity'].unique()

# Initialize an empty DataFrame to store the trimmed data
trimmed_combined_data_df = pd.DataFrame()

# Iterate over each subject
for subject in subjects:
    # Filter data for the current subject
    subject_data = combined_data_df[combined_data_df['Subject'] == subject]
    
    # Iterate over each activity
    for activity in activities:
        # Filter data for the current activity of the current subject
        activity_data = subject_data[subject_data['Activity'] == activity]
        
        # Calculate total duration
        total_duration = activity_data['time'].iloc[-1] - activity_data['time'].iloc[0]
        desired_duration = 10
        trim_duration = (total_duration - desired_duration) / 2
        
        # Calculate start and end times for trimming
        st = activity_data['time'].iloc[0] + trim_duration
        et = activity_data['time'].iloc[-1] - trim_duration
        
        # Trim the data for the current activity and subject
        trimmed_data = activity_data[(activity_data['time'] >= st) & (activity_data['time'] <= et)]
        
        # Append the trimmed data to the main DataFrame
        trimmed_combined_data_df = pd.concat([trimmed_combined_data_df, trimmed_data])

# Assign the trimmed DataFrame back to combined_data_df
combined_data_df = trimmed_combined_data_df.reset_index(drop=True)

# Print the final trimmed DataFrame
display(combined_data_df[(combined_data_df['Subject']=='LAKSH') & (combined_data_df['Activity']=='LAYING')])


Unnamed: 0,time,gFx,gFy,gFz,TgF,Activity,Subject
0,0.092472,0.3361,-0.4061,-0.8498,1.00,LAYING,LAKSH
1,0.109648,0.3321,-0.4076,-0.8547,1.00,LAYING,LAKSH
2,0.127517,0.3305,-0.4095,-0.8591,1.01,LAYING,LAKSH
3,0.141784,0.3293,-0.4107,-0.8542,1.00,LAYING,LAKSH
4,0.158437,0.3290,-0.4079,-0.8498,1.00,LAYING,LAKSH
...,...,...,...,...,...,...,...
488,9.989899,0.3342,-0.4042,-0.8517,1.00,LAYING,LAKSH
489,10.007281,0.3339,-0.4042,-0.8501,1.00,LAYING,LAKSH
490,10.025262,0.3351,-0.4024,-0.8491,1.00,LAYING,LAKSH
491,10.041070,0.3366,-0.4035,-0.8496,1.00,LAYING,LAKSH


1. Use the Decision Tree model trained on the UCI-HAR dataset to predict the activities that you performed. Report the accuracy, precision, recall and confusion matrix of the model. You have three version of UCI dataset you can use a)Raw data from accelerometer, b)TSFEL featurised data, c)Features provided by author. Choose which version to use, ensuring that your test data is similar to your training data. How did the model perform? **[1 marks]**

In [4]:
curr_dir = os.getcwd()
working_dir = os.path.join(curr_dir , "..")
os.chdir(working_dir)

from Dataset.MakeDataset import X,y
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# print(X)
X = X.mean(axis=1)
X_list = []
y_list = []


dt = DecisionTreeClassifier(criterion='entropy', max_depth=7)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=20,stratify=y)
dt.fit(X_train, y_train)
y_predicted = dt.predict(X_test)
y_predicted = pd.DataFrame({'Label':y_predicted})

accuracy = accuracy_score(y_test, y_predicted)

precision = precision_score(y_test, y_predicted, average='weighted')
recall = recall_score(y_test, y_predicted, average='weighted')
conf_matrix = confusion_matrix(y_test, y_predicted)

print('accuracy: ', accuracy)

Training data shape:  (126, 500, 3)
Testing data shape:  (54, 500, 3)
Training output data shape:  (126,)
Testing output data shape:  (54,)
accuracy:  0.6111111111111112


In [5]:
X = combined_data_df.iloc[:,1:4]
# X = X.sample()
y = combined_data_df.iloc[:,-2]
# print(X)
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)



y_predict = dt.predict(X_test)
y_test = np.array(y_test)
activities = ['LAYING', 'SITTING', 'STANDING', 'WALKING', 'WALKING_DOWNSTAIRS', 'WALKING_UPSTAIRS']


y_predict_new = []
for i in range(len(y_predict)):
	y_predict_new.append(activities[y_predict[i]-1])

# print(y_predict)
y_predict = np.array(y_predict_new)
accuracy = 0
for i in range(len(y_predict)):
	if y_predict[i]==y_test[i]:
		accuracy +=1
accuracy = accuracy/len(y_test)
print('accuracy: ',accuracy)

accuracy:  0.18422535211267604


2. Use the data you collected to predict the activities that you performed. Decide whether to apply preprocessing and featurization, and if so, choose the appropriate methods. How did the model perform? **[1 marks]**

In [6]:
dt = DecisionTreeClassifier()

X_train = combined_data_df.iloc[:,0:-2]
y_train = combined_data_df.iloc[:,-2]
# print(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)


dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_predict = dt.predict(X_test)
y_test = np.array(y_test)

accuracy = 0
for i in range(len(y_test)):
    if (y_test[i]==y_predict[i]):
        accuracy +=1

accuracy = accuracy*100/len(y_test)
print(accuracy)

76.84507042253522


Without Leaking Subject data in Test set

In [7]:
dt = DecisionTreeClassifier()

combined_data_df_by_subject_train = combined_data_df[(combined_data_df['Subject']=='PARTHIV') | (combined_data_df['Subject']=='LAKSH')] 
combined_data_df_by_subject_test = combined_data_df[(combined_data_df['Subject']=='RUDRA')] 


X_train = combined_data_df_by_subject_train.iloc[:,0:-2]
y_train = combined_data_df_by_subject_train.iloc[:,-2]
X_test = combined_data_df_by_subject_test.iloc[:,0:-2]
y_test = combined_data_df_by_subject_test.iloc[:,-2]
# print(X)
# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)


dt = DecisionTreeClassifier(max_depth=7)
dt.fit(X_train, y_train)
y_predict = dt.predict(X_test)
y_test = np.array(y_test)

accuracy = 0
for i in range(len(y_test)):
    if (y_test[i]==y_predict[i]):
        accuracy +=1

accuracy = accuracy*100/len(y_test)
print('accuracy: ', accuracy)

accuracy:  43.948613928329955


In [8]:
import tsfel
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Assuming X and y are already loaded as pandas DataFrames or NumPy arrays

# If y is not DataFrames, convert them
y = pd.DataFrame(y, columns=['Target'])

# Feature extraction with TSFEL
cfg_file = tsfel.get_features_by_domain()  # Use default configuration file
features = tsfel.time_series_features_extractor(cfg_file, X)

# Concatenate the extracted features with the target variable
features_with_target = pd.concat([features, y], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_with_target.drop(columns='Target'), features_with_target['Target'], test_size=0.2, random_state=42)

# Initialize and train the Decision Tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = clf.predict(X_test)

y_test = np.array(y_test)
y_test

accuracy = 0
for i in range(len(y_test)):
    if (y_test[i]==y_pred[i]):
        accuracy +=1

accuracy = accuracy*100/len(y_test)
print('accuracy: ', accuracy)

*** Feature extraction started ***


  features = tsfel.time_series_features_extractor(cfg_file, X)



*** Feature extraction finished ***
accuracy:  15.492957746478874


TSFEL is giving less result than original dataset therefore there is no need of featurization

3. Use the Few-Shot prompting method using UCI-HAR dataset to predict the activities that you performed. Ensure that both your examples and test query undergo similar preprocessing. How did the model perform? **[1 marks]**

In [9]:
import pandas as pd 
from langchain_groq.chat_models import ChatGroq
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the API key from the environment
groq_api_key = os.getenv("GROQ_API_KEY")
model_name = os.getenv("MODEL_NAME")

groq_models = {
    "llama3-70b": "llama3-70b-8192",
    "mixtral": "mixtral-8x7b-32768",
    "gemma-7b": "gemma-7b-it",
    "llama3.1-70b": "llama-3.1-70b-versatile",
    "llama3-8b": "llama3-8b-8192",
    "llama3.1-8b": "llama-3.1-8b-instant",
    "gemma-9b": "gemma2-9b-it"
}
llm = ChatGroq(model=groq_models[model_name] , api_key=groq_api_key, temperature=0.4)

>TRAINED ON GIVEN DATASET, TESTED ON COLLECTED DATA

In [12]:
curr_dir = os.getcwd()
working_dir = os.path.join(curr_dir , "..","task4")
os.chdir(working_dir)

import pandas as pd

# Assuming you have the test data loaded as a DataFrame
test_data_df = pd.read_csv('combined_taken_data.csv')

# Placeholder for storing results
results = []

# Define the activities
activities = ['LAYING', 'STANDING', 'WALKING', 'SITTING', 'WALKING UPSTAIRS', 'WALKING DOWNSTAIRS']

def sample_data(data, sample_rate=10):
    n=500//sample_rate
    return data[100:n+100]

i = 0

# Filter training data for specific subjects
train_subjects = [1, 3, 5, 7]
train_data_df = pd.read_csv('train_data_combined.csv')
train_data_df = train_data_df[train_data_df['Subject'].isin(train_subjects)]

# Iterate through each group by subject and activity
for (subject, activity), group in test_data_df.groupby(['Subject', 'Activity']):
    # Sample data
    accx = sample_data(group['gFx'].tolist())
    accy = sample_data(group['gFy'].tolist())
    accz = sample_data(group['gFz'].tolist())
    
    # Create prompt for the current group
    query_few_shot_Task4 = f"""
    * You are a highly accurate activity classification model.
    * Your task is to classify human activities based on the given accelerometer data.
    * The accelerometer data is provided as mean acceleration values in the x, y, and z directions.
    * You are given data corresponding to six different activities.
    * The possible activities to classify are: LAYING, STANDING, WALKING, SITTING, WALKING UPSTAIRS, and WALKING DOWNSTAIRS."""

    for (train_subject, train_activity), grp in train_data_df.groupby(['Subject', 'Activity']):
        query_few_shot_Task4 += """Here are some examples of accelerometer data and their corresponding activities:"""

        query_few_shot_Task4 += f"""
        * Activity: {train_activity}
          accx = {sample_data(grp['accx'].tolist(), 50)}
          accy = {sample_data(grp['accy'].tolist(), 50)}
          accz = {sample_data(grp['accz'].tolist(), 50)}
        """

    query_few_shot_Task4 += f"""
    * Analyze the accelerometer data and provide the most likely activity label for each case.
    * PRINT ONLY A WORD WHICH IS THE PREDICTED ACTIVITY AND NOTHING ELSE NO CONTENT NO REASON JUST A PREDICTION

    accx = {accx}
    accy = {accy}
    accz = {accz}
    """
    
    # Simulate model prediction (replace with actual model prediction code)
    result = llm.invoke(query_few_shot_Task4)
    
    print(activity, str(result).split(" ")[0][8:].strip("'"))
    if (activity.upper() ==str(result).split(" ")[0][8:].strip("'")):
        i+=1
        print(i)
    results.append({'Subject': subject, 'Activity': activity, 'Prediction': str(result).split(" ")[0][8:].strip("'")})

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results to CSV if needed
results_df.to_csv('model_predictions_fewshot_4.1.csv', index=False)

print("Classification complete. Results have been saved to 'model_predictions.csv'.")

print("accuracy",i/18*100)

LAYING LAYING
1
SITTING STANDING
STANDING STANDING
2
WALKING WALKING
3
WALKING_DOWNSTAIRS WALKING
WALKING_UPSTAIRS WALKING
LAYING LAYING
4
SITTING STANDING
STANDING STANDING
5
WALKING WALKING_UPSTAIRS
WALKING_DOWNSTAIRS WALKING_UPSTAIRS
WALKING_UPSTAIRS WALKING_UPSTAIRS
6
LAYING LAYING
7
SITTING STANDING
STANDING STANDING
8
WALKING WALKING
9
WALKING_DOWNSTAIRS WALKING
WALKING_UPSTAIRS WALKING
Classification complete. Results have been saved to 'model_predictions.csv'.
accuracy 50.0


ACCURACY FOR FEW SHOT IS 50% FOR UCI-HAR TRAINED MODEL

4. Use the Few-Shot prompting method using the data you collected to predict the activities that you performed. Adopt proper processing methods as needed. How did the model perform? **[1 marks]**

> FEW SHOT FOR TRAIN AND TEST BOTH ON THE COLLECTED DATA SET

In [11]:
import pandas as pd

# Assuming you have the test data loaded as a DataFrame
whole_df = pd.read_csv('combined_taken_data.csv')

# Placeholder for storing results
results = []

# Define the activities
activities = ['LAYING', 'STANDING', 'WALKING', 'SITTING', 'WALKING UPSTAIRS', 'WALKING DOWNSTAIRS']

def sample_data(data, sample_rate=10):
    n=500//sample_rate
    return data[100:n+100]

i = 0

# Filter training data for specific subjects

train_data_df = whole_df[whole_df["Subject"] == "LAKSH"]
test_data_df = whole_df[whole_df["Subject"] != "LAKSH"]



# Iterate through each group by subject and activity
for (subject, activity), group in test_data_df.groupby(['Subject', 'Activity']):
    # Sample data
    accx = sample_data(group['gFx'].tolist())
    accy = sample_data(group['gFy'].tolist())
    accz = sample_data(group['gFz'].tolist())
    
    # Create prompt for the current group
    query_few_shot_Task4 = f"""
    * You are a highly accurate activity classification model.
    * Your task is to classify human activities based on the given accelerometer data.
    * The accelerometer data is provided as mean acceleration values in the x, y, and z directions.
    * You are given data corresponding to six different activities.
    * The possible activities to classify are: LAYING, STANDING, WALKING, SITTING, WALKING UPSTAIRS, and WALKING DOWNSTAIRS."""

    for (train_subject, train_activity), grp in train_data_df.groupby(['Subject', 'Activity']):
        query_few_shot_Task4 += """Here are some examples of accelerometer data and their corresponding activities:"""

        query_few_shot_Task4 += f"""
        * Activity: {train_activity}
          accx = {sample_data(grp['gFx'].tolist(), 50)}
          accy = {sample_data(grp['gFy'].tolist(), 50)}
          accz = {sample_data(grp['gFz'].tolist(), 50)}
        """
    
    query_few_shot_Task4 += f"""
    * Analyze the accelerometer data and provide the most likely activity label for each case.
    * PRINT ONLY A WORD WHICH IS THE PREDICTED ACTIVITY AND NOTHING ELSE NO CONTENT NO REASON JUST A PREDICTION

    accx = {accx}
    accy = {accy}
    accz = {accz}
    """
    
    # Simulate model prediction (replace with actual model prediction code)
    result = llm.invoke(query_few_shot_Task4)
    
    print(activity, str(result).split(" ")[0][8:].strip("'"))
    if (activity.upper() ==str(result).split(" ")[0][8:].strip("'")):
        i+=1
        print(i)
    results.append({'Subject': subject, 'Activity': activity, 'Prediction': str(result).split(" ")[0][8:].strip("'")})

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results to CSV if needed
results_df.to_csv('model_predictions_fewshot_4.2.csv', index=False)

print("Classification complete. Results have been saved to 'model_predictions.csv'.")

print("accuracy",i/12*100)

LAYING LAYING
1
SITTING STANDING
STANDING STANDING
2
WALKING WALKING_UPSTAIRS
WALKING_DOWNSTAIRS WALKING_UPSTAIRS
WALKING_UPSTAIRS WALKING_UPSTAIRS
3
LAYING LAYING
4
SITTING STANDING
STANDING STANDING
5
WALKING WALKING_UPSTAIRS
WALKING_DOWNSTAIRS WALKING_UPSTAIRS
WALKING_UPSTAIRS WALKING_UPSTAIRS
6
Classification complete. Results have been saved to 'model_predictions.csv'.
accuracy 50.0


ACCURACY IS 50% FOR BOTH TEST AND TRAIN ON THE COLLECTED DATA