In [1]:
# Import necessary libraries
import os
import glob
import time
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

In [2]:
# Locate the data directory relative to the notebook. This assumes the kernel's
# working directory is the Q2 folder, so the data lives in its sibling 'new_data'.
folder_path = os.path.join('..', 'new_data')

# Collect every CSV file in the folder. glob returns paths in arbitrary
# (filesystem-dependent) order, so sort them: the row order of the combined
# frame -- and everything derived from it -- is then the same on every machine.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder is missing or empty.
if not csv_files:
    raise FileNotFoundError(f'No CSV files found in {os.path.abspath(folder_path)}')

# Load each CSV file into its own DataFrame
all_data = [pd.read_csv(csv_file) for csv_file in csv_files]

# Combine all data into a single DataFrame. ignore_index=True gives the result
# one continuous 0..n-1 index instead of repeating each file's own row numbers.
combined_data = pd.concat(all_data, ignore_index=True)

In [3]:
# Build the modelling table WITHOUT mutating `combined_data`, so this cell can be
# re-run safely. (Assigning the column and calling dropna in place dropped one
# more real row on every re-run, because a freshly shifted column always starts
# with a NaN.)
#
# Activity_Lag is the label of the previous row of the combined frame.
# NOTE(review): the shift runs over the concatenated frame, so the first row of
#   each file receives the last label of the preceding file -- confirm this is
#   acceptable or compute the lag per file.
# NOTE(review): the lag is derived from the ground-truth label and the split
#   below shuffles rows, so neighbouring rows end up on both sides of the split;
#   the reported scores may be optimistic -- confirm this is intended.
lagged_data = combined_data.assign(Activity_Lag=combined_data['label'].shift(1))

# Drop every row that contains a NaN in any column. This always removes the
# first row (it has no predecessor) and also any row with other missing values.
model_data = lagged_data.dropna()

# Select relevant features (everything except the target and the timestamp)
# and the target variable, as plain NumPy arrays
features = model_data.drop(columns=['label', 'timestamp']).to_numpy()
target = model_data['label'].to_numpy()

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
 

In [4]:
# Create a Random Forest Classifier: 100 trees, fixed seed, all available cores
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Time the fit with perf_counter: it is monotonic and high-resolution, whereas
# time.time() follows the wall clock and can jump if the system clock is adjusted.
start = time.perf_counter()

# Train the classifier
rf_classifier.fit(X_train, y_train)

end = time.perf_counter()
print(f'Training completed in : {end-start}')

# Make predictions on the test set. The prediction gets its own start mark so
# the time spent in the print above is not counted as prediction time.
pred_start = time.perf_counter()
y_pred = rf_classifier.predict(X_test)
print(f'Prediction completed in: {time.perf_counter()-pred_start}')

Training completed in : 45.14632534980774
Prediction completed in: 0.7745904922485352


In [5]:
# Score the held-out predictions: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report all three metrics
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:", conf_matrix, sep="\n")

Accuracy: 0.99

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     24235
           1       1.00      1.00      1.00      5758
           2       0.90      0.90      0.90      5057
           3       0.97      0.97      0.97      1533
           4       0.95      0.96      0.96      1381
           5       0.97      0.97      0.97     14826
           6       1.00      1.00      1.00     57972
           7       1.00      1.00      1.00      8523
           8       1.00      1.00      1.00      7848
           9       0.99      0.99      0.99      1105
          10       0.97      0.98      0.98       841
          11       1.00      0.98      0.99       150

    accuracy                           0.99    129229
   macro avg       0.98      0.98      0.98    129229
weighted avg       0.99      0.99      0.99    129229

Confusion Matrix:
[[23936     8   127    25    39    83    14     1     2     0     0     0]


In [6]:
num_samples = 10

# Seeded generator so the same test rows are shown on every run
# (42 matches the seed used for the split and the classifier).
rng = np.random.default_rng(42)

# Draw distinct row positions from X_test: sampling without replacement avoids
# showing the same row twice, and the size is capped at the test-set length so
# the draw cannot fail when fewer than num_samples rows are available.
random_indices = rng.choice(len(X_test), size=min(num_samples, len(X_test)), replace=False)

# Predict all selected rows in a single batched call instead of one call per row
predicted_labels = rf_classifier.predict(X_test[random_indices])

for i, (random_index, predicted_label) in enumerate(zip(random_indices, predicted_labels)):
    # Kept as a (1, n_features) row so the printed layout is unchanged
    random_sample = X_test[random_index].reshape(1, -1)
    actual_label = y_test[random_index]

    # Display the random sample and the label prediction; the predicted label
    # is printed as a scalar, in the same form as the actual label
    print(f'\nSample {i+1}:')
    print(f'Random Sample Index: {random_index}')
    print(f'Input Features: {random_sample}')
    print(f'Actual Label: {actual_label}')
    print(f'Predicted Label: {predicted_label}')


Sample 1:
Random Sample Index: 53911
Input Features: [[-0.8439 -0.031  -0.6563 -0.3606  0.1908  0.9126  6.    ]]
Actual Label: 6
Predicted Label: [6]

Sample 2:
Random Sample Index: 13818
Input Features: [[-0.9849  0.0312 -0.063  -1.0312 -0.0625  0.0603  5.    ]]
Actual Label: 5
Predicted Label: [5]

Sample 3:
Random Sample Index: 4005
Input Features: [[-1.0039  0.0157 -0.0613 -0.9123  0.0674 -0.4152  5.    ]]
Actual Label: 5
Predicted Label: [5]

Sample 4:
Random Sample Index: 84968
Input Features: [[-1.0168  0.0466  0.0106 -0.9652 -0.0665 -0.2778  5.    ]]
Actual Label: 5
Predicted Label: [5]

Sample 5:
Random Sample Index: 78466
Input Features: [[ 0.1982  0.8949 -0.4095 -0.1912 -0.9659  0.1659  7.    ]]
Actual Label: 7
Predicted Label: [7]

Sample 6:
Random Sample Index: 94934
Input Features: [[-0.8438 -0.0251 -0.6609 -0.3443  0.2224  0.9225  6.    ]]
Actual Label: 6
Predicted Label: [6]

Sample 7:
Random Sample Index: 35872
Input Features: [[-1.0065  0.0664 -0.2532 -1.2807  0.1642

In [7]:
# 10-fold cross-validation on the training split, folds evaluated by 5 parallel
# workers; the mean of the per-fold scores is printed.
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=10, n_jobs=5)
print(cv_scores.mean())

0.9891103299997729
