In [1]:
# Import necessary libraries
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

In [2]:
# Load every CSV file from ../new_data into a single DataFrame.
# The relative path assumes the notebook is run from the Q2 directory, so the
# data folder sits next to the current working directory.
folder_path = os.path.join('..', 'new_data')

# glob returns matches in arbitrary, OS-dependent order; sort them so the row
# order of the combined frame (and everything derived from it) is reproducible.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not csv_files:
    # Fail early with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {os.path.abspath(folder_path)!r}")

# Read each file into its own DataFrame
all_data = [pd.read_csv(csv_file) for csv_file in csv_files]

# Stack all files into one frame. ignore_index=True gives every row a unique
# label instead of repeating each file's own 0..n-1 index.
combined_data = pd.concat(all_data, ignore_index=True)
# Optional down-sampling (keep every 10th row) for faster experiments:
#combined_data=combined_data.iloc[::10]

In [3]:
# Create lag features: the label of the previous 1, 2 and 3 rows.
# A new frame is built (instead of mutating combined_data in place) so that
# re-running this cell does not drop three more rows each time.
# NOTE(review): the shift runs over the concatenated frame, so the first rows
# of each file receive lags from the end of the preceding file -- confirm this
# is acceptable, or shift per file instead.
lagged_data = combined_data.assign(
    Activity_Lag1=combined_data['label'].shift(1),
    Activity_Lag2=combined_data['label'].shift(2),
    Activity_Lag3=combined_data['label'].shift(3),
).dropna()
# dropna() removes every row that has a NaN in ANY column: at minimum the first
# three rows (whose lags are undefined), plus any row with a missing value
# elsewhere in the data.

# Select relevant features and target variable
features = lagged_data.drop(['label', 'timestamp'], axis=1).values
target = lagged_data['label'].values

# Split the dataset into training and testing sets (80 % / 20 %, fixed seed).
# NOTE(review): the lag features are ground-truth labels of neighbouring rows
# and the split is a random shuffle, so adjacent time steps end up on both
# sides of the split. The test score is therefore likely optimistic; consider
# a chronological or per-file split for an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
 

In [4]:
# Build a 100-tree random forest with a fixed seed and train it on the
# training split. fit() returns the estimator itself, so construction and
# training are chained into one expression.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict a label for every row of the held-out test split
y_pred = rf_classifier.predict(X_test)
 

In [5]:
# Score the test-set predictions: a per-class precision / recall / F1 table
# and the overall fraction of correctly predicted labels.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show both metrics (accuracy rounded to two decimals)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)

Accuracy: 0.99

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     23921
           1       1.00      1.00      1.00      5786
           2       0.91      0.90      0.91      5036
           3       0.97      0.97      0.97      1498
           4       0.95      0.95      0.95      1349
           5       0.97      0.98      0.98     14889
           6       1.00      1.00      1.00     58181
           7       1.00      1.00      1.00      8605
           8       1.00      1.00      1.00      7891
           9       0.99      0.99      0.99      1089
          10       0.98      0.99      0.98       830
          11       0.99      0.97      0.98       153

    accuracy                           0.99    129228
   macro avg       0.98      0.98      0.98    129228
weighted avg       0.99      0.99      0.99    129228



In [8]:
# Spot-check the trained model on a handful of test rows, printing the input
# features, the true label and the predicted label for each.
num_samples = 10

# Use a seeded generator so the same rows are shown on every run, and sample
# without replacement so no row appears twice (capped at the test-set size).
rng = np.random.default_rng(42)
random_indices = rng.choice(len(X_test), size=min(num_samples, len(X_test)), replace=False)

# Predict all selected rows in one call instead of once per row
predicted_labels = rf_classifier.predict(X_test[random_indices])

for i, random_index in enumerate(random_indices):
    random_sample = X_test[random_index].reshape(1, -1)  # row as a (1, n_features) array
    actual_label = y_test[random_index]
    # Length-1 slice keeps the original "[label]" display format
    predicted_label = predicted_labels[i:i + 1]

    # Display the random sample and the label prediction
    print(f'\nSample {i+1}:')
    print(f'Random Sample Index: {random_index}')
    print(f'Input Features: {random_sample}')
    print(f'Actual Label: {actual_label}')
    print(f'Predicted Label: {predicted_label}')


Sample 1:
Random Sample Index: 13639
Input Features: [[-0.0359 -0.0312  0.8552 -0.1899  0.1189 -1.0625  7.      7.      7.    ]]
Actual Label: 7
Predicted Label: [7]

Sample 2:
Random Sample Index: 19524
Input Features: [[-0.8297  0.1018 -0.5769 -0.4532  0.214   0.8763  6.      6.      6.    ]]
Actual Label: 6
Predicted Label: [6]

Sample 3:
Random Sample Index: 117753
Input Features: [[-0.0781  0.0315  0.75   -0.0469 -0.5156 -0.9531  7.      7.      7.    ]]
Actual Label: 7
Predicted Label: [7]

Sample 4:
Random Sample Index: 10089
Input Features: [[-0.9891  0.0652  0.0114 -0.2324  0.0286  0.9715  6.      6.      6.    ]]
Actual Label: 6
Predicted Label: [6]

Sample 5:
Random Sample Index: 72172
Input Features: [[-0.9934 -0.0092  0.0687 -0.3302 -0.0161  0.9512  6.      6.      6.    ]]
Actual Label: 6
Predicted Label: [6]

Sample 6:
Random Sample Index: 57532
Input Features: [[-1.0002 -0.043   0.2426 -0.8434  0.2729  0.4986  8.      8.      8.    ]]
Actual Label: 8
Predicted Label: [

In [6]:
# Hyperparameter tuning using GridSearchCV with limited parallel jobs.
# The grid below has 2 * 2 * 3 * 2 = 24 candidates; with cv=3 that is 72
# random-forest fits.
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20],
    'criterion': ['gini', 'entropy']
}

# The previous run of this cell on the full training set was interrupted
# (presumably because it took too long), so the search runs on a stratified
# subsample by default. Set TUNING_FRACTION = 1.0 to search on all of X_train.
TUNING_FRACTION = 0.1
if TUNING_FRACTION < 1.0:
    # stratify keeps the class proportions of y_train in the subsample
    X_tune, _X_rest, y_tune, _y_rest = train_test_split(
        X_train, y_train, train_size=TUNING_FRACTION, stratify=y_train, random_state=42
    )
    del _X_rest, _y_rest  # the unused remainder is not needed; drop the references
else:
    X_tune, y_tune = X_train, y_train

# GridSearchCV clones the estimator for every fit, so passing the already
# fitted rf_classifier only supplies its constructor parameters.
# verbose=1 prints the number of candidates/fits so progress is visible.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=3, verbose=1)
grid_search.fit(X_tune, y_tune)
# NOTE: with the default refit=True, grid_search.best_estimator_ is trained on
# the tuning subsample only; retrain with best_params_ on the full X_train for
# a final model.

print("\nBest Parameters from GridSearchCV:")
print(grid_search.best_params_)

KeyboardInterrupt: 