In [1]:
# Import necessary libraries
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
#assuming we are in Q2 directory , we go to the parent folder
folder_path = os.path.join('..', 'new_data')

# Get all the CSV files from the folder
csv_files = glob.glob(folder_path + '/*.csv')

all_data=[]
# Load each CSV file in one dataframe
for idx,csv_file in enumerate(csv_files):
    # Load the CSV file into a dataframe
    df = pd.read_csv(csv_file)
    
    # Append the DataFrame to the list
    all_data.append(df)

# Combine all data into a single DataFrame
combined_data = pd.concat(all_data)
#combined_data=combined_data.iloc[::10]

In [3]:
# Create lag features
combined_data['Activity_Lag1'] = combined_data['label'].shift(1)
combined_data['Activity_Lag2'] = combined_data['label'].shift(2)
combined_data['Activity_Lag3'] = combined_data['label'].shift(3)
# Drop rows with NaN values resulting from the shift
combined_data.dropna(inplace=True) #drops 3 first rows of data

# Select relevant features and target variable
features = combined_data.drop(['label', 'timestamp'], axis=1)
target = combined_data['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
 

In [4]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
 
# Train the classifier
rf_classifier.fit(X_train, y_train)
 
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)
 

In [5]:

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
 
# Print the results
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)

Accuracy: 0.99

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     23921
           1       1.00      1.00      1.00      5786
           2       0.91      0.90      0.91      5036
           3       0.97      0.97      0.97      1498
           4       0.95      0.95      0.95      1349
           5       0.97      0.98      0.98     14889
           6       1.00      1.00      1.00     58181
           7       1.00      1.00      1.00      8605
           8       1.00      1.00      1.00      7891
           9       0.99      0.99      0.99      1089
          10       0.98      0.99      0.98       830
          11       0.99      0.97      0.98       153

    accuracy                           0.99    129228
   macro avg       0.98      0.98      0.98    129228
weighted avg       0.99      0.99      0.99    129228



In [None]:
# Hyperparameter tuning using GridSearchCV with limited parallel jobs
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=3)
grid_search.fit(X_train, y_train)

print("\nBest Parameters from GridSearchCV:")
print(grid_search.best_params_)