In [1]:
# Import necessary libraries
import os
import glob
import time
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import numpy as np

In [2]:
#assuming we are in Q2 directory , we go to the parent folder
folder_path = os.path.join('..', 'new_data')

# Get all the CSV files from the folder
csv_files = glob.glob(folder_path + '/*.csv')

all_data=[]
# Load each CSV file in one dataframe
for idx,csv_file in enumerate(csv_files):
    # Load the CSV file into a dataframe
    df = pd.read_csv(csv_file)
    
    # Append the DataFrame to the list
    all_data.append(df)

# Combine all data into a single DataFrame
combined_data = pd.concat(all_data)
#combined_data=combined_data.iloc[::10]

In [3]:
# Create lag features
combined_data['Activity_Lag'] = combined_data['label'].shift(1)

# Drop rows with NaN values resulting from the shift
combined_data.dropna(inplace=True) #drops 1 first rows of data

# Select relevant features and target variable
features = combined_data.drop(['label', 'timestamp'], axis=1).values
target = combined_data['label'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
 

In [4]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

start=time.time() #start time of the training

# Train the classifier
rf_classifier.fit(X_train, y_train)

end=time.time() #calculating time
print(f'Training completed in : {end-start}')

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)
print(f'Prediction completed in: {time.time()-end}') 

Training completed in : 44.52234888076782
Prediction completed in: 0.7811682224273682


In [5]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.99

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     24235
           1       1.00      1.00      1.00      5758
           2       0.90      0.90      0.90      5057
           3       0.97      0.97      0.97      1533
           4       0.95      0.96      0.96      1381
           5       0.97      0.97      0.97     14826
           6       1.00      1.00      1.00     57972
           7       1.00      1.00      1.00      8523
           8       1.00      1.00      1.00      7848
           9       0.99      0.99      0.99      1105
          10       0.97      0.98      0.98       841
          11       1.00      0.98      0.99       150

    accuracy                           0.99    129229
   macro avg       0.98      0.98      0.98    129229
weighted avg       0.99      0.99      0.99    129229

Confusion Matrix:
[[23936     8   127    25    39    83    14     1     2     0     0     0]


In [6]:
num_samples = 10

# Get 10 random indices from the x_test data set
random_indices = np.random.randint(0, len(X_test), size=num_samples)

for i, random_index in enumerate(random_indices):
    random_sample = X_test[random_index].reshape(1, -1)  # Reshape to match model's expected input shape
    actual_label = y_test[random_index]
    
    # Make the prediction
    predicted_label = rf_classifier.predict(random_sample)
    
    # Display the random sample and the label prediction
    print(f'\nSample {i+1}:')
    print(f'Random Sample Index: {random_index}')
    print(f'Input Features: {random_sample}')
    print(f'Actual Label: {actual_label}')
    print(f'Predicted Label: {predicted_label}')


Sample 1:
Random Sample Index: 79035
Input Features: [[-0.1252  0.9498 -0.336  -0.2593 -0.9437  0.1818  7.    ]]
Actual Label: 7
Predicted Label: [7]

Sample 2:
Random Sample Index: 12847
Input Features: [[-0.6438  0.0319  0.7007 -1.8543 -0.6426 -0.6248 10.    ]]
Actual Label: 10
Predicted Label: [10]

Sample 3:
Random Sample Index: 41713
Input Features: [[-0.9758 -0.1824  0.0674 -0.3356 -0.1609  0.919   6.    ]]
Actual Label: 6
Predicted Label: [6]

Sample 4:
Random Sample Index: 89511
Input Features: [[-0.9851  0.0617 -0.3128 -0.0987 -0.2261  0.9723  6.    ]]
Actual Label: 6
Predicted Label: [6]

Sample 5:
Random Sample Index: 35901
Input Features: [[-1.0162e+00  4.0000e-04 -2.3380e-01  5.9000e-03  3.8050e-01  9.3170e-01
   6.0000e+00]]
Actual Label: 6
Predicted Label: [6]

Sample 6:
Random Sample Index: 119081
Input Features: [[-4.7860e-01  1.4000e-03 -8.7520e-01  8.6300e-02  4.5800e-02  1.0044e+00
   6.0000e+00]]
Actual Label: 6
Predicted Label: [6]

Sample 7:
Random Sample Index:

In [7]:
# 10-Fold Cross validation
print(np.mean(cross_val_score(rf_classifier, X_train, y_train, cv=10, n_jobs=5)))

0.9891103299997729


In [9]:
import scikitplot as skplt
import matplotlib.pyplot as plt

skplt.metrics.plot_roc_curve(y_test, y_pred)
plt.show()

ImportError: cannot import name 'interp' from 'scipy' (/home/mira/Documents/uni/12th/data_mining/Data_Mining_project/myvenv/lib/python3.10/site-packages/scipy/__init__.py)