In [32]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score


In [33]:
import pickle

# Load the dataset from the pickle file
with open('processed/processed_final_dataset.pkl', 'rb') as f:
    data = pickle.load(f)

X = data['features']  # Features
y = data['labels']    # Labels

In [34]:
X.shape

(2821051, 42)

In [35]:
# Assuming 'y' is a numpy array containing class labels
previous_label = None
label_intervals = []

# Iterate over each label and identify intervals
for i, label in enumerate(y):
    if label != previous_label:
        if previous_label is not None:
            label_intervals.append((start_index, i - 1, previous_label))
        start_index = i
        previous_label = label

# Append the last interval
label_intervals.append((start_index, len(y) - 1, y[-1]))

# Print label intervals
for interval in label_intervals:
    start_idx, end_idx, label = interval
    print(f"Class {label}: Indices [{start_idx}, {end_idx}]")


Class 0: Indices [0, 541264]
Class 5: Indices [541265, 541270]
Class 0: Indices [541271, 541271]
Class 5: Indices [541272, 541285]
Class 0: Indices [541286, 541500]
Class 5: Indices [541501, 541510]
Class 0: Indices [541511, 541512]
Class 5: Indices [541513, 541522]
Class 0: Indices [541523, 541579]
Class 5: Indices [541580, 541585]
Class 0: Indices [541586, 541592]
Class 5: Indices [541593, 541593]
Class 0: Indices [541594, 541594]
Class 5: Indices [541595, 541606]
Class 0: Indices [541607, 541652]
Class 5: Indices [541653, 541658]
Class 0: Indices [541659, 541682]
Class 5: Indices [541683, 541690]
Class 0: Indices [541691, 541694]
Class 5: Indices [541695, 541700]
Class 0: Indices [541701, 541775]
Class 5: Indices [541776, 541778]
Class 0: Indices [541779, 541779]
Class 5: Indices [541780, 541781]
Class 0: Indices [541782, 541782]
Class 5: Indices [541783, 541783]
Class 0: Indices [541784, 541808]
Class 5: Indices [541809, 541816]
Class 0: Indices [541817, 541834]
Class 5: Indices [5

In [36]:
def generate_sequences(data, labels, seq_length=10, seq_overlap=5):
    seqs = []
    seq_labels = []
    for i in range(0, len(data) - seq_length + 1, seq_length - seq_overlap):
        seq = data[i:i + seq_length]
        seq_label = labels[i:i + seq_length]
        # Use the most common label in the sequence as the sequence label
        most_common_label = np.bincount(seq_label).argmax()
        seqs.append(seq)
        seq_labels.append(most_common_label)
    return np.array(seqs), np.array(seq_labels)


# Generate sequences
sequences, seq_labels = generate_sequences(X, y)

In [37]:
seq_labels.shape

(564209,)

In [38]:
def select_balanced_test_set(sequences, seq_labels, samples_per_class=1000):
    unique_classes = np.unique(seq_labels)
    test_indices = []

    for cls in unique_classes:
        # Find indices where the sequence label is the class
        class_indices = np.where(seq_labels == cls)[0]
        # If there are fewer than required samples, take them all; otherwise, sample
        if len(class_indices) <= samples_per_class:
            test_indices.extend(class_indices)
        else:
            test_indices.extend(np.random.choice(class_indices, samples_per_class, replace=False))

    # Extract the sequences and labels for the test set
    test_seqs = sequences[test_indices]
    test_seq_labels = seq_labels[test_indices]
    return test_seqs, test_seq_labels

# Select balanced test set
test_X, test_y = select_balanced_test_set(sequences, seq_labels)


In [None]:
test_y.shape

In [39]:
def pad_sequences(seqs, seq_length):
    padded_seqs = []
    for seq in seqs:
        if len(seq) < seq_length:
            padded_seq = np.vstack((seq, np.zeros((seq_length - len(seq), seq.shape[1]))))
        else:
            padded_seq = seq
        padded_seqs.append(padded_seq)
    return np.array(padded_seqs)

In [40]:
# Pad sequences to ensure uniformity
test_X = pad_sequences(test_X, seq_length=10)

# Normalize the test data
# Assuming 'train_seqs' was your training dataset used to fit the normalization parameters
#mean = np.mean(train_seqs, axis=(0, 1))
#std = np.std(train_seqs, axis=(0, 1))
mean = np.array([-0.30251599, 0.43214656, -0.0071984, -0.040559, -0.13262795, -0.30594404,
                 1.01610603, -0.57040977, 0.34489319, 0.68380633, 0.01809874, -0.04045581,
                 0.00344279, 0.31909552, -0.16974303, 0., -0.01056754, 0., 
                 0.00124247, 0.00163327, 0.09341123, 0.06387345, -0.62513747, 0.39036366,
                 -0.01559586, 0.34854081, 0.26082964, -0.30246426, -0.09879487, 0., 
                 0., 0., 0., 0., 0., 0.01556358, -0.21182333, 0.00255652, -0.01398662,
                 -0.05906157, -0.06749402, 0.20718232])

std = np.array([4.47656907e-01, 1.25963045e+00, 4.10815829e-03, 3.17013314e-02,
                2.28254224e-01, 5.82985819e-02, 1.77483654e+00, 2.00166998e-01,
                1.22776363e+00, 1.53183995e+00, 1.29877994e+00, 9.30325112e-01,
                8.03830508e-01, 1.76526422e+00, 4.87879356e-01, 1e-7,  # Replaced 0 with a small value
                7.60988494e-14, 1e-7,  # Replaced 0 with a small value
                4.58429628e-06, 7.96110689e-05, 1.22611589e+00, 1.00794963e+00,
                6.92865015e-02, 1.67553342e+00, 9.10348186e-14, 1.09050881e+00,
                1.06688757e+00, 2.65669909e-01, 8.67575085e-01, 1e-7,  # Replaced 0 with a small value
                1e-7, 1e-7, 1e-7, 1e-7, 1e-7, 8.03441259e-01,
                1.72948384e-01, 6.72715809e-06, 7.71187226e-01, 7.92794469e-01,
                6.02577562e-01, 1.82455050e+00])


std[std == 0] = 1  # Avoid division by zero

normalized_test_X = (test_X - mean) / std

# Convert labels to categorical for the model
test_y = to_categorical(test_y , 8)  # Adjust label indices and set number of classes



In [None]:
print("test_X shape: " , test_X.shape)
print("\ntest_y shape: " , test_y.shape)


In [None]:
test_y[501]

In [41]:
from tensorflow.keras.models import load_model

# Load the saved best binary classifier model
model_binary = load_model('models/best_binary_model_42features.keras')


In [42]:
def sample_predict_evaluate(test_X, test_y, model_binary, num_samples=100):
    # Ensure that test_y is in the expected shape: (n_samples, n_classes)
    print("Shape of test_y before sampling:", test_y.shape)
    
    # Randomly select sequences and their corresponding labels
    indices = np.random.choice(test_X.shape[0], num_samples, replace=False)
    sampled_sequences = test_X[indices]
    sampled_labels = test_y[indices]
    #print(sampled_labels)
    
    # List to hold predictions and actual labels for calculating accuracy
    predictions = []
    true_labels = []

    # Process each sequence
    for sequence, one_hot_labels in zip(sampled_sequences, sampled_labels):
        print(sampled_labels)
        # Predict using the binary model for each point in the sequence
        pred_probs = model_binary.predict(sequence)  # shape should be (10, 1) if model expects (1, num_features)
        
        #print("predicted prb", pred_probs.shape)
        # Calculate the average prediction for the sequence
        avg_pred = np.mean(pred_probs)
        predicted_label = 1 if avg_pred > 0.50 else 0
        
        predictions.append(predicted_label)
        
        # Make sure we handle labels correctly
        if one_hot_labels.ndim == 1:
            true_label = np.argmax(one_hot_labels)
        else:
            true_label = np.argmax(np.bincount(np.argmax(one_hot_labels, axis=1)))
        
        if true_label > 0 :
            true_label = 1
        
        
        true_labels.append(true_label)
        
        # Print both the predicted and the true label
        print(f"Predicted average: {avg_pred:.4f}, Predicted label: {predicted_label}, True label: {true_label}")

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print(f"\nAccuracy of the model on the sampled data: {accuracy:.4%}")

    return predictions, true_labels, accuracy

# Example usage
predictions, true_labels, accuracy = sample_predict_evaluate(test_X, test_y, model_binary, num_samples=10)


Shape of test_y before sampling: (6150, 8)
[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
predicted prb (10, 1)
Predicted average: 1.0000, Predicted label: 1, True label: 1
[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 

In [None]:
def print_and_predict_random_point(X, y, model):
    # Select a random index from your dataset
    random_index = np.random.randint(0, X.shape[0])
    
    # Extract the data point and the corresponding label
    random_data_point = X[random_index]
    actual_label = y[random_index]
    
    # Print the actual label
    print("Actual Label:", actual_label)
    
    # Reshape the data point if necessary (depends on model input shape)
    # For example, if your model expects data in the shape (1, num_features):
    random_data_point_reshaped = random_data_point.reshape(1, *random_data_point.shape)
    
    # Predict using the model
    prediction = model.predict(random_data_point_reshaped)
    
    # If your model is a classifier with softmax, for example, find the class with the highest probability
    if prediction.shape[-1] > 1:  # Multi-class classification
        predicted_label = np.argmax(prediction, axis=-1)
    else:  # Binary classification
        predicted_label = (prediction > 0.5).astype(int)
    
    # Print the predicted label
    print("Predicted Label:", predicted_label)

# Assuming X_test and y_test are your data and labels



In [None]:
print_and_predict_random_point(test_X, test_y, model_binary)