In [99]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tslearn.utils import to_time_series_dataset


In [102]:
data_dir = '../../data/raw/data/'
label_file = '../../data/raw/class_labels.csv'
files_with_4000_rows = []

max_rows=0
df_labels = pd.read_csv(label_file)
label_map = dict(zip(df_labels['File'], df_labels['Label']))
for fname in os.listdir(data_dir):
    if fname.endswith('.csv'):
        file_path = os.path.join(data_dir, fname)
        try:
            df = pd.read_csv(file_path)
            num_rows = len(df)
            if num_rows<=4000:
                files_with_4000_rows.append((fname,num_rows))
        except Exception as e:
            print(f"Error reading {fname}: {e}")

# Print the result
# files_with_4000_rows=sorted(files_with_4000_rows,key=lambda x: x[1])
# print("\nFiles with <=4000 rows:")
# for (f,nr) in files_with_4000_rows:
#     if f!='1991-07-01_19-14.csv':
#         print(f,nr)
#         print(label_map[f])
# Optional: print total count
print(f"\nTotal files with <=4000 rows: {len(files_with_4000_rows)}, max_rows{max_rows}")



Total files with <=4000 rows: 2846, max_rows0


In [157]:
label_file = '../../data/raw/class_labels.csv'
data_dir = '../../data/raw/data/'
X = []
y = []
cols_to_keep = ['p3_flux_ic'] #, 'p5_flux_ic', 'p7_flux_ic', 'long'
timestamps_list=[]
filenames = []

for (fname, nr) in files_with_4000_rows:
    if fname.endswith('.csv') and fname in label_map and fname != '2005-09-07_14-25.csv':
        file_path = os.path.join(data_dir, fname)
        ts_df = pd.read_csv(file_path)

        ts_selected = ts_df.reindex(columns=cols_to_keep, fill_value=np.nan)
        ts_data = ts_selected.astype(float).values
        X.append(ts_data)
        y.append(label_map[fname])
        filenames.append(fname)  # Track filename
X = to_time_series_dataset(X)  # will pad with NaN if needed
X[np.isnan(X)] = 0
y = np.array(y)

In [158]:
# First split
X_temp, X_test, y_temp, y_test, filenames_temp, filenames_test = train_test_split(
    X, y, filenames, test_size=0.33, random_state=42, stratify=y
)
# Second split
X_train, X_val, y_train, y_val, filenames_train, filenames_val = train_test_split(
    X_temp, y_temp, filenames_temp, test_size=0.5, random_state=42, stratify=y_temp
)


In [159]:
X_train_ts=to_time_series_dataset(X_train)
X_test_ts=to_time_series_dataset(X_test)
X_val_ts=to_time_series_dataset(X_val)
np.save("X_train_ts.npy", X_train_ts)
np.save("X_test_ts.npy", X_test_ts)
np.save("X_val_ts.npy", X_val_ts)
print(X_train_ts.shape,X_test_ts.shape,X_val_ts.shape)

(953, 3926, 1) (939, 3926, 1) (953, 3926, 1)


In [160]:
filename_to_split_index = {}

for i, fname in enumerate(filenames_train):
    filename_to_split_index[fname] = ("train", i)

for i, fname in enumerate(filenames_val):
    filename_to_split_index[fname] = ("val", i)

for i, fname in enumerate(filenames_test):
    filename_to_split_index[fname] = ("test", i)

X_splits = {
    'train': X_train,
    'val': X_val,
    'test': X_test
}
y_splits = {
    'train': y_train,
    'val': y_val,
    'test': y_test
}



In [161]:
# === Get the Dataset and Index of the Query timeseries === #
def get_split_and_index(filename):
    return [filename_to_split_index.get(filename, ("not_found", -1)),label_map[filename]]


In [162]:
import joblib
from tslearn.neighbors import KNeighborsTimeSeries, KNeighborsTimeSeriesClassifier
model = joblib.load('../../models/KNN_1_TS__classifier.pkl')

In [172]:
model._X_fit = X_test_ts  # manually override it

In [173]:
# === Get the Query Instance === #
query_ts_filename= '2012-06-13_10-29.csv'
print(get_split_and_index(query_ts_filename)[0])

('test', 862)


In [None]:
import numpy as np
X_test_ts=np.load("X_test_ts.npy")
# Replace NaNs with zeros
X_test_ts = np.nan_to_num(X_test_ts, nan=0.0)
y_pred = model.predict(X_test_ts)
print("Predicted label:", y_pred[0])



In [128]:
# dataset,q_idx = get_split_and_index(query_ts_filename)[0]
X_test = to_time_series_dataset(X_test)
X_test_filled_with_0= np.nan_to_num(X_test, nan=0.0)
y_pred = model.predict(X_test_filled_with_0)
print(f"Label of Query Instance: {y_test[862]}, Predicted label of Query Instance: {y_pred[862]}" )

ValueError: Dimensions of the provided timeseries(except first) must match those of the fitted data! ((953, 953) and (939, 3926) are passed shapes)

In [93]:
from tslearn.neighbors import KNeighborsTimeSeries, KNeighborsTimeSeriesClassifier
#finding the nearest unlike neighbour. NB will need to account for regularization
def native_guide_retrieval(query, predicted_label, distance, n_neighbors):

    df = pd.DataFrame(y_train, columns = ['label'])
    df.index.name = 'index'
    #df[df['label'] == 1].index.values, df[df['label'] != 1].index.values

    ts_length = X_train.shape[1]

    knn = KNeighborsTimeSeries(n_neighbors=n_neighbors, metric = distance)

    knn.fit(X_train[list(df[df['label'] != predicted_label].index.values)])

    dist,ind = knn.kneighbors(query.reshape(1,ts_length), return_distance=True)
    return dist[0], df[df['label'] != predicted_label].index[ind[0][:]]

In [67]:
print(model._X_fit.shape)


(953, 953)


In [51]:
# For KNN neighbors
knn = KNeighborsTimeSeries(n_neighbors=1, metric='dtw').fit(X_train)
distances, indices = knn.kneighbors(x_query, return_distance=True)
print("Nearest neighbor distance:", distances)
print("Nearest neighbor index:", indices)


x_query shape before reshape: (961, 1)
x_query shape after reshape: (961, 1)


ValueError: Dimensions of the provided timeseries(except first) must match those of the fitted data! ((953, 953) and (961, 1) are passed shapes)

In [45]:
# ===Find the nearest neighbour of the query instance with smallest DTW=== #
knn = KNeighborsTimeSeries(n_neighbors=1, metric='dtw').fit(X_train)
knn.kneighbors(x_query.reshape(1,-1), return_distance=True)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (953,) + inhomogeneous part.

In [None]:
# ===Find the nearest neighbour of the query instance with opposite class(Native Guide)=== #
native_guide_retrieval(X_test[12], y_pred[12], 'dtw', 1)