In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from dtaidistance import dtw
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [1]:
def read_csv_files(datafile_path, queryfile_path):
    """Load the data table and the query table from CSV.

    The data file is parsed with pandas' default header handling, while the
    query file is parsed with ``header=None`` so that its first line is kept
    as a data row and its columns are numbered.

    Args:
        datafile_path: Path (or file-like object) of the data CSV.
        queryfile_path: Path (or file-like object) of the query CSV.

    Returns:
        tuple: ``(datafile, queryfile)`` as two pandas DataFrames.
    """
    # The data file is read first, then the header-less query file.
    return pd.read_csv(datafile_path), pd.read_csv(queryfile_path, header=None)

In [2]:
# Define a function which z-normalizes the data
def z_normalize(datafile):
    """Z-normalize every column of ``datafile`` with a ``StandardScaler``.

    Args:
        datafile: DataFrame to normalize. Every column is handed to the
            scaler, so the columns are presumably all numeric (not checked
            here).

    Returns:
        tuple: ``(z_normalized_df, stats_df)`` where

        * ``z_normalized_df`` holds the transformed values under the same
          column labels and the same row index as ``datafile``;
        * ``stats_df`` is indexed by column name and holds the fitted
          scaler's ``mean_`` ('Mean') and ``scale_`` ('Standard Deviation').
    """
    scaler = StandardScaler()

    # Carry the original index over: without it the result silently falls
    # back to a fresh RangeIndex and no longer lines up with the input rows.
    z_normalized_df = pd.DataFrame(
        scaler.fit_transform(datafile),
        columns=datafile.columns,
        index=datafile.index,
    )

    # Per-column statistics the scaler fitted and applied.
    stats_df = pd.DataFrame(
        {'Mean': scaler.mean_, 'Standard Deviation': scaler.scale_},
        index=datafile.columns,
    )

    return z_normalized_df, stats_df


In [3]:
# Define a function which minmax-normalizes the data
def minmax_normalize(datafile):
    """Min-max normalize every column of ``datafile`` with a ``MinMaxScaler``.

    Args:
        datafile: DataFrame to normalize. Every column is handed to the
            scaler, so the columns are presumably all numeric (not checked
            here).

    Returns:
        tuple: ``(minmax_normalized_df, minmax_stats_df)`` where

        * ``minmax_normalized_df`` holds the transformed values under the
          same column labels and the same row index as ``datafile``;
        * ``minmax_stats_df`` is indexed by column name and holds the fitted
          scaler's ``data_min_`` ('Minimum') and ``data_max_`` ('Maximum').
    """
    scaler = MinMaxScaler()

    # Carry the original index over: without it the result silently falls
    # back to a fresh RangeIndex and no longer lines up with the input rows.
    minmax_normalized_df = pd.DataFrame(
        scaler.fit_transform(datafile),
        columns=datafile.columns,
        index=datafile.index,
    )

    # Per-column extremes the scaler observed while fitting.
    minmax_stats_df = pd.DataFrame(
        {'Minimum': scaler.data_min_, 'Maximum': scaler.data_max_},
        index=datafile.columns,
    )

    return minmax_normalized_df, minmax_stats_df

In [4]:
def calculate_cosine_distances(datafile, queryfile):
    """Rank the rows of ``datafile`` by cosine distance to the query vector.

    Args:
        datafile: DataFrame whose rows are the candidate vectors. It is not
            modified.
        queryfile: DataFrame whose first row is used as the query vector; it
            must have as many values as ``datafile`` has columns.

    Returns:
        DataFrame: a new frame with the columns of ``datafile`` plus a
        'Cosine Distance' column, sorted by that column in ascending order.
        Row labels of ``datafile`` are preserved.
    """
    # The query is taken to be the first row of the query file.
    query_vector = queryfile.iloc[0].values

    # Walking the 2-D value array visits the same per-row vectors that
    # iterrows() yields, without constructing a Series for every row.
    distances = [cosine(row, query_vector) for row in datafile.to_numpy()]

    # assign() returns a new frame, so the caller's DataFrame is left
    # untouched (no defensive .copy() needed at the call site).
    ranked = datafile.assign(**{'Cosine Distance': distances})

    return ranked.sort_values(by='Cosine Distance')

def calculate_euclidean_distances(datafile, queryfile):
    """Rank the rows of ``datafile`` by Euclidean distance to the query vector.

    Args:
        datafile: DataFrame whose rows are the candidate vectors. It is not
            modified.
        queryfile: DataFrame whose first row is used as the query vector; it
            must have as many values as ``datafile`` has columns.

    Returns:
        DataFrame: a new frame with the columns of ``datafile`` plus a
        'Euclidean Distance' column, sorted by that column in ascending
        order. Row labels of ``datafile`` are preserved.
    """
    # The query is taken to be the first row of the query file.
    query_vector = queryfile.iloc[0].values

    # Walking the 2-D value array visits the same per-row vectors that
    # iterrows() yields, without constructing a Series for every row.
    distances = [euclidean(row, query_vector) for row in datafile.to_numpy()]

    # assign() returns a new frame, so the caller's DataFrame is left
    # untouched (no defensive .copy() needed at the call site).
    ranked = datafile.assign(**{'Euclidean Distance': distances})

    return ranked.sort_values(by='Euclidean Distance')

In [5]:
def main(datafile_path, queryfile_path):
    """Load the data and query CSV files and print the query table.

    All downstream analysis steps are currently disabled. They are kept
    below as ordinary comments (they used to be parked inside a bare
    triple-quoted string, which is an evaluated-and-discarded expression
    rather than a comment).

    Args:
        datafile_path: Path of the data CSV, forwarded to ``read_csv_files``.
        queryfile_path: Path of the query CSV, forwarded to ``read_csv_files``.
    """
    # Read the CSV files
    datafile, queryfile = read_csv_files(datafile_path, queryfile_path)
    print(queryfile)

    # ------------------------------------------------------------------
    # Disabled steps: uncomment a group to run it.
    # ------------------------------------------------------------------

    # Calculate the z-normalized data and print the mean and standard deviation
    # z_normalized_df, stats_df = z_normalize(datafile)
    # print("Z-normalized Datafile:")
    # print(z_normalized_df)
    # print("\nMean and Standard Deviation:")
    # print(stats_df)

    # Calculate the min-max normalized data and print the minimum and maximum values
    # minmax_normalized_df, minmax_stats_df = minmax_normalize(datafile)
    # print("MinMax-normalized Datafile:")
    # print(minmax_normalized_df)
    # print("\nMinimum and Maximum Values:")
    # print(minmax_stats_df)

    # Calculate cosine distances and sort the datafile
    # sorted_cosine_datafile = calculate_cosine_distances(datafile.copy(), queryfile)
    # print("Sorted Datafile by Cosine Distance:")
    # print(sorted_cosine_datafile)

    # Calculate Euclidean distances and sort the datafile
    # sorted_euclidean_datafile = calculate_euclidean_distances(datafile.copy(), queryfile)
    # print("\nSorted Datafile by Euclidean Distance:")
    # print(sorted_euclidean_datafile)

    # NOTE(review): calculate_dtw_cosine / calculate_dtw_euclidean are not
    # defined in the part of the notebook shown here; confirm they exist
    # before enabling the two groups below.

    # Calculate DTW distances by Cosine and sort the datafile
    # sorted_dtw_cosine_datafile = calculate_dtw_cosine(datafile.copy(), queryfile)
    # print("\nSorted Datafile by DTW Cosine Distance:")
    # print(sorted_dtw_cosine_datafile)

    # Calculate DTW distances by Euclidean and sort the datafile
    # sorted_dtw_euclidean_datafile = calculate_dtw_euclidean(datafile.copy(), queryfile)
    # print("\nSorted Datafile by DTW Euclidean Distance:")
    # print(sorted_dtw_euclidean_datafile)

if __name__ == "__main__":
    # Input locations: swap in your own data / query CSV paths here.
    datafile_path, queryfile_path = 'gt_2012.csv', 'qy_2013.csv'

    # Entry point: hand both paths to main().
    main(datafile_path, queryfile_path)

NameError: name 'pd' is not defined