# unzip all files

In [None]:
import os
import shutil

def unzip_all(main_path, current_path=None):
    """
    Recursively extract every ZIP archive found under ``main_path``.

    Each archive ``<name>.zip`` is unpacked into a sibling directory ``<name>``
    and that directory is then scanned as well, so archives nested inside
    other archives are extracted too. The archives themselves are kept.
    The extension check is case-insensitive.

    Args:
        main_path: Root directory of the search. Used as the starting point
            when ``current_path`` is None and forwarded on recursion.
        current_path: Directory scanned by this call; defaults to ``main_path``.
    """
    if current_path is None:
        current_path = main_path

    # os.listdir returns a snapshot, so directories created by the extractions
    # below are not picked up by this loop; the explicit recursive call after
    # each extraction scans them instead.
    for item in os.listdir(current_path):
        item_path = os.path.join(current_path, item)

        if os.path.isdir(item_path):
            unzip_all(main_path, item_path)
        elif item.lower().endswith('.zip'):
            # Extract next to the archive, into a folder named after it
            extract_to = os.path.splitext(item_path)[0]

            print(f"Unzipping: {item_path}")
            # The format is given explicitly so that archives with an
            # upper-case extension (e.g. "DATA.ZIP") are handled as well
            # instead of relying on extension-based format detection.
            shutil.unpack_archive(item_path, extract_to, 'zip')

            # Look for nested archives in what was just extracted
            unzip_all(main_path, extract_to)

# Define the main path where your primary ZIP file is located
MAIN_PATH = r"C:\Users\shafi\Downloads\doi_10_5061_dryad_5hqbzkh6f__v20210917"

# unzip_all scans MAIN_PATH itself, so the primary ZIP sitting directly under
# it is extracted by this single call, followed by every archive nested inside
# the extracted folders. Extracting the primary ZIP separately first would
# unpack it a second time and repeat the full scan once per top-level archive.
unzip_all(MAIN_PATH)

print("Completed unzipping all files.")


#Combine each csv for each signal from files

In [None]:
import os
import pandas as pd

# Paths configuration
DATA_PATH = r"C:\Users\shafi\Downloads\doi_10_5061_dryad_5hqbzkh6f__v20210917\Stress_dataset\CE"
SAVE_PATH = r"C:\Users\shafi\Downloads\doi_10_5061_dryad_5hqbzkh6f__v20210917\Stress_dataset\CE\processed_data1"

# Make sure the output folder is there before anything is written to it
os.makedirs(SAVE_PATH, exist_ok=True)

# Column layout of each combined CSV, keyed by signal name
final_columns = dict(
    ACC=['id', 'X', 'Y', 'Z', 'datetime'],
    EDA=['id', 'EDA', 'datetime'],
    HR=['id', 'HR', 'datetime'],
    TEMP=['id', 'TEMP', 'datetime'],
)

# One empty accumulator frame per signal, carrying that signal's columns
signal_dataframes = {}
for signal_name, column_names in final_columns.items():
    signal_dataframes[signal_name] = pd.DataFrame(columns=column_names)

# Function to process and return a dataframe from a CSV file, adding ID and datetime
def process_signal_file(filepath, file_id, signal):
    """
    Load one raw signal CSV and return its samples with ``id`` and ``datetime`` columns.

    The file is read without a header. Row 0, column 0 is taken as the
    recording start (interpreted as epoch seconds) and row 1, column 0 as the
    sample rate; every remaining row is a sample. Sample ``k`` (counting from
    0) is stamped ``start + k / sample_rate`` seconds.

    Args:
        filepath: Path of the CSV file to read.
        file_id: Value stored in the ``id`` column of every returned row.
        signal: Key into ``final_columns`` selecting the sample column names.

    Returns:
        DataFrame holding the sample columns followed by ``id`` and ``datetime``.
    """
    raw = pd.read_csv(filepath, header=None)
    start_timestamp, sample_rate = raw.iloc[0, 0], raw.iloc[1, 0]

    # Renumber the samples from 0: without this the index still starts at 2
    # (the two header rows), which shifts every timestamp by 2 / sample_rate.
    # reset_index also returns a new frame, so the column assignments below do
    # not write into a slice of ``raw``.
    df = raw.iloc[2:].reset_index(drop=True)
    df.columns = final_columns[signal][1:-1]  # Exclude 'id' and 'datetime'
    df['id'] = file_id
    df['datetime'] = pd.to_datetime(start_timestamp, unit='s') + pd.to_timedelta(df.index / sample_rate, unit='s')
    return df

# Walk through the dataset directory and process each signal CSV file
# Frames are gathered per signal and concatenated once at the end; calling
# pd.concat inside the loop would re-copy all previously collected rows on
# every file.
frames_by_signal = {signal: [] for signal in final_columns}
for root, dirs, files in os.walk(DATA_PATH):
    for directory in dirs:
        dir_path = os.path.join(root, directory)
        file_id = directory  # Assuming the directory name is the ID
        for signal in final_columns:
            signal_filepath = os.path.join(dir_path, f"{signal}.csv")
            if os.path.isfile(signal_filepath):
                frames_by_signal[signal].append(process_signal_file(signal_filepath, file_id, signal))

for signal, frames in frames_by_signal.items():
    # Signals for which no file was found keep their empty, header-only frame
    if frames:
        # Selecting by final_columns puts the columns in the declared order ('id' first)
        signal_dataframes[signal] = pd.concat(frames)[final_columns[signal]]

# Save the combined dataframes to CSV files
for signal, df in signal_dataframes.items():
    save_filepath = os.path.join(SAVE_PATH, f"combined_{signal.lower()}.csv")
    df.to_csv(save_filepath, index=False)

print('All CSV files have been combined and saved.')


In [None]:
#code to combine all signal csv into one for each nurse

import pandas as pd
import os
import multiprocessing

COMBINED_DATA_PATH = "/content/drive/MyDrive/Stress_dataset/E4/processed_data1"
SAVE_PATH = "/content/drive/MyDrive/Stress_dataset/E4/processed_data1/merged"

# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, so no separate existence check is needed beforehand.
os.makedirs(SAVE_PATH, exist_ok=True)

print("Setting up ...")

# NOTE(review): these two lists are not referenced by the functions visible in
# this cell — confirm whether they are still needed.
signals = ['acc', 'eda', 'hr', 'temp']
columns=['X', 'Y', 'Z', 'EDA', 'HR', 'TEMP', 'id', 'datetime']

def optimize_dtype(df):
    """
    Shrink 64-bit numeric columns of ``df`` to 32-bit to reduce memory use.

    float64 columns become float32. int64 columns become int32 only when all
    of their values fit in the int32 range; columns holding larger values are
    left as int64 because the narrower type cannot represent them.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')

    int32_min, int32_max = -2**31, 2**31 - 1
    for col in df.select_dtypes(include=['int64']).columns:
        values = df[col]
        # An empty column has no min/max to compare, and is always safe to narrow
        if values.empty or (values.min() >= int32_min and values.max() <= int32_max):
            df[col] = values.astype('int32')
    return df

def read_and_merge(signal):
    """
    Load ``combined_<signal>.csv`` from COMBINED_DATA_PATH as one DataFrame.

    The file is read in pieces of 10000 rows, each piece is passed through
    ``optimize_dtype``, and the pieces are concatenated with a fresh index.
    The 'id' column is read as strings.
    """
    print(f"Reading and preprocessing {signal} data...")
    csv_path = os.path.join(COMBINED_DATA_PATH, f"combined_{signal}.csv")
    # Adjust chunksize based on your dataset and memory capacity
    chunk_reader = pd.read_csv(csv_path, dtype={'id': str}, chunksize=10000)
    return pd.concat([optimize_dtype(piece) for piece in chunk_reader], ignore_index=True)

def merge_data(id, acc, eda, hr, temp):
    """
    Join the four signal tables of one recording ID on their 'datetime' column.

    Args:
        id: Value of the 'id' column selecting the rows to merge.
        acc, eda, hr, temp: DataFrames with an 'id' column, a 'datetime'
            column and that signal's value column(s).

    Returns:
        DataFrame containing the outer join of the four filtered tables (the
        'id' column is taken from ``acc``), with gaps filled forward first
        and then backward.
    """
    print(f"Merging data for ID: {id}")
    acc_id = acc[acc['id'] == id]
    eda_id = eda[eda['id'] == id].drop(['id'], axis=1)
    hr_id = hr[hr['id'] == id].drop(['id'], axis=1)
    temp_id = temp[temp['id'] == id].drop(['id'], axis=1)

    df = acc_id.merge(eda_id, on='datetime', how='outer')
    df = df.merge(temp_id, on='datetime', how='outer')
    df = df.merge(hr_id, on='datetime', how='outer')

    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    # Forward fill carries the last known value onward; the backward fill then
    # covers rows before a signal's first value.
    return df.ffill().bfill()

# Combined signal frames already loaded by this process, keyed by signal name
_SIGNAL_FRAMES = {}


def _cached_signal_frame(signal):
    """Return the combined frame for ``signal``, reading it at most once per process."""
    if signal not in _SIGNAL_FRAMES:
        _SIGNAL_FRAMES[signal] = read_and_merge(signal)
    return _SIGNAL_FRAMES[signal]


def process_id(id):
    """
    Build the merged table for one recording ID.

    The four combined signal files do not depend on ``id``, so they are
    loaded through a per-process cache instead of being re-read from disk on
    every call.

    Args:
        id: Value of the 'id' column to merge.

    Returns:
        The DataFrame produced by ``merge_data`` for this ID.
    """
    acc = _cached_signal_frame('acc')
    eda = _cached_signal_frame('eda')
    hr = _cached_signal_frame('hr')
    temp = _cached_signal_frame('temp')
    merged_df = merge_data(id, acc, eda, hr, temp)
    return merged_df

if __name__ == "__main__":
    # The IDs to process are taken from the 'id' column of the combined
    # accelerometer file; adjust based on your data structure
    sample_df = pd.read_csv(os.path.join(COMBINED_DATA_PATH, "combined_acc.csv"), usecols=['id'], dtype={'id': str})
    unique_ids = sample_df['id'].unique()

    # The context manager shuts the worker processes down even when a worker
    # raises, so a failed run does not leave the pool behind.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(process_id, unique_ids)

    final_df = pd.concat(results, ignore_index=True)
    print("Saving merged data...")
    final_df.to_csv(os.path.join(SAVE_PATH, "merged_data.csv"), index=False)


Setting up ...
Reading and preprocessing acc data...
Reading and preprocessing acc data...
Reading and preprocessing eda data...
Reading and preprocessing eda data...
Reading and preprocessing hr data...
Reading and preprocessing hr data...
Reading and preprocessing temp data...Reading and preprocessing temp data...

Merging data for ID: E4_1587318394Merging data for ID: E4_1587206108

Reading and preprocessing acc data...
Reading and preprocessing acc data...
Reading and preprocessing eda data...
Reading and preprocessing eda data...
Reading and preprocessing hr data...

Reading and preprocessing hr data...Reading and preprocessing temp data...Reading and preprocessing temp data...

Merging data for ID: E4_1587324331
Merging data for ID: E4_1587236732
Reading and preprocessing acc data...
Reading and preprocessing acc data...
Reading and preprocessing eda data...
Reading and preprocessing eda data...
Reading and preprocessing hr data...
Reading and preprocessing hr data...
Reading and

# Survey Excel formatting based on epoch timestamp
# Successful

In [None]:
#Timestamp based trying to convert to timestamp

import pandas as pd
from datetime import datetime, timedelta

# Load the Excel file
# Each spreadsheet row is expanded by process_row below, which reads its 'ID',
# 'date', 'Start time', 'End time', 'Stress level' and 'COVID related' columns.
excel_path = '/content/drive/MyDrive/Stress_dataset/SurveyResults.xlsx'  # Update this path
df = pd.read_excel(excel_path)

def process_row(row):
    """
    Expand one survey row into one output row per second of its time window.

    Args:
        row: Mapping or Series providing 'ID', 'date', 'Start time',
            'End time', 'Stress level' and 'COVID related'. 'Start time' and
            'End time' must expose ``hour``, ``minute`` and ``second``.

    Returns:
        DataFrame with columns 'ID_Timestamp' ("<ID>_<epoch seconds>"), 'ID',
        'Stress level' and 'COVID related'; one row per second from the start
        to the end of the window, both ends included.
    """
    # Ensure 'date' is a datetime object
    date = pd.to_datetime(row['date'])

    # Turn the two clock times into offsets from midnight of 'date'
    start_time, end_time = row['Start time'], row['End time']
    start_datetime = date + timedelta(hours=start_time.hour, minutes=start_time.minute, seconds=start_time.second)
    end_datetime = date + timedelta(hours=end_time.hour, minutes=end_time.minute, seconds=end_time.second)

    # NOTE(review): both ends use the same calendar date, so a window whose end
    # time is earlier than its start time yields no rows — confirm whether
    # windows crossing midnight occur in the survey.
    # Lower-case 's' is the seconds alias; the upper-case 'S' spelling is
    # deprecated in recent pandas releases.
    secondly_timestamps = pd.date_range(start=start_datetime, end=end_datetime, freq='s')

    # Integer timestamps divided down to whole seconds (assumes the index has
    # nanosecond resolution)
    epoch_timestamps = secondly_timestamps.astype('int64') // 10**9

    # The list sets the number of rows; the scalar values are repeated on each
    data = {
        'ID_Timestamp': [f"{row['ID']}_{ts}" for ts in epoch_timestamps],
        'ID': row['ID'],
        'Stress level': row['Stress level'],
        'COVID related': row['COVID related'],
        # Add other columns here as needed
    }

    return pd.DataFrame(data)

# Apply the function to each row in the DataFrame
processed_rows = []
for _, row in df.iterrows():
    processed_rows.append(process_row(row))

# Stack the per-row frames into one table with a fresh 0..n-1 index
expanded_df = pd.concat(processed_rows, ignore_index=True)

# Write the expanded table to disk
csv_output_path = '/content/drive/MyDrive/Stress_dataset/output_csv.csv'  # Update this path
expanded_df.to_csv(csv_output_path, index=False)

print(f'Data has been successfully processed and saved to {csv_output_path}')


Data has been successfully processed and saved to /content/drive/MyDrive/Stress_dataset/output_csv.csv


In [None]:
#updated version for "na" value in survey excel file
import pandas as pd
from datetime import datetime, timedelta

# Load the Excel file
# Each spreadsheet row is expanded by process_row below, which reads its 'ID',
# 'date', 'Start time', 'End time', 'Stress level' and 'COVID related' columns.
excel_path = '/content/drive/MyDrive/Stress_dataset/SurveyResults.xlsx'  # Update this path
df = pd.read_excel(excel_path)

def process_row(row):
    """
    Generate one output row per second between a survey row's start and end time.

    Args:
        row: Mapping or Series with 'ID', 'date', 'Start time', 'End time',
            'Stress level' and 'COVID related' entries. The two time entries
            must provide ``hour``, ``minute`` and ``second`` attributes.

    Returns:
        DataFrame with the columns 'ID_Timestamp' (the ID, an underscore and
        the epoch second), 'ID', 'Stress level' and 'COVID related'. Start and
        end second are both included.
    """
    day = pd.to_datetime(row['date'])

    def _at(clock):
        # Midnight of the survey day plus the clock time as an offset
        return day + timedelta(hours=clock.hour, minutes=clock.minute, seconds=clock.second)

    start_datetime = _at(row['Start time'])
    end_datetime = _at(row['End time'])

    # NOTE(review): an end time earlier than the start time gives an empty
    # range (no rows) because both are placed on the same day — confirm this
    # is acceptable for the survey data.
    # 's' replaces the deprecated upper-case 'S' seconds alias.
    secondly_timestamps = pd.date_range(start=start_datetime, end=end_datetime, freq='s')

    # Whole seconds since the epoch (assumes a nanosecond-resolution index)
    epoch_timestamps = secondly_timestamps.astype('int64') // 10**9

    data = {
        'ID_Timestamp': [f"{row['ID']}_{ts}" for ts in epoch_timestamps],
        'ID': row['ID'],
        'Stress level': row['Stress level'],
        'COVID related': row['COVID related'],
        # Add other columns here as needed
    }

    return pd.DataFrame(data)

# Apply the function to each row in the DataFrame
processed_rows = list(map(process_row, (row for _, row in df.iterrows())))

# Stack the per-row frames, then zero out missing answers: first the literal
# 'na' strings, then any remaining NaN values
expanded_df = (
    pd.concat(processed_rows, ignore_index=True)
    .replace('na', 0)
    .fillna(0)
)

# Export to CSV
csv_output_path = '/content/drive/MyDrive/Stress_dataset/output_csv_v2.csv'
expanded_df.to_csv(csv_output_path, index=False)

print(f'Data has been successfully processed and saved to {csv_output_path}')


Data has been successfully processed and saved to /content/drive/MyDrive/Stress_dataset/output_csv_v2.csv


# Dropping the extra columns and final merge with the nurses' combined data based on the ID_Timestamp column

In [None]:
import pandas as pd

# Load the processed data with 'ID_Timestamp' column
processed_data_path = '/content/drive/MyDrive/Stress_dataset/output_csv.csv'  # Update with the actual path
second_dataset_path = '/content/drive/MyDrive/Stress_dataset/15/processed_data1/merged/merged_data.csv'  # Update with the actual path
merged_dataset_path = '/content/drive/MyDrive/Stress_dataset/15/processed_data1/Timestamp_merged_dataset_v2.csv'  # Update with the desired save path

# Survey rows (one per second, keyed by 'ID_Timestamp') and merged sensor rows
processed_data = pd.read_csv(processed_data_path)
second_dataset = pd.read_csv(second_dataset_path)

# Inner join: the survey's 'ID_Timestamp' is matched against the sensor data's 'id'
merged_dataset = processed_data.merge(second_dataset, left_on='ID_Timestamp', right_on='id', how='inner')

# The two join keys are no longer needed; drop whichever of them are present
columns_to_drop = [col for col in ('ID_Timestamp', 'id') if col in merged_dataset.columns]
merged_dataset = merged_dataset.drop(columns=columns_to_drop)

# Save the merged and cleaned dataset
merged_dataset.to_csv(merged_dataset_path, index=False)

print(f'Merged and cleaned dataset saved to: {merged_dataset_path}')


Merged and cleaned dataset saved to: /content/drive/MyDrive/Stress_dataset/15/processed_data1/Timestamp_merged_dataset_v2.csv


# After merging everything Combining all csv into one

In [None]:
#Combine all nurse timestamp combined data csv to one

import pandas as pd
import os

# Directory where all CSV files are stored
csv_directory = '/content/drive/MyDrive/Stress_dataset/all csv'  # Update with the actual directory path

# List to hold data from each CSV file
dataframes = []

# os.listdir returns names in arbitrary order; sorting them keeps the row
# order of the combined file the same from run to run.
for filename in sorted(os.listdir(csv_directory)):
    if filename.endswith('.csv'):  # Ensure to process only CSV files
        file_path = os.path.join(csv_directory, filename)
        # low_memory=False makes pandas infer each column's type from the
        # whole file instead of per internal chunk, avoiding columns with
        # mixed types.
        df = pd.read_csv(file_path, low_memory=False)
        dataframes.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_csv_path = '/content/drive/MyDrive/Stress_dataset/all_combined_csv.csv'  # Update with the desired save path
combined_df.to_csv(combined_csv_path, index=False)

print(f'All CSV files have been combined and saved to {combined_csv_path}')


  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


All CSV files have been combined and saved to /content/drive/MyDrive/Stress_dataset/all_combined_csv.csv


In [None]:
# low_memory=False lets pandas infer each column's type from the whole file
# rather than chunk by chunk, so a column does not end up with mixed types.
train = pd.read_csv('/content/drive/MyDrive/Stress_dataset/all_combined_csv.csv', low_memory=False)
display(train)

  train = pd.read_csv('/content/drive/MyDrive/Stress_dataset/all_combined_csv.csv')


Unnamed: 0,ID,Stress level,COVID related,X,Y,Z,datetime,EDA,TEMP,HR
0,7E,0,0,-37.0,10.0,-51.0,2020-11-05 14:43:48.062500,0.000000,33.09,98.00
1,7E,0,0,-36.0,8.0,-53.0,2020-11-05 14:43:48.093750,0.000000,33.09,98.00
2,7E,0,0,-36.0,8.0,-55.0,2020-11-05 14:43:48.125000,0.000000,33.09,98.00
3,7E,0,0,-37.0,10.0,-55.0,2020-11-05 14:43:48.156250,0.000000,33.09,98.00
4,7E,0,0,-31.0,4.0,-55.0,2020-11-05 14:43:48.187500,0.000000,33.09,98.00
...,...,...,...,...,...,...,...,...,...,...
10008089,15,2,0,9.0,-78.0,15.0,2020-07-24 15:45:11,0.207538,33.15,93.63
10008090,15,2,0,9.0,-78.0,15.0,2020-07-24 15:45:12,0.207538,33.15,93.60
10008091,15,2,0,9.0,-78.0,15.0,2020-07-24 15:45:13,0.207538,33.15,93.55
10008092,15,2,0,9.0,-78.0,15.0,2020-07-24 15:45:14,0.207538,33.15,93.47


# First attempt: adding secondly timestamps to the survey data
# Not successful

In [None]:
#converting survey excel to secondly timestamp

import pandas as pd

# Load the Excel file
# Each spreadsheet row is expanded by process_row below, which reads its 'ID',
# 'date', 'Start time', 'End time', 'Stress level' and 'COVID related' columns.
excel_path = '/content/drive/MyDrive/Stress_dataset/SurveyResults.xlsx'  # Update this path
df = pd.read_excel(excel_path)

def process_row(row):
    """
    Expand one survey row into one output row per second of its time window.

    Args:
        row: Mapping or Series with 'ID', 'date', 'Start time', 'End time',
            'Stress level' and 'COVID related' entries. The two time entries
            must provide ``hour``, ``minute`` and ``second`` attributes.

    Returns:
        DataFrame with columns 'Secondly Timestamps', 'ID', 'Stress level'
        and 'COVID related'; one row per second, start and end included.
    """
    # Imported here because this cell does not import timedelta itself
    from datetime import timedelta

    date = pd.to_datetime(row['date'])
    start_time, end_time = row['Start time'], row['End time']

    # Combine 'date' with 'Start time' and 'End time' to create datetime objects
    start_datetime = date + timedelta(hours=start_time.hour, minutes=start_time.minute, seconds=start_time.second)
    end_datetime = date + timedelta(hours=end_time.hour, minutes=end_time.minute, seconds=end_time.second)

    # 's' replaces the deprecated upper-case 'S' seconds alias
    secondly_timestamps = pd.date_range(start=start_datetime, end=end_datetime, freq='s')

    # The timestamp index sets the number of rows; scalars repeat on each row
    data = {
        'Secondly Timestamps': secondly_timestamps,
        'ID': row['ID'],
        'Stress level': row['Stress level'],
        'COVID related': row['COVID related'],
        # You can add as many additional columns as needed
    }
    return pd.DataFrame(data)

# Apply the function to each row in the DataFrame
processed_rows = [process_row(survey_row) for survey_row in (pair[1] for pair in df.iterrows())]

# Stack the per-row frames into one table with a fresh 0..n-1 index
expanded_df = pd.concat(processed_rows, ignore_index=True)

# Write the expanded table to disk
csv_output_path = '/content/drive/MyDrive/Stress_dataset/output_csv.csv'  # Update this path
expanded_df.to_csv(csv_output_path, index=False)

print(f'Data has been successfully processed and saved to {csv_output_path}')


In [None]:
#extracting each nurse data from survey excel file

import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/Stress_dataset/survey_csv.csv'  # Adjust the file path as necessary
data = pd.read_csv(file_path)

# Every distinct value of the 'ID' column gets its own file
unique_ids = data['ID'].unique()

# NOTE: this string is prepended to each file name without a path separator,
# so files are written as ".../nurse_data<ID>_data.csv" next to each other
# rather than inside a "nurse_data" folder.
output_directory = '/content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_data'


def _nurse_csv_path(nurse_id):
    """Build the per-nurse CSV path for ``nurse_id``."""
    return f'{output_directory}{nurse_id}_data.csv'


# Write the rows of each nurse ID to that nurse's CSV file
for nurse_id in unique_ids:
    nurse_data_file_path = _nurse_csv_path(nurse_id)
    nurse_data = data.loc[data['ID'] == nurse_id]
    nurse_data.to_csv(nurse_data_file_path, index=False)
    print(f'Data for Nurse ID {nurse_id} saved to {nurse_data_file_path}')

# Read the per-nurse files back and stack them into a single DataFrame
combined_data_frames = [pd.read_csv(_nurse_csv_path(nurse_id)) for nurse_id in unique_ids]
combined_data = pd.concat(combined_data_frames, ignore_index=True)

# Optionally, save the combined data to a new CSV file
combined_data_file_path = '/content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/combined_nurse_data.csv'
combined_data.to_csv(combined_data_file_path, index=False)
print(f'Combined data saved to {combined_data_file_path}')


Data for Nurse ID 5C saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_data5C_data.csv
Data for Nurse ID E4 saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_dataE4_data.csv
Data for Nurse ID 7A saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_data7A_data.csv
Data for Nurse ID 94 saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_data94_data.csv
Data for Nurse ID CE saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_dataCE_data.csv
Data for Nurse ID 6B saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_data6B_data.csv
Data for Nurse ID 6D saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_data6D_data.csv
Data for Nurse ID 15 saved to /content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_data15_data.csv
Data for Nurse ID F5 saved to /content/drive/MyDrive/Stress_dataset/Surv

# merging sensor data with survey data by secondly timestamp

In [None]:
#sorting both dataset

import pandas as pd

# Define file paths for the input datasets
nurse_data_path = '/content/drive/MyDrive/Stress_dataset/Survey csv for each nurse/nurse_dataEG_data.csv'  # Update this path as needed
second_dataset_path = '/content/drive/MyDrive/Stress_dataset/EG/processed_data1/merged/merged_data.csv'  # Update this path as needed

# Where the sorted copies are written
output_nurse_data_sorted_path = '/content/drive/MyDrive/Stress_dataset/EG/nurse_dataEG_data_sorted.csv'
output_second_dataset_sorted_path = '/content/drive/MyDrive/Stress_dataset/EG/second_dataset_sorted.csv'

# Survey data: parse the timestamp column, order by it, write the sorted copy
nurse_data = pd.read_csv(nurse_data_path, parse_dates=['Secondly Timestamps'])
nurse_data_sorted = nurse_data.sort_values('Secondly Timestamps')
nurse_data_sorted.to_csv(output_nurse_data_sorted_path, index=False)
print(f"Nurse dataset sorted and saved to: {output_nurse_data_sorted_path}")

# Sensor data: same steps, ordered by its 'datetime' column
second_dataset = pd.read_csv(second_dataset_path, parse_dates=['datetime'])
second_dataset_sorted = second_dataset.sort_values('datetime')
second_dataset_sorted.to_csv(output_second_dataset_sorted_path, index=False)
print(f"Second dataset sorted and saved to: {output_second_dataset_sorted_path}")


Nurse dataset sorted and saved to: /content/drive/MyDrive/Stress_dataset/EG/nurse_dataEG_data_sorted.csv
Second dataset sorted and saved to: /content/drive/MyDrive/Stress_dataset/EG/second_dataset_sorted.csv


In [None]:
#merging both dataset after sorting

import pandas as pd

# Paths to the datasets
nurse_data_path = '/content/drive/MyDrive/Stress_dataset/E4/nurse_dataE4_data_sorted.csv'  # Update with the actual path
second_dataset_path = '/content/drive/MyDrive/Stress_dataset/E4/second_dataset_sorted.csv'  # Update with the sorted second dataset path
output_merged_dataset_path = '/content/drive/MyDrive/Stress_dataset/E4/merged_nurse_and_second_dataset.csv'  # Adjust the filename as needed

# Load both tables with their timestamp columns parsed as datetimes
nurse_data = pd.read_csv(nurse_data_path, parse_dates=['Secondly Timestamps'])
second_dataset = pd.read_csv(second_dataset_path, parse_dates=['datetime'])

# Order each table by its own timestamp column
nurse_data_sorted = nurse_data.sort_values('Secondly Timestamps')
second_dataset_sorted = second_dataset.sort_values('datetime')

# Keep only rows whose timestamps are exactly equal in both tables
merged_dataset = nurse_data_sorted.merge(
    second_dataset_sorted,
    left_on='Secondly Timestamps',
    right_on='datetime',
    how='inner',
)

# Save the merged dataset
merged_dataset.to_csv(output_merged_dataset_path, index=False)

print(f"Merged dataset saved to: {output_merged_dataset_path}")


Merged dataset saved to: /content/drive/MyDrive/Stress_dataset/E4/merged_nurse_and_second_dataset.csv


# validation

In [None]:
#checking datetime in each dataset if its exist

import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/Stress_dataset/5C/processed_data1/Timestamp_merged_dataset.csv'  # Adjust this to the path of your CSV file
data = pd.read_csv(file_path)

date_column = 'datetime'  # Column holding the timestamps to search
search_date = '15/04/2020'  # Day to look for, written as day/month/year

# Convert the date column to datetime format (if not already in datetime format)
data[date_column] = pd.to_datetime(data[date_column])

# An explicit format removes the day/month ambiguity of a string such as
# '15/04/2020' and the parser warning that goes with it.
search_date = pd.to_datetime(search_date, format='%d/%m/%Y')

# Compare calendar days rather than exact instants: normalize() resets every
# timestamp to midnight, so any reading taken on the searched day counts.
# A plain membership test would only match a reading at exactly 00:00:00.
date_exists = (data[date_column].dt.normalize() == search_date).any()

# Print the result
if date_exists:
    print(f"The date {search_date.date()} exists in the dataset.")
else:
    print(f"The date {search_date.date()} does not exist in the dataset.")


The date 2020-04-15 exists in the dataset.


  search_date = pd.to_datetime(search_date)


NameError: name 'df' is not defined