In [14]:
import os
import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd


# Function to process selected months and CSV files
def process_selected_data(selected_months, selected_csvs, base_path):
    """Load the selected CSV files for each selected month into a nested dict.

    For every month identifier of the form ``"<year>_<month>"`` the function
    looks in ``<base_path>/<year>_V3/<year>_<month>/`` for one file per
    requested CSV type. A file may be named ``"<type>- <year>_<month>.csv"``
    or ``"<type>-<year>_<month>.csv"``; the variant with the space wins when
    both exist. Progress and problems are reported with ``print``.

    Parameters:
        selected_months (iterable of str): Month identifiers, e.g. ``"2017_01"``.
            Identifiers that do not split into exactly two ``_``-separated
            parts are skipped with a message instead of raising.
        selected_csvs (iterable of str): CSV type prefixes, e.g. ``"IVCurves"``.
        base_path (str): Folder that contains the ``<year>_V3`` folders.

    Returns:
        dict: ``{year: {month: {csv_type: DataFrame}}}``. Each DataFrame gets
        three extra columns: ``Source``, ``Year`` and ``Month``. A month whose
        folder exists is always present (possibly with an empty inner dict);
        months with a missing folder or a malformed identifier are omitted.
    """
    data = {}

    for month in selected_months:
        # Folder names offered to the user are not guaranteed to look like
        # "YYYY_MM"; unpacking a bad split would abort the whole load.
        parts = month.split('_')
        if len(parts) != 2:
            print(f"Skipping malformed month identifier: {month!r} (expected '<year>_<month>')")
            continue
        year, month_number = parts

        year_folder = f"{year}_V3"
        month_folder = f"{year}_{month_number}"
        month_path = os.path.join(base_path, year_folder, month_folder)

        print(f"Checking directory: {month_path}")
        if not os.path.isdir(month_path):
            print(f"Directory does not exist: {month_path}")
            continue
        print(f"Directory exists: {month_path}")

        month_data = {}
        for csv_type in selected_csvs:
            # Two naming variants occur on disk; try the one with the space first.
            candidate_names = (
                f"{csv_type}- {month_folder}.csv",
                f"{csv_type}-{month_folder}.csv",
            )
            file_path = None
            for candidate_name in candidate_names:
                candidate_path = os.path.join(month_path, candidate_name)
                if os.path.isfile(candidate_path):
                    file_path = candidate_path
                    break

            if file_path is None:
                print(f"File not found: {candidate_names[0]} or {candidate_names[1]} in {month_folder}")
                continue  # Skip if neither file exists
            print(f"Found file: {file_path}")

            try:
                print(f"Attempting to load file: {file_path}")
                data_frame = pd.read_csv(file_path, low_memory=False)
                print(f"Successfully loaded {len(data_frame)} rows from {file_path}")

                # Tag every row with where it came from.
                data_frame['Source'] = csv_type
                data_frame['Year'] = year
                data_frame['Month'] = month_number

                month_data[csv_type] = data_frame
            except Exception as e:
                # Best effort: one unreadable file must not stop the other loads.
                print(f"Error loading file {file_path}: {e}")

        # Group the month under its year (the key is the full month identifier).
        data.setdefault(year, {})[month] = month_data

    return data



# GUI for selecting months and CSV files
def selection_gui(base_path):
    """Show a Tk window for choosing months and CSV types, then load the chosen data.

    The window lists one checkbox per sub-folder found two levels below
    ``base_path`` (``<base_path>/<folder>/<sub-folder>``) and one checkbox per
    known CSV type. Pressing "Submit" with at least one month and one CSV type
    selected closes the window, passes the selection to
    ``process_selected_data`` and prints an overview of what was loaded.

    Parameters:
        base_path (str): Folder whose sub-folders are scanned for month folders.

    Returns:
        dict: The structure returned by ``process_selected_data`` for the
        submitted selection, or an empty dict when the window was closed
        without a valid submission. (Earlier versions returned nothing, which
        left the caller no way to reach the loaded data.)
    """
    # Filled in by on_submit; returned once the Tk main loop has ended.
    final_data = {}

    def on_submit():
        """Validate the selection, close the window and load the data."""
        selected_months = [month for month, var in month_checkboxes.items() if var.get()]
        selected_csvs = [csv for csv, var in csv_checkboxes.items() if var.get()]

        if not selected_months:
            messagebox.showwarning("No Selection", "Please select at least one month.")
            return
        if not selected_csvs:
            messagebox.showwarning("No Selection", "Please select at least one CSV file type.")
            return

        # The checkbox values have been read already, so the window can go.
        root.destroy()

        # Process data and create the hierarchical structure
        final_data.update(process_selected_data(selected_months, selected_csvs, base_path))

        # Display a preview of the processed structure in the terminal
        print("\nData Structure Overview:")
        for year, months in final_data.items():
            print(f"Year: {year}")
            for month, month_data in months.items():
                print(f"  Month: {month}")
                for csv_type, df in month_data.items():
                    print(f"    {csv_type}: {len(df)} rows")

    # Create the main GUI window
    root = tk.Tk()
    root.title("Select Months and CSV Files")
    root.geometry("1200x800")  # Set window size to maximize available screen space

    # Title label
    tk.Label(root, text="Select the months and CSV files you want to process:", font=("Arial", 16, "bold")).pack(pady=10)

    # Section for month selection
    month_frame = ttk.LabelFrame(root, text="Months", padding=(10, 10))
    month_frame.pack(fill="both", expand=True, padx=10, pady=5)

    month_checkboxes = {}
    row, col = 0, 0
    # os.listdir returns entries in arbitrary order; sort so the grid is
    # laid out the same way on every run.
    for year_folder in sorted(os.listdir(base_path)):
        year_path = os.path.join(base_path, year_folder)
        if not os.path.isdir(year_path):
            continue
        for month_folder in sorted(os.listdir(year_path)):
            month_path = os.path.join(year_path, month_folder)
            if not os.path.isdir(month_path):
                continue
            var = tk.BooleanVar()
            # Keyed by folder name only: equal names under different parent
            # folders share one entry (the last one created wins).
            month_checkboxes[month_folder] = var
            cb = tk.Checkbutton(month_frame, text=month_folder, variable=var, font=("Arial", 12))
            cb.grid(row=row, column=col, sticky="w", padx=10, pady=5)
            col += 1
            if col >= 6:  # Change number of columns here to adjust layout
                col = 0
                row += 1

    # Section for CSV file selection
    csv_frame = ttk.LabelFrame(root, text="CSV Files", padding=(10, 10))
    csv_frame.pack(fill="both", expand=True, padx=10, pady=5)

    csv_checkboxes = {}
    csv_types = ["IVCurves", "LightSpectra", "SolarFieldData"]
    for row, csv_type in enumerate(csv_types):
        var = tk.BooleanVar()
        csv_checkboxes[csv_type] = var
        cb = tk.Checkbutton(csv_frame, text=csv_type, variable=var, font=("Arial", 12))
        cb.grid(row=row, column=0, sticky="w", padx=10, pady=5)

    # Submit button
    tk.Button(root, text="Submit", command=on_submit, font=("Arial", 14), bg="green", fg="white").pack(pady=20)

    # Blocks until the window is destroyed (by on_submit or by the user).
    root.mainloop()

    return final_data




# Run the selection GUI
# NOTE(review): `base_path` is not assigned in this cell; the only visible
# assignment is in a later cell, so this relies on earlier kernel state — confirm.
selection_gui(base_path)

# Call the function
# NOTE(review): this line raises the NameError shown in the cell output.
# `selected_months` and `selected_csvs` exist only as local variables inside
# selection_gui's on_submit callback, so they are not defined at this level.
# `load_data` is not defined in the visible cells either — TODO confirm where
# it lives, or whether `load_and_sync_data` was meant.
df = load_data(base_path, selected_months, selected_csvs, debug=True)





NameError: name 'selected_months' is not defined

In [84]:
import os
import pandas as pd
from pathlib import Path

def load_and_sync_data(base_path, selected_months, selected_csvs, debug=False):
    """
    Load and synchronize data from multiple CSV sources, ensuring rows have data from all sources.

    For each source the monthly files are read from
    ``<base_path>/<year>_V3/<year>_<month>/`` and concatenated. The per-source
    frames are then inner-joined on the ``AbsTime[s]`` column, in the order
    given by ``selected_csvs``. Both file-name variants
    (``"<source>-<year>_<month>.csv"`` and ``"<source>- <year>_<month>.csv"``)
    are accepted for every source.

    Parameters:
        base_path (str): Base path to the dataset folder.
        selected_months (list of str): List of months to load (e.g., ['2017_01', '2018_01']).
        selected_csvs (list of str): List of data sources to include (e.g., ['LightSpectra', 'SolarFieldData']).
        debug (bool): Whether to enable debug output.

    Returns:
        pd.DataFrame: Combined DataFrame with synchronized rows across all sources.
        Columns of the first source keep their names; clashing columns of a
        later source get the suffix ``_<source>``. A source for which no file
        could be loaded is left out of the join. An empty DataFrame is
        returned when nothing was loaded at all.

    Raises:
        FileNotFoundError: If ``base_path`` is not an existing directory.
        ValueError: If a month identifier does not split into exactly two
            ``_``-separated parts.
    """
    import warnings  # local import keeps this block self-contained

    base_path = Path(base_path)
    if not base_path.is_dir():
        raise FileNotFoundError(f"The base path does not exist: {base_path}")

    # One concatenated DataFrame per source that yielded data.
    source_dataframes = {}

    for csv_type in selected_csvs:
        monthly_frames = []

        for month in selected_months:
            year, month_number = month.split('_')
            year_folder = f"{year}_V3"
            month_folder = f"{year}_{month_number}"
            month_path = base_path / year_folder / month_folder

            if not month_path.is_dir():
                if debug:
                    print(f"Skipping non-existent directory: {month_path}")
                continue

            with_space = month_path / f"{csv_type}- {month_folder}.csv"
            without_space = month_path / f"{csv_type}-{month_folder}.csv"
            # Keep the historical preference when both variants exist
            # (LightSpectra: space first; others: no space first), but fall
            # back to the other spelling for every source.
            if csv_type == "LightSpectra":
                candidates = (with_space, without_space)
            else:
                candidates = (without_space, with_space)

            file_path = next((path for path in candidates if path.is_file()), None)
            if file_path is None:
                if debug:
                    print(f"File not found: {candidates[0]} or {candidates[1]}")
                continue

            try:
                data_frame = pd.read_csv(file_path, low_memory=False)
            except Exception as e:
                # Best effort: skip the unreadable file, but never silently.
                if debug:
                    print(f"Error loading file {file_path}: {e}")
                else:
                    warnings.warn(f"Error loading file {file_path}: {e}")
                continue

            data_frame['Source'] = csv_type  # Add source metadata
            data_frame['Year'] = year
            data_frame['Month'] = month_number
            monthly_frames.append(data_frame)

        # Concatenate all data for the current source
        if monthly_frames:
            source_dataframes[csv_type] = pd.concat(monthly_frames, ignore_index=True)
        elif debug:
            print(f"No data loaded for source: {csv_type}")

    # Synchronize data across all sources using inner join on AbsTime[s]
    synced_data = None
    for source, df in source_dataframes.items():
        if synced_data is None:
            synced_data = df
        else:
            synced_data = pd.merge(synced_data, df, on='AbsTime[s]', how='inner', suffixes=('', f'_{source}'))

    return synced_data if synced_data is not None else pd.DataFrame()

if __name__ == "__main__":
    # Dataset root, relative to the working directory of the notebook
    base_path = "../../OneDrive - HvA/Jaar_4/PV systems modeling and analysis/Data/data_folder"

    # Months and data sources that go into the combined frame
    selected_months = get_selected_months(years=[2017, 2018], months=[1])
    selected_csvs = ["LightSpectra", "IVCurves"]

    # Read every selected file and join the sources on their shared time column
    synced_df = load_and_sync_data(
        base_path,
        selected_months,
        selected_csvs,
        debug=True,
    )

    # Preview the first rows of the result
    print()
    print("Synchronized DataFrame:")
    print(synced_df.head())

    # Keep a copy on disk for later use (optional)
    output_csv = "synced_data_january_2017_2018.csv"
    synced_df.to_csv(output_csv, index=False)
    print(f"Synchronized data saved to {output_csv}")



Synchronized DataFrame:
   AbsTime[s] Date[weekday]  Date[day] Date[month]  Date[year] Time[hh:mm:ss]  \
0  3692217600        Sunday          1     January        2017       00:00:00   
1  3692217900        Sunday          1     January        2017       00:05:00   
2  3692218200        Sunday          1     January        2017       00:10:00   
3  3692218500        Sunday          1     January        2017       00:15:00   
4  3692218800        Sunday          1     January        2017       00:20:00   

  Memo (Spectrometer) Sensor (Spectrometer)  \
0           EKO WISER                MS-711   
1           EKO WISER                MS-711   
2           EKO WISER                MS-711   
3           EKO WISER                MS-711   
4           EKO WISER                MS-711   

   Exposure time[milliseconds] (Spectrometer)  \
0                                      5000.0   
1                                      5000.0   
2                                      5000.0   
3        

In [85]:
# Display the synchronized frame assigned to `synced_df` by the driver cell above.
synced_df

Unnamed: 0,AbsTime[s],Date[weekday],Date[day],Date[month],Date[year],Time[hh:mm:ss],Memo (Spectrometer),Sensor (Spectrometer),Exposure time[milliseconds] (Spectrometer),Sensor temperature[°C] (Spectrometer),...,V list[V]_5 (Solarpanel curves),I list[A]_5 (Solarpanel curves),average G [W/m²]_5 (Solarpanel curves),Module #_6 (Solarpanel curves),V list[V]_6 (Solarpanel curves),I list[A]_6 (Solarpanel curves),average G [W/m²]_6 (Solarpanel curves),Source_IVCurves,Year_IVCurves,Month_IVCurves
0,3692217600,Sunday,1,January,2017,00:00:00,EKO WISER,MS-711,5000.0,24.4,...,,,,,,,,IVCurves,2017,01
1,3692217900,Sunday,1,January,2017,00:05:00,EKO WISER,MS-711,5000.0,24.3,...,,,,,,,,IVCurves,2017,01
2,3692218200,Sunday,1,January,2017,00:10:00,EKO WISER,MS-711,5000.0,24.5,...,,,,,,,,IVCurves,2017,01
3,3692218500,Sunday,1,January,2017,00:15:00,EKO WISER,MS-711,5000.0,24.3,...,,,,,,,,IVCurves,2017,01
4,3692218800,Sunday,1,January,2017,00:20:00,EKO WISER,MS-711,5000.0,24.5,...,,,,,,,,IVCurves,2017,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17851,3726430500,Wednesday,31,January,2018,23:35:00,EKO WISER,MS-711,5000.0,24.4,...,,,,,,,,IVCurves,2018,01
17852,3726430800,Wednesday,31,January,2018,23:40:00,EKO WISER,MS-711,5000.0,24.5,...,,,,,,,,IVCurves,2018,01
17853,3726431100,Wednesday,31,January,2018,23:45:00,EKO WISER,MS-711,5000.0,24.5,...,,,,,,,,IVCurves,2018,01
17854,3726431400,Wednesday,31,January,2018,23:50:00,EKO WISER,MS-711,5000.0,24.3,...,,,,,,,,IVCurves,2018,01


In [76]:
# NOTE(review): in the visible cells `synced_data` is only a local variable of
# load_and_sync_data. This cell (execution count 76, earlier than the cells
# above) presumably relied on a global from a previous notebook state and would
# fail on a fresh kernel — confirm, then display `synced_df` or remove the cell.
synced_data

Unnamed: 0,AbsTime[s],Date[weekday],Date[day],Date[month],Date[year],Time[hh:mm:ss],Module #_1 (Solarpanel curves)_LightSpectra,V list[V]_1 (Solarpanel curves)_LightSpectra,I list[A]_1 (Solarpanel curves)_LightSpectra,average G [W/m²]_1 (Solarpanel curves)_LightSpectra,...,Integration time (LAD Sensor 11)_SolarFieldData,Sensor (LAD Sensor 12)_SolarFieldData,Red light (LAD Sensor 12)_SolarFieldData,Green light (LAD Sensor 12)_SolarFieldData,Blue light (LAD Sensor 12)_SolarFieldData,Clear light (LAD Sensor 12)_SolarFieldData,Infrared light (LAD Sensor 12)_SolarFieldData,Temperature [°C] (LAD Sensor 12)_SolarFieldData,Gain (LAD Sensor 12)_SolarFieldData,Integration time (LAD Sensor 12)_SolarFieldData
0,3692217600,Sunday,1,January,2017,00:00:00,,,,,...,,,,,,,,,,
1,3692217900,Sunday,1,January,2017,00:05:00,,,,,...,,,,,,,,,,
2,3692218200,Sunday,1,January,2017,00:10:00,,,,,...,,,,,,,,,,
3,3692218500,Sunday,1,January,2017,00:15:00,,,,,...,,,,,,,,,,
4,3692218800,Sunday,1,January,2017,00:20:00,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17851,3726430500,Wednesday,31,January,2018,23:35:00,,,,,...,,,,,,,,,,
17852,3726430800,Wednesday,31,January,2018,23:40:00,,,,,...,,,,,,,,,,
17853,3726431100,Wednesday,31,January,2018,23:45:00,,,,,...,,,,,,,,,,
17854,3726431400,Wednesday,31,January,2018,23:50:00,,,,,...,,,,,,,,,,
