In [14]:
import os
import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd


# Function to process selected months and CSV files
def process_selected_data(selected_months, selected_csvs, base_path):
    """Load the selected CSV files of each selected month into nested dicts.

    For every month identifier the folder ``<base_path>/<year>_V3/<year>_<month>``
    is inspected, and for every CSV type the file ``<csv_type>- <year>_<month>.csv``
    (with a space after the dash) or ``<csv_type>-<year>_<month>.csv`` is read,
    in that order of preference. Progress and problems are reported with
    ``print``; nothing is raised for missing folders, missing files, unreadable
    files or malformed month identifiers.

    Parameters
    ----------
    selected_months : iterable of str
        Month identifiers of the form ``"<year>_<month>"``, e.g. ``"2017_01"``.
        Identifiers that do not contain exactly one underscore are skipped.
    selected_csvs : iterable of str
        CSV type prefixes, e.g. ``"IVCurves"``.
    base_path : str or path-like
        Folder that contains the ``<year>_V3`` folders.

    Returns
    -------
    dict
        ``{year: {month: {csv_type: DataFrame}}}``. Each DataFrame carries the
        extra columns ``Source``, ``Year`` and ``Month``. A month whose folder
        exists but yields no readable file maps to an empty dict.
    """
    data = {}

    for month in selected_months:
        # The identifier must split into exactly (year, month); anything else
        # would otherwise abort the whole run with a ValueError on unpacking.
        parts = month.split('_')
        if len(parts) != 2:
            print(f"Skipping unrecognised month folder name: {month}")
            continue
        year, month_number = parts

        year_folder = f"{year}_V3"
        month_folder = f"{year}_{month_number}"
        month_path = os.path.join(base_path, year_folder, month_folder)

        print(f"Checking directory: {month_path}")
        if not os.path.isdir(month_path):
            print(f"Directory does not exist: {month_path}")
            continue
        print(f"Directory exists: {month_path}")

        month_data = {}
        for csv_type in selected_csvs:
            # Files exist both with and without a space after the dash;
            # the spaced variant is preferred when both are present.
            file_name_with_space = f"{csv_type}- {month_folder}.csv"
            file_name_without_space = f"{csv_type}-{month_folder}.csv"
            candidate_paths = (
                os.path.join(month_path, file_name_with_space),
                os.path.join(month_path, file_name_without_space),
            )
            file_path = next((path for path in candidate_paths if os.path.isfile(path)), None)

            if file_path is None:
                print(f"File not found: {file_name_with_space} or {file_name_without_space} in {month_folder}")
                continue  # Skip if neither file exists
            print(f"Found file: {file_path}")

            try:
                print(f"Attempting to load file: {file_path}")
                data_frame = pd.read_csv(file_path, low_memory=False)
                print(f"Successfully loaded {len(data_frame)} rows from {file_path}")
            except Exception as e:
                # Best effort: one unreadable file must not stop the others.
                print(f"Error loading file {file_path}: {e}")
                continue

            # Add metadata columns so the origin survives later merging
            data_frame['Source'] = csv_type
            data_frame['Year'] = year
            data_frame['Month'] = month_number

            month_data[csv_type] = data_frame

        # Add the month data to the corresponding year
        data.setdefault(year, {})[month] = month_data

    return data



# GUI for selecting months and CSV files
def selection_gui(base_path):
    """Show a Tk window for choosing months and CSV types, then load them.

    Every sub-folder of every folder directly under ``base_path`` is offered
    as a month checkbox. On Submit the window is closed, the selection is
    passed to ``process_selected_data`` and a row-count overview is printed.

    Parameters
    ----------
    base_path : str or path-like
        Folder whose sub-folders contain the month folders.

    Returns
    -------
    dict
        The ``{year: {month: {csv_type: DataFrame}}}`` structure returned by
        ``process_selected_data`` for the submitted selection, or an empty
        dict when the window is closed without submitting.
    """
    # Filled by on_submit so the loaded data outlives the callback.
    loaded_data = {}

    def on_submit():
        """Validate the ticked boxes, close the window, then load and preview."""
        selected_months = [month for month, var in month_checkboxes.items() if var.get()]
        selected_csvs = [csv for csv, var in csv_checkboxes.items() if var.get()]

        if not selected_months:
            messagebox.showwarning("No Selection", "Please select at least one month.")
            return
        if not selected_csvs:
            messagebox.showwarning("No Selection", "Please select at least one CSV file type.")
            return

        # The selections were read above, so the widgets can go away now.
        root.destroy()

        # Process data and create the hierarchical structure
        final_data = process_selected_data(selected_months, selected_csvs, base_path)
        loaded_data.update(final_data)

        # Display a preview of the processed structure in the terminal
        print("\nData Structure Overview:")
        for year, months in final_data.items():
            print(f"Year: {year}")
            for month, month_data in months.items():
                print(f"  Month: {month}")
                for csv_type, df in month_data.items():
                    print(f"    {csv_type}: {len(df)} rows")

    # Create the main GUI window
    root = tk.Tk()
    root.title("Select Months and CSV Files")
    root.geometry("1200x800")  # Set window size to maximize available screen space

    # Title label
    tk.Label(root, text="Select the months and CSV files you want to process:", font=("Arial", 16, "bold")).pack(pady=10)

    # Section for month selection
    month_frame = ttk.LabelFrame(root, text="Months", padding=(10, 10))
    month_frame.pack(fill="both", expand=True, padx=10, pady=5)

    columns_per_row = 6  # Change number of columns here to adjust layout
    month_checkboxes = {}
    # os.listdir order is arbitrary; sort so the grid is stable between runs.
    for year_folder in sorted(os.listdir(base_path)):
        year_path = os.path.join(base_path, year_folder)
        if not os.path.isdir(year_path):
            continue
        for month_folder in sorted(os.listdir(year_path)):
            if not os.path.isdir(os.path.join(year_path, month_folder)):
                continue
            # A repeated folder name would replace the earlier variable in the
            # dict and leave that earlier checkbox without any effect.
            if month_folder in month_checkboxes:
                continue
            row, col = divmod(len(month_checkboxes), columns_per_row)
            var = tk.BooleanVar()
            month_checkboxes[month_folder] = var
            cb = tk.Checkbutton(month_frame, text=month_folder, variable=var, font=("Arial", 12))
            cb.grid(row=row, column=col, sticky="w", padx=10, pady=5)

    # Section for CSV file selection
    csv_frame = ttk.LabelFrame(root, text="CSV Files", padding=(10, 10))
    csv_frame.pack(fill="both", expand=True, padx=10, pady=5)

    csv_checkboxes = {}
    csv_types = ["IVCurves", "LightSpectra", "SolarFieldData"]
    for row, csv_type in enumerate(csv_types):
        var = tk.BooleanVar()
        csv_checkboxes[csv_type] = var
        cb = tk.Checkbutton(csv_frame, text=csv_type, variable=var, font=("Arial", 12))
        cb.grid(row=row, column=0, sticky="w", padx=10, pady=5)

    # Submit button
    tk.Button(root, text="Submit", command=on_submit, font=("Arial", 14), bg="green", fg="white").pack(pady=20)

    root.mainloop()

    return loaded_data




# Run the selection GUI
# NOTE(review): `base_path` is not assigned in this cell; it is assigned in the
# later loading cell, so this cell relies on that cell having been run first.
selection_gui(base_path)

# Call the function
# NOTE(review): this line raised "NameError: name 'selected_months' is not defined"
# in the recorded run. The selections made in the GUI are local to its Submit
# callback, so `selected_months` / `selected_csvs` only exist at module level
# after the later loading cell (which also defines `load_data`) has been run.
df = load_data(base_path, selected_months, selected_csvs, debug=True)





NameError: name 'selected_months' is not defined

In [21]:
import os
import warnings
from pathlib import Path

import pandas as pd

# Function to load the data with enhanced debugging and path consistency
def load_data(base_path, selected_months, selected_csvs, debug=False):
    """Load the selected monthly CSV files into one combined DataFrame.

    For every month identifier the folder ``<base_path>/<year>_V3/<year>_<month>``
    is inspected. ``LightSpectra`` files are looked up as
    ``LightSpectra- <year>_<month>.csv`` (space after the dash) and then as
    ``LightSpectra-<year>_<month>.csv``; every other type only as
    ``<csv_type>-<year>_<month>.csv``. Missing folders and files are skipped
    (reported only when ``debug`` is true). Files that fail to load and
    malformed month identifiers are skipped with a warning regardless of
    ``debug``, so data cannot go missing silently.

    Parameters
    ----------
    base_path : str or path-like
        Master folder that contains the ``<year>_V3`` folders.
    selected_months : iterable of str
        Month identifiers of the form ``"<year>_<month>"``, e.g. ``"2017_01"``.
    selected_csvs : iterable of str
        CSV type prefixes, e.g. ``"IVCurves"``.
    debug : bool, default False
        Print progress information while searching and loading.

    Returns
    -------
    pandas.DataFrame
        All loaded files concatenated with a fresh index, each row tagged with
        the string columns ``Source``, ``Year`` and ``Month``. An empty
        DataFrame when nothing could be loaded.

    Raises
    ------
    FileNotFoundError
        If ``base_path`` is not an existing directory.
    """
    # Ensure the base path is normalized
    base_path = Path(base_path)

    # Check if the master folder exists
    if debug:
        print(f"Master folder exists: {base_path}: {'Yes' if base_path.is_dir() else 'No'}")
    if not base_path.is_dir():
        raise FileNotFoundError(f"The master folder does not exist: {base_path}")

    data = []  # Store data as a list of DataFrames

    for month in selected_months:
        # The identifier must split into exactly (year, month); anything else
        # would otherwise abort the whole load with a ValueError on unpacking.
        parts = month.split('_')
        if len(parts) != 2:
            warnings.warn(
                f"Skipping month with unexpected format (expected '<year>_<month>'): {month!r}",
                stacklevel=2,
            )
            continue
        year, month_number = parts

        year_folder = f"{year}_V3"
        month_folder = f"{year}_{month_number}"
        month_path = base_path / year_folder / month_folder  # Use Path for consistency

        if debug:
            print(f"Checking directory: {month_path}")

        if not month_path.is_dir():
            if debug:
                print(f"Directory does not exist: {month_path}")
            continue

        if debug:
            print(f"Directory exists: {month_path}")

        for csv_type in selected_csvs:
            if debug:
                print(f"Processing CSV type: {csv_type}")

            if csv_type == "LightSpectra":
                # LightSpectra files occur both with and without a space after
                # the dash; the spaced variant is preferred when both exist.
                candidates = (
                    month_path / f"{csv_type}- {month_folder}.csv",
                    month_path / f"{csv_type}-{month_folder}.csv",
                )
                file_path = next((path for path in candidates if path.exists()), None)
                if file_path is None:
                    if debug:
                        print(f"LightSpectra file not found for either format in: {month_path}")
                    continue  # Skip if neither file exists
                if debug:
                    print(f"Found file: {file_path}")
            else:
                file_path = month_path / f"{csv_type}-{month_folder}.csv"
                if debug:
                    print(f"Checking file: {file_path}")
                if not file_path.exists():
                    if debug:
                        print(f"File does not exist: {file_path}")
                    continue

            if debug:
                print(f"Loading file: {file_path}")
            try:
                data_frame = pd.read_csv(file_path, low_memory=False)
            except Exception as e:
                # Best effort: keep loading the other files, but always report
                # the failure, even when debug output is switched off.
                warnings.warn(f"Error loading file {file_path}: {e}", stacklevel=2)
                continue

            if debug:
                print(f"Successfully loaded {len(data_frame)} rows from {file_path}")

            # Add metadata columns so rows stay attributable after concatenation
            data_frame['Source'] = csv_type
            data_frame['Year'] = year
            data_frame['Month'] = month_number

            data.append(data_frame)

    # Combine all DataFrames into a single DataFrame
    combined_data = pd.concat(data, ignore_index=True) if data else pd.DataFrame()
    return combined_data

if __name__ == "__main__":
    # Base path to the data (relative to the notebook's working directory)
    base_path = "../../OneDrive - HvA/Jaar_4/PV systems modeling and analysis/Data/data_folder"

    # Months to load: January through May 2017 (range(1, 6) stops before 6)
    selected_months = [f"2017_{month:02d}" for month in range(1, 6)]

    # Specify the CSV types you want to load
    selected_csvs = ["IVCurves", "LightSpectra", "SolarFieldData"]

    # Load the data
    combined_df = load_data(base_path, selected_months, selected_csvs, debug=True)

    if combined_df.empty:
        # load_data returns a column-less frame when nothing was found, so the
        # 'Source' / 'Year' / 'Month' filters below would raise KeyError.
        print("\nNo data was loaded; check base_path, selected_months and selected_csvs.")
    else:
        # Display the combined DataFrame
        print("\nCombined DataFrame for 2017:")
        print(combined_df.head())

        # Example: Access all LightSpectra data for 2017
        light_spectra_2017 = combined_df[combined_df['Source'] == 'LightSpectra']
        print("\nLightSpectra Data for 2017:")
        print(light_spectra_2017.head())

        # Example: Access all data for January 2017
        # (load_data stores Year and Month as strings, hence the quoted values)
        data_january_2017 = combined_df[
            (combined_df['Year'] == '2017') &
            (combined_df['Month'] == '01')
        ]
        print("\nData for January 2017:")
        print(data_january_2017.head())

        # Example: Save the combined data to a CSV for later use
        combined_df.to_csv("combined_data_2017.csv", index=False)


Master folder exists: ..\..\OneDrive - HvA\Jaar_4\PV systems modeling and analysis\Data\data_folder: Yes
Checking directory: ..\..\OneDrive - HvA\Jaar_4\PV systems modeling and analysis\Data\data_folder\2017_V3\2017_01
Directory exists: ..\..\OneDrive - HvA\Jaar_4\PV systems modeling and analysis\Data\data_folder\2017_V3\2017_01
Processing CSV type: IVCurves
Checking file: ..\..\OneDrive - HvA\Jaar_4\PV systems modeling and analysis\Data\data_folder\2017_V3\2017_01\IVCurves-2017_01.csv
Loading file: ..\..\OneDrive - HvA\Jaar_4\PV systems modeling and analysis\Data\data_folder\2017_V3\2017_01\IVCurves-2017_01.csv
Successfully loaded 8928 rows from ..\..\OneDrive - HvA\Jaar_4\PV systems modeling and analysis\Data\data_folder\2017_V3\2017_01\IVCurves-2017_01.csv
Processing CSV type: LightSpectra
Found file: ..\..\OneDrive - HvA\Jaar_4\PV systems modeling and analysis\Data\data_folder\2017_V3\2017_01\LightSpectra- 2017_01.csv
Loading file: ..\..\OneDrive - HvA\Jaar_4\PV systems modeling an

In [23]:
# Display the January 2017 subset (Year == '2017', Month == '01') built in the
# loading cell; it contains the rows of every loaded source for that month.
data_january_2017

Unnamed: 0,AbsTime[s],Date[weekday],Date[day],Date[month],Date[year],Time[hh:mm:ss],Module #_1 (Solarpanel curves),V list[V]_1 (Solarpanel curves),I list[A]_1 (Solarpanel curves),average G [W/m²]_1 (Solarpanel curves),...,Integration time (LAD Sensor 11),Sensor (LAD Sensor 12),Red light (LAD Sensor 12),Green light (LAD Sensor 12),Blue light (LAD Sensor 12),Clear light (LAD Sensor 12),Infrared light (LAD Sensor 12),Temperature [°C] (LAD Sensor 12),Gain (LAD Sensor 12),Integration time (LAD Sensor 12)
0,3692217600,Sunday,1,January,2017,00:00:00,,,,,...,,,,,,,,,,
1,3692217900,Sunday,1,January,2017,00:05:00,,,,,...,,,,,,,,,,
2,3692218200,Sunday,1,January,2017,00:10:00,,,,,...,,,,,,,,,,
3,3692218500,Sunday,1,January,2017,00:15:00,,,,,...,,,,,,,,,,
4,3692218800,Sunday,1,January,2017,00:20:00,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26779,3694894500,Tuesday,31,January,2017,23:35:00,,,,,...,,,,,,,,,,
26780,3694894800,Tuesday,31,January,2017,23:40:00,,,,,...,,,,,,,,,,
26781,3694895100,Tuesday,31,January,2017,23:45:00,,,,,...,,,,,,,,,,
26782,3694895400,Tuesday,31,January,2017,23:50:00,,,,,...,,,,,,,,,,


In [40]:
# Distinct values of the 'Source' tag across the whole combined frame
unique_sources = combined_df.loc[:, 'Source'].unique()
print(unique_sources)


['IVCurves' 'LightSpectra' 'SolarFieldData']


In [42]:
# Requires combined_df from the loading cell.
# Keep only the rows tagged as January 2017 (Year and Month are compared as strings).
january_2017_data = combined_df.loc[
    combined_df['Year'].eq('2017') & combined_df['Month'].eq('01')
]

# Sources that actually occur within that month
unique_sources = january_2017_data['Source'].unique()

# Print one section per source, each followed by a blank separator
for source in unique_sources:
    source_data = january_2017_data.loc[january_2017_data['Source'].eq(source)]
    print(f"Data for Source: {source}")
    print(source_data)
    print("\n")


Data for Source: IVCurves
      AbsTime[s] Date[weekday]  Date[day] Date[month]  Date[year]  \
0     3692217600        Sunday          1     January        2017   
1     3692217900        Sunday          1     January        2017   
2     3692218200        Sunday          1     January        2017   
3     3692218500        Sunday          1     January        2017   
4     3692218800        Sunday          1     January        2017   
...          ...           ...        ...         ...         ...   
8923  3694894500       Tuesday         31     January        2017   
8924  3694894800       Tuesday         31     January        2017   
8925  3694895100       Tuesday         31     January        2017   
8926  3694895400       Tuesday         31     January        2017   
8927  3694895700       Tuesday         31     January        2017   

     Time[hh:mm:ss]  Module #_1 (Solarpanel curves)  \
0          00:00:00                             NaN   
1          00:05:00                

In [43]:
# For each source, list the columns that hold at least one non-missing value
for source in unique_sources:
    source_data = january_2017_data.loc[january_2017_data['Source'].eq(source)]
    non_nan_columns = source_data.columns[source_data.notna().any(axis=0)].tolist()
    print(f"Non-NaN columns for Source: {source}")
    print(non_nan_columns)
    print("\n")


Non-NaN columns for Source: IVCurves
['AbsTime[s]', 'Date[weekday]', 'Date[day]', 'Date[month]', 'Date[year]', 'Time[hh:mm:ss]', 'Module #_1 (Solarpanel curves)', 'V list[V]_1 (Solarpanel curves)', 'I list[A]_1 (Solarpanel curves)', 'average G [W/m²]_1 (Solarpanel curves)', 'Module #_2 (Solarpanel curves)', 'V list[V]_2 (Solarpanel curves)', 'I list[A]_2 (Solarpanel curves)', 'average G [W/m²]_2 (Solarpanel curves)', 'Module #_3 (Solarpanel curves)', 'V list[V]_3 (Solarpanel curves)', 'I list[A]_3 (Solarpanel curves)', 'average G [W/m²]_3 (Solarpanel curves)', 'Module #_4 (Solarpanel curves)', 'V list[V]_4 (Solarpanel curves)', 'I list[A]_4 (Solarpanel curves)', 'average G [W/m²]_4 (Solarpanel curves)', 'Module #_5 (Solarpanel curves)', 'V list[V]_5 (Solarpanel curves)', 'I list[A]_5 (Solarpanel curves)', 'average G [W/m²]_5 (Solarpanel curves)', 'Module #_6 (Solarpanel curves)', 'V list[V]_6 (Solarpanel curves)', 'I list[A]_6 (Solarpanel curves)', 'average G [W/m²]_6 (Solarpanel curve

In [35]:
# Show the first row of `light_spectra_january_2017`.
# NOTE(review): this name is not assigned in any cell visible here; presumably
# it is the LightSpectra subset of the January 2017 data. Confirm, and define
# it above this cell so the notebook survives Restart & Run All.
light_spectra_january_2017.head(1)


Unnamed: 0,AbsTime[s],Date[weekday],Date[day],Date[month],Date[year],Time[hh:mm:ss],Module #_1 (Solarpanel curves),V list[V]_1 (Solarpanel curves),I list[A]_1 (Solarpanel curves),average G [W/m²]_1 (Solarpanel curves),...,Integration time (LAD Sensor 11),Sensor (LAD Sensor 12),Red light (LAD Sensor 12),Green light (LAD Sensor 12),Blue light (LAD Sensor 12),Clear light (LAD Sensor 12),Infrared light (LAD Sensor 12),Temperature [°C] (LAD Sensor 12),Gain (LAD Sensor 12),Integration time (LAD Sensor 12)
8928,3692217600,Sunday,1,January,2017,00:00:00,,,,,...,,,,,,,,,,
