In [1]:
import math
import pathlib
import os.path
import pandas as pd
from typing import Tuple
import datetime
import re

In [2]:

# Notebook working directory, resolved to an absolute path.
PATH = pathlib.Path(".").resolve()
# Data folder located directly below the working directory (kept as a plain str).
DATA_PATH = os.path.join(PATH, "auxillary")

In [13]:
# Build an index of all "*.txt" data files below DATA_PATH/<folder>/.
# File names are expected to look like "<prefix><YYYY>_<zone parts...>_<temp>.txt";
# names with fewer than three "_"-separated parts are ignored.
files = []
# os.listdir returns entries in arbitrary order; sort so the index is reproducible.
for folder in sorted(os.listdir(DATA_PATH)):
    folder_path = os.path.join(DATA_PATH, folder)
    # Skip stray files next to the data folders (listing them would raise).
    if not os.path.isdir(folder_path):
        continue
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        parts = filename.split("_")
        if len(parts) < 3:
            continue
        files.append({
            "Year": parts[0][-4:],                # last four characters of the first part
            "Zone": " ".join(parts[1:-1]),        # multi-part zone names are joined by spaces
            "Temp": parts[-1].split(".")[0],      # last part without the file extension
            "Filename": filename,
            "Path": os.path.join(folder_path, filename),
        })

In [14]:
# Turn the collected records into the file index.
# The columns are pinned so the frame keeps its schema even when no file was found
# (otherwise an empty `files` list yields a frame without columns).
df = pd.DataFrame(files, columns=["Year", "Zone", "Temp", "Filename", "Path"])
df

Unnamed: 0,Year,Zone,Temp,Filename,Path
0,2015,Berlin,kalt,TRY2015_Berlin_kalt.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
1,2015,Berlin,normal,TRY2015_Berlin_normal.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
2,2015,Berlin,warm,TRY2015_Berlin_warm.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
3,2015,Bremerhaven,kalt,TRY2015_Bremerhaven_kalt.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
4,2015,Bremerhaven,normal,TRY2015_Bremerhaven_normal.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
...,...,...,...,...,...
72,2045,Rostock,normal,TRY2045_Rostock_normal.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
73,2045,Rostock,warm,TRY2045_Rostock_warm.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
74,2045,Worms,kalt,TRY2045_Worms_kalt.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...
75,2045,Worms,normal,TRY2045_Worms_normal.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...


In [18]:
# Distinct zone names present in the file index.
df.loc[:, "Zone"].unique()


array(['Berlin', 'Bremerhaven', 'Dresden', 'Freiburg',
       'Garmisch Partenkirchen', 'Göttingen', 'Hannover', 'Kiel',
       'München', 'Nürnberg', 'Oldenburg', 'Rostock', 'Worms'],
      dtype=object)

In [50]:
def filter_filenames(df, year=None, zone=None, temp=None):
    """
    Filter the file index by year, zone and/or temperature scenario.

    Parameters
    ----------
    df : pandas.DataFrame
        File index with the columns "Year", "Zone" and "Temp".
    year, zone, temp : optional
        Value the corresponding column must equal. A criterion left as
        ``None`` is not applied, so calling the function without criteria
        returns a copy of the whole index. Values are compared as strings,
        so ``year=2015`` and ``year="2015"`` select the same rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows; ``df`` itself is not modified.
    """
    criteria = {"Year": year, "Zone": zone, "Temp": temp}

    filtered_df = df.copy()
    for column, wanted in criteria.items():
        if wanted is None:
            # Omitted criterion: keep all rows instead of comparing against None.
            continue
        matches = filtered_df[column].astype(str) == str(wanted)
        filtered_df = filtered_df[matches]
    return filtered_df


def _parse_try_file(path):
    """Parse a single data file into a DataFrame (one row per data line)."""
    with open(path, 'r') as file:
        lines = file.readlines()

    # The column header is the SECOND line that starts with "RW"; the first
    # such line is skipped (presumably a legend entry -- TODO confirm).
    rw_line_indices = [
        i for i, line in enumerate(lines) if line.strip().startswith("RW")
    ]
    if len(rw_line_indices) < 2:
        raise ValueError("Column headers not found in the file.")
    header_index = rw_line_indices[1]

    column_names = lines[header_index].split()

    # Data starts two lines below the header (the line in between is the
    # "***" separator). Blank lines and further "***" lines are ignored.
    rows = []
    for line in lines[header_index + 2:]:
        stripped = line.strip()
        if stripped and not stripped.startswith("***"):
            rows.append(line.split())

    frame = pd.DataFrame(rows, columns=column_names)

    # Convert the known numeric columns; unparsable values become NaN.
    numeric_columns = ["RW", "HW", "MM", "DD", "HH", "t", "p", "WR", "WG", "N", "x", "RF", "B", "D", "A", "E", "IL"]
    for col in numeric_columns:
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')

    return frame


def read_file_contents(df):
    """
    Read the contents of the files listed in the file index.

    Parameters
    ----------
    df : pandas.DataFrame
        File index; every row's "Path" entry is opened and parsed.

    Returns
    -------
    pandas.DataFrame
        The parsed data. For a single file this is that file's table. For
        several files the tables are concatenated in index order with a
        fresh row index (rows carry no marker of their source file, so
        filter the index down to one file when that matters). An empty
        index yields an empty DataFrame.

    Raises
    ------
    ValueError
        If a file does not contain a column header line.
    """
    # Read each row's own path; the input frame is never overwritten, so
    # indexes with more than one file work as well.
    frames = [_parse_try_file(path) for path in df["Path"]]

    if not frames:
        return pd.DataFrame()
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames, ignore_index=True)

In [51]:
# Selection criteria for the data set to load (all kept as strings,
# matching the string columns of the file index).
year, zone, temp = "2015", "Zone0", "kalt"

In [52]:
# Narrow the file index down to the selected year / zone / temperature scenario.
filtered_df = filter_filenames(df, year=year, zone=zone, temp=temp)
filtered_df

Unnamed: 0,Year,Zone,Temp,Filename,Path
0,2015,Zone0,kalt,TRY2015_Zone0_kalt.txt,/mnt/d/CODE/github/wissen-digital-ewb/webcentr...


In [71]:
# Load the selected file and keep only the time stamp parts and the column "t".
# `.assign` returns a new frame, which avoids writing a column into a
# column-subset of another frame (SettingWithCopyWarning), and the year label
# follows the selected `year` instead of a hard-coded literal.
file_contents = (
    read_file_contents(filtered_df)
    .loc[:, ["MM", "DD", "HH", "t"]]
    .assign(year=year)
)
file_contents

Unnamed: 0,MM,DD,HH,t,year
0,1,1,1,-0.8,2015
1,1,1,2,-1.0,2015
2,1,1,3,-1.2,2015
3,1,1,4,-1.1,2015
4,1,1,5,-1.3,2015
...,...,...,...,...,...
8755,12,31,20,-0.4,2015
8756,12,31,21,-0.4,2015
8757,12,31,22,-0.5,2015
8758,12,31,23,-0.5,2015


In [70]:
# Scratch check: show the value and its Python type, one per line.
a = "2015"

print(a, type(a), sep="\n")

2015
<class 'str'>
