In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import glob

In [20]:
# Initialize an empty DataFrame to append all the data
overall_df = pd.DataFrame()

# Iterate over all CSV files in the downloaded_csvs folder
for file_name in glob.glob('data/downloaded_csvs/*.csv'):
    # Read the current CSV file
    temp_df = pd.read_csv(file_name, header=None, names=['hour_number', 'price', 'to_drop'])
    # Extract date from the filename dynamically and prepend to the hour number
    date_str = file_name.split('_')[-1].split('.')[0]  # Extracts date from the filename
    temp_df['hour_number'] = date_str + '-' + temp_df['hour_number'].astype(str)
    # Append the current DataFrame to the overall DataFrame
    overall_df = pd.concat([overall_df, temp_df], ignore_index=True)


In [21]:
overall_df

Unnamed: 0,hour_number,price,to_drop
0,20240223-1,25.32,DSO-RD;
1,20240223-2,23.86,DSO-RD;
2,20240223-3,20.92,DSO-RD;
3,20240223-4,19.73,DSO-RD;
4,20240223-5,20.07,DSO-RD;
...,...,...,...
2190,20240212-20,34.26,DSO-RD;
2191,20240212-21,42.97,DSO-RD;
2192,20240212-22,38.65,DSO-RD;
2193,20240212-23,29.44,DSO-RD;


In [22]:
# Initialize an empty DataFrame to append all the weekly data
overall_weekly_df = pd.DataFrame()

# Iterate over all CSV files in the weekly_market folder
for file_name in glob.glob('data/weekly_market/*.csv'):
    # Read the current CSV file
    temp_weekly_df = pd.read_csv(file_name, header=None, names=['date', 'hour_number', 'demand', 'capacity'])
    # Remove the slash in the date and append the date to the hour number
    temp_weekly_df['date'] = temp_weekly_df['date'].str.replace('/', '')
    temp_weekly_df['hour_number'] = temp_weekly_df['date'] + '-' + temp_weekly_df['hour_number'].astype(str)
    # Drop the now redundant date column
    temp_weekly_df.drop(columns=['date'], inplace=True)
    # Append the current DataFrame to the overall weekly DataFrame
    overall_weekly_df = pd.concat([overall_weekly_df, temp_weekly_df], ignore_index=True)


In [24]:
overall_weekly_df

Unnamed: 0,hour_number,demand,capacity
0,20230409-1,12954,23325
1,20230409-2,12822,23388
2,20230409-3,12800,23366
3,20230409-4,12946,23397
4,20230409-5,13300,23467
...,...,...,...
156830,20230410-20,15815,24515
156831,20230410-21,15188,24587
156832,20230410-22,14066,24864
156833,20230410-23,13041,25173


In [25]:
# Merge overall_df and overall_weekly_df on 'hour_number' with an outer join to keep all records
combined_df = pd.merge(overall_df, overall_weekly_df, on='hour_number', how='outer', indicator=True)

# Filter rows that exist in either overall_df or overall_weekly_df but not in both
unmatched_df = combined_df[combined_df['_merge'] != 'both']

# Drop the '_merge' column as it is no longer needed
unmatched_df.drop(columns=['_merge'], inplace=True)

# Display the first few rows of the unmatched DataFrame
unmatched_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_df.drop(columns=['_merge'], inplace=True)


Unnamed: 0,hour_number,price,to_drop,demand,capacity
0,20190608-1,,,12057.0,26283.0
1,20190608-10,,,13095.0,25564.0
2,20190608-11,,,13275.0,25620.0
3,20190608-12,,,13359.0,25748.0
4,20190608-13,,,13346.0,25824.0


In [28]:
combined_df.iloc[1000]

hour_number    20190626-12
price                  NaN
to_drop                NaN
demand             17802.0
capacity           25653.0
_merge          right_only
Name: 1000, dtype: object