Monthly data: Natural Gas prices, consumer price energy, production in industry

In [None]:
import pandas as pd


def monthly_upsampled(file_path1, column_name1, name_file):
  # Load your CSV file
  file_path = file_path1  # Replace with your file path
  df = pd.read_csv(file_path, parse_dates=["date"], dayfirst=False)  # Uses mm/dd/yyyy format
  df.set_index("date", inplace=True)

  # print("Raw Data Head:")
  # print(df.head())


  # Detect low-frequency columns (e.g., monthly) manually or by checking NaNs
  low_freq_cols = [column_name1] # Replace with your columns
  daily_cols = [col for col in df.columns if col not in low_freq_cols]

  # Separate data
  daily_data = df[daily_cols].copy()
  monthly_data = df[low_freq_cols].dropna(how="all")  # Drops rows where all values are NaN

  monthly_upsampled = monthly_data.reindex(daily_data.index, method="ffill")  # Forward-fill

  monthly_upsampled = monthly_data.resample("D").interpolate("linear")  # Linear interpolation

  final_df = pd.concat([daily_data, monthly_upsampled], axis=1)

  # Optionally, backfill initial NaN values (e.g., if first month starts later)
  final_df[low_freq_cols] = final_df[low_freq_cols].bfill()

  print("\nFinal Dataset Head:")
  print(final_df.head())

  final_df.to_csv(name_file)






In [None]:
#function to intrapolate from monthly to weekly
import pandas as pd

def weekly_upsampled(file_path1, column_name1, name_file):
    # Load your CSV file
    df = pd.read_csv(file_path1, parse_dates=["date"])
    df.set_index("date", inplace=True)

        # Ensure output folder exists
    os.makedirs("features_daily_weekly", exist_ok=True)

    # Full output path
    output_path = os.path.join("features_daily_weekly", name_file)

    # Identify which column(s) are lower frequency (e.g., monthly)
    low_freq_cols = [column_name1]
    high_freq_cols = [col for col in df.columns if col not in low_freq_cols]

    # Separate datasets
    high_freq_data = df[high_freq_cols].copy()
    low_freq_data = df[low_freq_cols].dropna(how="all")

    # Resample everything to weekly frequency (Sunday end of week by default)
    high_freq_weekly = high_freq_data.resample("W").mean()
    low_freq_weekly = low_freq_data.resample("W").interpolate("linear")

    # Merge datasets
    final_df = pd.concat([high_freq_weekly, low_freq_weekly], axis=1)

    # Optionally backfill missing values at the start
    final_df[low_freq_cols] = final_df[low_freq_cols].bfill()

    print("\nFinal Weekly Dataset Head:")
    print(final_df.head())

    # Save to CSV
    final_df.to_csv(output_path)



In [None]:
#Intrapolating 5 year data from monthly to daily

#Natural Gas prices
monthly_upsampled("/content/Adv_ml_feature_engineering - natural gas prices.csv", "EU Natural Gas Prices", "natural_gas_daily.csv")

#consumer price energy index
monthly_upsampled("/content/Adv_ml_feature_engineering - consumer price index for energy.csv", "cpi", "cpi_daily.csv")

#production in industry
monthly_upsampled("/content/Adv_ml_feature_engineering - PRODUCTION IN INDUSTRY.csv", "euro-20","production_industry_daily.csv")

In [None]:

#Intrapolating 10 year data from monthly to daily

#Natural Gas prices

monthly_upsampled("/content/10_year_dataset - Copy - natural_gas_prices_10.csv", "EU Natural Gas Prices", "10_natural_gas_daily.csv")

#consumer price energy index

monthly_upsampled("/content/10_year_dataset - Copy - consumer_price_index_10.csv", " Price Index for Consumer: Energy(electricity, heat, fuel) ", "10_cpi_daily.csv")

#production in industry

monthly_upsampled("/content/10_year_dataset - Copy - production_in_industry_10.csv","production volume in industry","10_production_industry_daily.csv")


Final Dataset Head:
            EU Natural Gas Prices
date                             
2015-03-01               9.290000
2015-03-02               9.228710
2015-03-03               9.167419
2015-03-04               9.106129
2015-03-05               9.044839

Final Dataset Head:
            Price Index for Consumer: Energy(electricity, heat, fuel) 
date                                                                  
2015-03-01                                         100.490000         
2015-03-02                                         100.474839         
2015-03-03                                         100.459677         
2015-03-04                                         100.444516         
2015-03-05                                         100.429355         

Final Dataset Head:
            production volume in industry
date                                     
2015-01-01                      93.400000
2015-01-02                      93.474194
2015-01-03                      93

In [None]:
#Intrapolating 10 year data from monthly to weekly

#
weekly_upsampled("/content/10_year_dataset - Copy - natural_gas_prices_10.csv", "EU Natural Gas Prices", "10_natural_gas_weekly.csv")

#
weekly_upsampled("/content/10_year_dataset - Copy - consumer_price_index_10.csv", " Price Index for Consumer: Energy(electricity, heat, fuel) ", "10_cpi_weekly.csv")

  #
weekly_upsampled("/content/10_year_dataset - Copy - production_in_industry_10.csv","production volume in industry","10_production_industry_weekly.csv")




Final Weekly Dataset Head:
            EU Natural Gas Prices
date                             
2015-03-01               9.290000
2015-03-08               9.192286
2015-03-15               9.094571
2015-03-22               8.996857
2015-03-29               8.899143

Final Weekly Dataset Head:
            Price Index for Consumer: Energy(electricity, heat, fuel) 
date                                                                  
2015-03-01                                         100.490000         
2015-03-08                                         100.471143         
2015-03-15                                         100.452286         
2015-03-22                                         100.433429         
2015-03-29                                         100.414571         

Final Weekly Dataset Head:
            production volume in industry
date                                     
2015-01-04                           95.7
2015-01-11                           95.7
2015-01-18   

In [None]:
#Intrapolating 5 year data from monthly to weekly




In [None]:
import pandas as pd
import os

def daily_to_weekly(input_file, output_folder, filename, date_col="date", method="mean"):
    """
    Converts daily data to weekly data and saves it to a specified folder with a given filename.

    Parameters:
    - input_file: Path to the input CSV file with daily data.
    - output_folder: Folder where the output should be saved.
    - filename: Desired name for the output file (e.g. 'weekly_sentiment.csv').
    - date_col: Column containing the date values.
    - method: Aggregation method: 'mean', 'sum', or 'last'.
    """

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Full output path
    output_path = os.path.join(output_folder, filename)

    # Load data
    df = pd.read_csv(input_file, parse_dates=[date_col])
    df.set_index(date_col, inplace=True)

    # Resample
    if method == "mean":
        df_weekly = df.resample("W").mean()
    elif method == "sum":
        df_weekly = df.resample("W").sum()
    elif method == "last":
        df_weekly = df.resample("W").last()
    else:
        raise ValueError("Unsupported method. Choose from 'mean', 'sum', or 'last'.")

    # Save
    df_weekly.to_csv(output_path)
    print(f"Weekly data saved to: {output_path}")
    return df_weekly


In [None]:

daily_to_weekly("/content/10_year_dataset - Copy - coal_prices_10.csv","features_daily_weekly", "coal_daily_2_weekly.csv")

daily_to_weekly("/content/10_year_dataset - Copy - crude_oil_prices_10.csv","features_daily_weekly", "crude_daily_2_weekly.csv")

daily_to_weekly("/content/10_year_dataset - Copy - global_clean_index_10.csv","features_daily_weekly", "global_clean_daily_2_weekly.csv")

daily_to_weekly("/content/10_year_dataset - Copy - euro_stoxx_50_10.csv","features_daily_weekly", "euro_stoxx_daily_2_weekly.csv")

daily_to_weekly("/content/10_year_dataset - Copy - ecb_interest_rate_10.csv","features_daily_weekly", "ecb_interest_rate_daily_2_weekly.csv")

daily_to_weekly("/content/features/sentiment_matched.csv","features_daily_weekly", "sentiment_daily_2_weekly.csv")

Weekly data saved to: features_daily_weekly/coal_daily_2_weekly.csv
Weekly data saved to: features_daily_weekly/crude_daily_2_weekly.csv
Weekly data saved to: features_daily_weekly/global_clean_daily_2_weekly.csv
Weekly data saved to: features_daily_weekly/euro_stoxx_daily_2_weekly.csv
Weekly data saved to: features_daily_weekly/ecb_interest_rate_daily_2_weekly.csv
Weekly data saved to: features_daily_weekly/sentiment_daily_2_weekly.csv


Unnamed: 0_level_0,sentiment
date,Unnamed: 1_level_1
2015-01-04,0.000000
2015-01-11,0.105152
2015-01-18,-0.561826
2015-01-25,-0.194128
2015-02-01,0.174059
...,...
2025-03-09,-0.038762
2025-03-16,0.127470
2025-03-23,-0.365760
2025-03-30,0.280145


GETTING ALL INTO 1 DATABASE

In [None]:
import pandas as pd
import os

# Load datasets

def same_date (path_features, path_return):
  #df_labels = pd.read_csv("/content/Adv_ml_feature_engineering - Carbon Emissions Futures in EUR.csv")
  df_labels = pd.read_csv("/content/10_year_dataset - Copy - Carbon_Emissions_Futures_10.csv")
  df_features = pd.read_csv(path_features)

  # Convert first column to datetime
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  df_features.iloc[:, 0] = pd.to_datetime(df_features.iloc[:, 0])

  # Sort and set index
  df_labels = df_labels.sort_values(by=df_labels.columns[0]).set_index(df_labels.columns[0])
  df_features = df_features.sort_values(by=df_features.columns[0]).set_index(df_features.columns[0])

  # Step 1: Ensure features cover all label dates
  full_dates = df_labels.index

  # Step 2: Reindex features to match label dates, fill forward
  df_features_aligned = df_features.reindex(full_dates, method='ffill')

  # Step 3: Drop label rows with no corresponding features (after reindex fill still NaN)
  valid_rows = df_features_aligned.notnull().all(axis=1)
  df_labels_final = df_labels[valid_rows]
  df_features_final = df_features_aligned[valid_rows]

  # Step 4: Reset index
  df_labels_final = df_labels_final.reset_index().rename(columns={'index': 'Date'})
  df_features_final = df_features_final.reset_index().rename(columns={'index': 'Date'})

  # Save results
  output_folder = "features2/"
  os.makedirs(output_folder, exist_ok=True)
  output_path = os.path.join(output_folder, path_return)
  # Save results
  df_labels_final.to_csv("labels_aligned.csv", index=False)
  df_features_final.to_csv(output_path, index=False)

  print("✅ Final features and labels aligned perfectly by date.")








In [None]:
import pandas as pd

def merge_weekly_features_to_labels(feature_path, output_path):
    # Load datasets

    df_features = pd.read_csv(feature_path, parse_dates=["date"])
    df_labels = pd.read_csv("/content/10_year_dataset - Copy - Carbon_emissions_weekly_10.csv", parse_dates=["date"])

    os.makedirs("features_weekly", exist_ok=True)

    # Full output path
    output_path = os.path.join("features_weekly", output_path)

    # Step 1: Resample both to weekly frequency (ending Saturday to match label pattern)
    df_features_weekly = df_features.set_index("date").resample("W-SAT").mean().reset_index()
    df_labels_weekly = df_labels.set_index("date").resample("W-SAT").mean().reset_index()

    # Step 2: Left join — keep all labels' weekly dates
    df_merged = pd.merge(df_labels_weekly, df_features_weekly, on="date", how="left")
    df_merged = df_merged.drop(columns=["label"])
    print("✅ Merged dataset:")
    print(df_merged.head())
    # Step 3: Save to CSV
    df_merged.to_csv(output_path, index=False)
    print(f"Merged dataset saved to: {output_path}")

    return df_merged


In [None]:
#Getting same date for 5 year data, monthly intrapolated, daily frequency

#euro_stoxx
same_date ("/content/Adv_ml_feature_engineering - euro stoxx 50.csv","euro_stoxx_matched.csv")

#ecb interest rate
same_date ("/content/Adv_ml_feature_engineering - ECB iNTEREST RATE.csv","ecb_interest_rate_matched.csv")

#crude oil prices
same_date ("/content/Adv_ml_feature_engineering - CRUDE OIL PRICES.csv","crude_oil_prices_matched.csv")

#coal prices
same_date ("/content/Adv_ml_feature_engineering - COAL PRICES.csv","coal_prices_matched.csv")

#natural gas prices
same_date ("/content/Adv_ml_feature_engineering - natural gas prices.csv","natural_gas_prices_matched.csv")

#s&p 500
same_date ("/content/Adv_ml_feature_engineering - S&P 500.csv","s&p_500_matched.csv")

#consumer price
same_date ("/content/cpi_daily.csv","consumer_price_matched.csv")

#production in industry
same_date ("/content/production_industry_daily.csv","production_industry_matched.csv")


In [None]:
#10 years monthly, daily frequency

#euro_stoxx
same_date ("/content/10_year_dataset - Copy - euro_stoxx_50_10.csv","10md_euro_stoxx_matched.csv")

#ecb interest rate
same_date ("/content/10_year_dataset - Copy - ecb_interest_rate_10.csv","10md_ecb_interest_rate_matched.csv")

#crude oil prices
same_date ("/content/10_year_dataset - Copy - crude_oil_prices_10.csv","10md_crude_oil_prices_matched.csv")

#coal prices
same_date ("/content/10_year_dataset - Copy - coal_prices_10.csv","10md_coal_prices_matched.csv")

#natural gas prices
same_date ("/content/10_natural_gas_daily.csv","10md_natural_gas_prices_matched.csv")

# #s&p 500
# same_date ("/content/Adv_ml_feature_engineering - S&P 500.csv","s&p_500_matched.csv")

#consumer price
same_date ("/content/10_cpi_daily.csv","10mdconsumer_price_matched.csv")

#production in industry
same_date ("/content/10_production_industry_daily.csv","10mdproduction_industry_matched.csv")

#global clean energy index
same_date ("/content/10_year_dataset - Copy - global_clean_index_10.csv","10md_global_clean_energy_index_matched.csv")


  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])


✅ Final features and labels aligned perfectly by date.
✅ Final features and labels aligned perfectly by date.


  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])


✅ Final features and labels aligned perfectly by date.
✅ Final features and labels aligned perfectly by date.


  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])


✅ Final features and labels aligned perfectly by date.
✅ Final features and labels aligned perfectly by date.


  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])


✅ Final features and labels aligned perfectly by date.
✅ Final features and labels aligned perfectly by date.


  return Index(sequences[0], name=names)


In [None]:
# 10 years no monthly, daily frequency

#euro_stoxx
same_date ("/content/10_year_dataset - Copy - euro_stoxx_50_10.csv","10md_euro_stoxx_matched.csv")

#ecb interest rate
same_date ("/content/10_year_dataset - Copy - ecb_interest_rate_10.csv","10md_ecb_interest_rate_matched.csv")

#crude oil prices
same_date ("/content/10_year_dataset - Copy - crude_oil_prices_10.csv","10md_crude_oil_prices_matched.csv")

#coal prices
same_date ("/content/10_year_dataset - Copy - coal_prices_10.csv","10md_coal_prices_matched.csv")

#global clean energy index
same_date ("/content/10_year_dataset - Copy - global_clean_index_10.csv","10md_global_clean_energy_index_matched.csv")


  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])


✅ Final features and labels aligned perfectly by date.
✅ Final features and labels aligned perfectly by date.


  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])


✅ Final features and labels aligned perfectly by date.
✅ Final features and labels aligned perfectly by date.
✅ Final features and labels aligned perfectly by date.


  return Index(sequences[0], name=names)
  df_labels.iloc[:, 0] = pd.to_datetime(df_labels.iloc[:, 0])
  return Index(sequences[0], name=names)


In [None]:
import pandas as pd
import os



def merge_sentiment_with_label(sentiment_path, label_path, output_path):
    # Load datasets
    df_sentiment = pd.read_csv(sentiment_path)
    df_label = pd.read_csv(label_path)

    # Convert to datetime
    df_sentiment['date'] = pd.to_datetime(df_sentiment['date'])
    df_label['date'] = pd.to_datetime(df_label['date'])

    # Sort sentiment values so non-zeros come first
    df_sentiment_sorted = df_sentiment.sort_values(by='sentiment', ascending=False, key=lambda x: x != 0)

    # Keep only one row per day, prioritizing non-zero
    df_sentiment_deduped = df_sentiment_sorted.drop_duplicates(subset='date', keep='first')

    # Merge label with sentiment
    merged = pd.merge(df_label, df_sentiment_deduped, on='date', how='left')
    merged['sentiment'] = merged['sentiment'].fillna(0)
    merged=merged.drop(columns=["label"])

    # Save result
    merged.to_csv(output_path, index=False)

    return merged




merge_sentiment_with_label("/content/10_year_dataset - Copy - sentiment_10.csv","/content/10_year_dataset - Copy - Carbon_Emissions_Futures_10.csv","sentiment_matched.csv")


  df_label['date'] = pd.to_datetime(df_label['date'])


Unnamed: 0,date,sentiment
0,2015-01-01,0.000000
1,2015-01-04,0.000000
2,2015-01-05,0.525762
3,2015-01-06,0.000000
4,2015-01-07,0.000000
...,...,...
2633,2025-03-30,0.000000
2634,2025-03-31,-0.956960
2635,2025-04-01,0.742659
2636,2025-04-02,0.770304


In [None]:
# 10 years monthly intrapolated to weekly, weekly frequency for all data


merge_weekly_features_to_labels("/content/features_daily_weekly/10_cpi_weekly.csv", "10cpi_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/10_natural_gas_weekly.csv", "10natural_gas_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/10_production_industry_weekly.csv", "10production_industry_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/euro_stoxx_daily_2_weekly.csv", "10euro_stoxx_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/ecb_interest_rate_daily_2_weekly.csv", "10ecb_interest_rate_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/crude_daily_2_weekly.csv", "10crude_oil_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/coal_daily_2_weekly.csv", "10coal_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/global_clean_daily_2_weekly.csv", "10global_clean_weekly_matched.csv")

merge_weekly_features_to_labels("/content/features_daily_weekly/sentiment_daily_2_weekly.csv", "10sentiment_weekly_matched.csv")



✅ Merged dataset:
        date   Price Index for Consumer: Energy(electricity, heat, fuel) 
0 2015-01-03                                                NaN          
1 2015-01-10                                                NaN          
2 2015-01-17                                                NaN          
3 2015-01-24                                                NaN          
4 2015-01-31                                                NaN          
Merged dataset saved to: features_weekly/10cpi_weekly_matched.csv
✅ Merged dataset:
        date  EU Natural Gas Prices
0 2015-01-03                    NaN
1 2015-01-10                    NaN
2 2015-01-17                    NaN
3 2015-01-24                    NaN
4 2015-01-31                    NaN
Merged dataset saved to: features_weekly/10natural_gas_weekly_matched.csv
✅ Merged dataset:
        date  production volume in industry
0 2015-01-03                            NaN
1 2015-01-10                           95.7
2 2015-01-17  

Unnamed: 0,date,sentiment
0,2015-01-03,
1,2015-01-10,0.000000
2,2015-01-17,0.105152
3,2015-01-24,-0.561826
4,2015-01-31,-0.194128
...,...,...
532,2025-03-15,-0.038762
533,2025-03-22,0.127470
534,2025-03-29,-0.365760
535,2025-04-05,0.280145


In [None]:

#10md_sentiment

import pandas as pd
import glob
import os

# === Path to your feature CSV files ===
feature_folder = "features/"
feature_files = glob.glob(os.path.join(feature_folder, "*.csv"))

# === Load label dataset ===
df_labels = pd.read_csv("labels_aligned.csv")
df_labels['date'] = pd.to_datetime(df_labels['date'])

# === Start with labels so we can merge into this
df_merged = df_labels.copy()

# === Merge each feature file
for filepath in feature_files:
    df_feature = pd.read_csv(filepath)
    df_feature['date'] = pd.to_datetime(df_feature['date'])

    # Only keep the feature column (excluding Date)
    feature_name = [col for col in df_feature.columns if col != 'date'][0]

    # Merge on Date
    df_merged = df_merged.merge(df_feature, on='date', how='left')

    print(f"✅ Merged: {feature_name}")

# === Optional: Reorder columns
columns = ['date'] + [col for col in df_merged.columns if col != 'date' and col != 'label'] + ['label']
df_merged = df_merged[columns]

# === Save final dataset
df_merged.to_csv("10md_sentiment.csv", index=False)
print("✅ Final merged dataset saved to '10d_sentiment.csv'")

df_merged


✅ Merged: Crude Oil Prices: Brent - Europe
✅ Merged: sentiment
✅ Merged: Closing Coal Prices
✅ Merged: global_clean_index
✅ Merged: production volume in industry
✅ Merged: EU Natural Gas Prices
✅ Merged:  Price Index for Consumer: Energy(electricity, heat, fuel) 
✅ Merged: euro_stoxx
✅ Merged: ecb_interest_rate
✅ Final merged dataset saved to '10d_sentiment.csv'


Unnamed: 0,date,Crude Oil Prices: Brent - Europe,sentiment,Closing Coal Prices,global_clean_index,production volume in industry,EU Natural Gas Prices,"Price Index for Consumer: Energy(electricity, heat, fuel)",euro_stoxx,ecb_interest_rate,label
0,2015-01-04,,0.000000,,626.91,93.622581,,,3139.44,0.05,6.90
1,2015-01-05,,0.525762,,615.40,93.696774,,,3023.14,0.05,6.77
2,2015-01-06,,0.000000,,614.41,93.770968,,,3007.91,0.05,6.78
3,2015-01-07,,0.000000,,614.58,93.845161,,,3026.79,0.05,6.81
4,2015-01-08,,0.000000,,627.30,93.919355,,,3135.08,0.05,6.71
...,...,...,...,...,...,...,...,...,...,...,...
2632,2025-03-30,74.69,0.000000,101.30,736.58,97.900000,13.128,166.52,5331.40,2.65,67.13
2633,2025-03-31,77.23,-0.956960,104.20,725.99,97.900000,13.128,166.52,5248.39,2.65,68.98
2634,2025-04-01,77.78,0.742659,105.75,732.81,97.900000,13.128,166.52,5320.30,2.65,67.69
2635,2025-04-02,77.27,0.770304,103.40,736.74,97.900000,13.128,166.52,5303.95,2.65,65.25


In [None]:
#10d_sentiment


import pandas as pd
import glob
import os

# === Path to your feature CSV files ===
feature_folder = "features2/"
feature_files = glob.glob(os.path.join(feature_folder, "*.csv"))

# === Load label dataset ===
df_labels = pd.read_csv("labels_aligned.csv")
df_labels['date'] = pd.to_datetime(df_labels['date'])

# === Start with labels so we can merge into this
df_merged = df_labels.copy()

# === Merge each feature file
for filepath in feature_files:
    df_feature = pd.read_csv(filepath)
    df_feature['date'] = pd.to_datetime(df_feature['date'])

    # Only keep the feature column (excluding Date)
    feature_name = [col for col in df_feature.columns if col != 'date'][0]

    # Merge on Date
    df_merged = df_merged.merge(df_feature, on='date', how='left')

    print(f"✅ Merged: {feature_name}")

# === Optional: Reorder columns
columns = ['date'] + [col for col in df_merged.columns if col != 'date' and col != 'label'] + ['label']
df_merged = df_merged[columns]

# === Save final dataset
df_merged.to_csv("10d_sentiment.csv", index=False)
print("✅ Final merged dataset saved to '10d_sentiment.csv'")

df_merged


✅ Merged: Crude Oil Prices: Brent - Europe
✅ Merged: sentiment
✅ Merged: Closing Coal Prices
✅ Merged: global_clean_index
✅ Merged: euro_stoxx
✅ Merged: ecb_interest_rate
✅ Final merged dataset saved to '10d_sentiment.csv'


Unnamed: 0,date,Crude Oil Prices: Brent - Europe,sentiment,Closing Coal Prices,global_clean_index,euro_stoxx,ecb_interest_rate,label
0,2015-01-04,,0.000000,,626.91,3139.44,0.05,6.90
1,2015-01-05,,0.525762,,615.40,3023.14,0.05,6.77
2,2015-01-06,,0.000000,,614.41,3007.91,0.05,6.78
3,2015-01-07,,0.000000,,614.58,3026.79,0.05,6.81
4,2015-01-08,,0.000000,,627.30,3135.08,0.05,6.71
...,...,...,...,...,...,...,...,...
2632,2025-03-30,74.69,0.000000,101.30,736.58,5331.40,2.65,67.13
2633,2025-03-31,77.23,-0.956960,104.20,725.99,5248.39,2.65,68.98
2634,2025-04-01,77.78,0.742659,105.75,732.81,5320.30,2.65,67.69
2635,2025-04-02,77.27,0.770304,103.40,736.74,5303.95,2.65,65.25


In [None]:

#10wm_sentiment

import pandas as pd
import glob
import os

# === Path to your feature CSV files ===
feature_folder = "/content/features_weekly"
feature_files = glob.glob(os.path.join(feature_folder, "*.csv"))

# === Load label dataset ===
df_labels = pd.read_csv("/content/10_year_dataset - Copy - Carbon_emissions_weekly_10.csv")
df_labels['date'] = pd.to_datetime(df_labels['date'])

# === Start with labels so we can merge into this
df_merged = df_labels.copy()

# === Merge each feature file
for filepath in feature_files:
    df_feature = pd.read_csv(filepath)
    df_feature['date'] = pd.to_datetime(df_feature['date'])

    # Only keep the feature column (excluding Date)
    feature_name = [col for col in df_feature.columns if col != 'date'][0]

    # Merge on Date
    df_merged = df_merged.merge(df_feature, on='date', how='left')

    print(f"✅ Merged: {feature_name}")

# === Optional: Reorder columns
columns = ['date'] + [col for col in df_merged.columns if col != 'date' and col != 'label'] + ['label']
df_merged = df_merged[columns]

# === Save final dataset
df_merged.to_csv("10wm_sentiment.csv", index=False)
print("✅ Final merged dataset saved to '10d_sentiment.csv'")

df_merged


✅ Merged: euro_stoxx
✅ Merged: Crude Oil Prices: Brent - Europe
✅ Merged:  Price Index for Consumer: Energy(electricity, heat, fuel) 
✅ Merged: ecb_interest_rate
✅ Merged: global_clean_index
✅ Merged: production volume in industry
✅ Merged: sentiment
✅ Merged: Closing Coal Prices
✅ Merged: EU Natural Gas Prices
✅ Final merged dataset saved to '10d_sentiment.csv'


Unnamed: 0,date,euro_stoxx,Crude Oil Prices: Brent - Europe,"Price Index for Consumer: Energy(electricity, heat, fuel)",ecb_interest_rate,global_clean_index,production volume in industry,sentiment,Closing Coal Prices,EU Natural Gas Prices,label
0,2015-01-03,,,,,,,,,,6.71
1,2015-01-10,3139.440,,,0.050000,626.910,95.7,0.000000,,,7.13
2,2015-01-17,3047.164,,,0.050000,619.630,95.7,0.105152,,,6.80
3,2015-01-24,3133.462,,,0.050000,619.214,95.7,-0.561826,,,7.08
4,2015-01-31,3288.150,,,0.050000,618.834,95.7,-0.194128,,,6.95
...,...,...,...,...,...,...,...,...,...,...,...
532,2025-03-15,5481.200,71.930,,2.900000,715.972,,-0.038762,96.53,,70.51
533,2025-03-22,5357.774,71.542,,2.721429,734.628,,0.127470,98.58,,67.91
534,2025-03-29,5462.536,72.210,,2.650000,752.028,,-0.365760,97.47,,63.08
535,2025-04-05,5403.008,74.350,,2.650000,738.954,,0.280145,98.23,,64.08


In [None]:

#10w_sentiment

import pandas as pd
import glob
import os

# === Path to your feature CSV files ===
feature_folder = "/content/features_weekly_nomonth"
feature_files = glob.glob(os.path.join(feature_folder, "*.csv"))

# === Load label dataset ===
df_labels = pd.read_csv("/content/10_year_dataset - Copy - Carbon_emissions_weekly_10.csv")
df_labels['date'] = pd.to_datetime(df_labels['date'])

# === Start with labels so we can merge into this
df_merged = df_labels.copy()

# === Merge each feature file
for filepath in feature_files:
    df_feature = pd.read_csv(filepath)
    df_feature['date'] = pd.to_datetime(df_feature['date'])

    # Only keep the feature column (excluding Date)
    feature_name = [col for col in df_feature.columns if col != 'date'][0]

    # Merge on Date
    df_merged = df_merged.merge(df_feature, on='date', how='left')

    print(f"✅ Merged: {feature_name}")

# === Optional: Reorder columns
columns = ['date'] + [col for col in df_merged.columns if col != 'date' and col != 'label'] + ['label']
df_merged = df_merged[columns]

# === Save final dataset
df_merged.to_csv("10w_sentiment.csv", index=False)
print("✅ Final merged dataset saved to '10d_sentiment.csv'")

df_merged


✅ Merged: euro_stoxx
✅ Merged: Crude Oil Prices: Brent - Europe
✅ Merged: ecb_interest_rate
✅ Merged: global_clean_index
✅ Merged: sentiment
✅ Merged: Closing Coal Prices
✅ Final merged dataset saved to '10d_sentiment.csv'


Unnamed: 0,date,euro_stoxx,Crude Oil Prices: Brent - Europe,ecb_interest_rate,global_clean_index,sentiment,Closing Coal Prices,label
0,2015-01-03,,,,,,,6.71
1,2015-01-10,3139.440,,0.050000,626.910,0.000000,,7.13
2,2015-01-17,3047.164,,0.050000,619.630,0.105152,,6.80
3,2015-01-24,3133.462,,0.050000,619.214,-0.561826,,7.08
4,2015-01-31,3288.150,,0.050000,618.834,-0.194128,,6.95
...,...,...,...,...,...,...,...,...
532,2025-03-15,5481.200,71.930,2.900000,715.972,-0.038762,96.53,70.51
533,2025-03-22,5357.774,71.542,2.721429,734.628,0.127470,98.58,67.91
534,2025-03-29,5462.536,72.210,2.650000,752.028,-0.365760,97.47,63.08
535,2025-04-05,5403.008,74.350,2.650000,738.954,0.280145,98.23,64.08


In [None]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# import os

# # === Load your merged dataset ===
# df = pd.read_csv("/content/final_dataset.csv")

# # === Optional: Drop Date column for numeric-only analysis
# df_numeric = df.drop(columns=['date'])

# df_numeric= df_numeric.drop(columns=["Deposit facility - date of changes (raw data) - Level (FM.D.U2.EUR.4F.KR.DFR.LEV)", "Marginal lending facility - date of changes (raw data) - Level (FM.D.U2.EUR.4F.KR.MLFR.LEV)", "High", "Open", "Low"])

# # === 1. Correlation Heatmap ===
# plt.figure(figsize=(12, 8))
# corr = df_numeric.corr()
# sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0)
# plt.title("🔍 Feature Correlation Heatmap")
# plt.tight_layout()
# #plt.savefig("processed_data/feature_correlation_heatmap.png")
# plt.show()

# # === 2. Pairplot (Optional: only for small number of features) ===
# # This helps visualize distributions and relationships
# if df_numeric.shape[1] <= 7:  # Too many features can clutter this
#     sns.pairplot(df_numeric, hue='label', corner=True)
#     plt.suptitle("🧠 Feature Pairwise Relationships", y=1.02)
#     #plt.savefig("processed_data/feature_pairplot.png")
#     plt.show()

# # === 3. Target vs Individual Feature (Top 5 correlations with Target) ===
# target_corr = corr['label'].drop('label').abs().sort_values(ascending=False)
# top_features = target_corr.head(5).index.tolist()

# for feature in top_features:
#     plt.figure(figsize=(8, 4))
#     sns.boxplot(x='label', y=feature, data=df)
#     plt.title(f"🎯 Relationship: {feature} vs label")
#     plt.tight_layout()
#     #plt.savefig(f"processed_data/{feature}_vs_label.png")
#     plt.show()

# print("✅ All feature relationship plots saved in 'processed_data/'")


In [None]:
# corr = df_numeric.corr()
# target_corr = corr['label'].drop('label').abs().sort_values(ascending=False)
# top_features = target_corr.head(5).index.tolist()  # Top 5 features

# for feature in top_features:
#     plt.figure(figsize=(8, 5))
#     sns.scatterplot(data=df, x=feature, y='label', alpha=0.6, edgecolor='k')
#     plt.title(f"📈 Scatterplot: {feature} vs label")
#     plt.tight_layout()
#     #plt.savefig(f"processed_data/plots/{feature}_vs_target_scatter.png")
#     plt.show()
