In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from glob import glob

# Explainability

## Retrieve relevant data points for a given date range and energy type
### Fossil Fuel

In [10]:
def get_data_points(df: pd.DataFrame, energy_type: str, year_start: int, year_end: int, pred: int) -> pd.DataFrame:
    """
    Return the rows of ``df`` for one energy type, year range and prediction value.

    Args:
        df (pd.DataFrame): The input DataFrame. Must contain the columns
            'energy_type', 'year' and 'prediction'.
        energy_type (str): Value that the 'energy_type' column must equal.
        year_start (int): First year to include (inclusive).
        year_end (int): Last year to include (inclusive).
        pred (int): Value that the 'prediction' column must equal.

    Returns:
        pd.DataFrame: A copy of the matching rows, with the original index
        labels and column order preserved. Empty if nothing matches.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    mask = (
        (df['energy_type'] == energy_type)
        # Series.between is inclusive on both ends, i.e. year_start <= year <= year_end.
        & df['year'].between(year_start, year_end)
        & (df['prediction'] == pred)
    )
    # Return a copy so callers can add or modify columns on the result without
    # touching ``df`` or triggering pandas' SettingWithCopyWarning.
    return df[mask].copy()


In [2]:
# Folder containing the per-decade fossil-fuel data CSVs read by the loading cell.
# The trailing slash lets a file name be appended directly to this string.
data_folder_path = '../../data/fossil_fuel/all_data/9/'
# Folder containing the prediction CSVs that are read alongside the data CSVs.
# The trailing slash lets a file name be appended directly to this string.
predict_folder_path = '../../output/fossil_fuel/predict_all/2class/'

In [13]:
# Data CSVs, one per decade and fuel. Each entry is paired by position with the
# entry at the same index in `predict_files`.
data_files = ['1960s_coal_labeled_full_0.9.csv', '1960s_gas_labeled_full_0.9.csv', '1960s_oil_labeled_full_0.9.csv',
              '1970s_coal_labeled_full_0.9.csv', '1970s_gas_labeled_full_0.9.csv', '1970s_oil_labeled_full_0.9.csv',
              '1980s_coal_labeled_full_0.9.csv', '1980s_gas_labeled_full_0.9.csv', '1980s_oil_labeled_full_0.9.csv',
              '1990s_coal_labeled_full_0.9.csv', '1990s_gas_labeled_full_0.9.csv', '1990s_oil_labeled_full_0.9.csv']

# Prediction CSVs, in the same order as `data_files`.
predict_files = ['unlabeled_predictions_1960_coal_09.csv','unlabeled_predictions_1960_gas_09.csv','unlabeled_predictions_1960_oil_09.csv',
                'unlabeled_predictions_1970_coal_09.csv','unlabeled_predictions_1970_gas_09.csv','unlabeled_predictions_1970_oil_09.csv',
                'unlabeled_predictions_1980_coal_09.csv','unlabeled_predictions_1980_gas_09.csv','unlabeled_predictions_1980_oil_09.csv',
                'unlabeled_predictions_1990_coal_09.csv','unlabeled_predictions_1990_gas_09.csv','unlabeled_predictions_1990_oil_09.csv']

# zip() stops at the shorter list, so an unequal pair of lists would silently
# drop files; fail loudly instead.
if len(data_files) != len(predict_files):
    raise ValueError(
        f"data_files has {len(data_files)} entries but predict_files has {len(predict_files)}"
    )

all_dfs = []

for data_fp, predict_fp in zip(data_files, predict_files):
    # Derive the energy type from the prediction file name, e.g.
    # 'unlabeled_predictions_1960_coal_09' -> ['unlabeled_predictions_1960', 'coal', '09'].
    # This happens before any CSV is read so a malformed name is skipped cheaply.
    filename = os.path.basename(predict_fp).replace('.csv', '')
    parts = filename.rsplit('_', 2)
    if len(parts) < 3:
        continue
    energy_type = parts[1]

    df_data = pd.read_csv(os.path.join(data_folder_path, data_fp))
    df_predict = pd.read_csv(os.path.join(predict_folder_path, predict_fp))

    # Rows are matched purely by position below. If the two files differ in
    # length, predictions would be attached to the wrong rows or padded with NaN.
    if len(df_data) != len(df_predict):
        raise ValueError(
            f"Row count mismatch: {data_fp} has {len(df_data)} rows, "
            f"{predict_fp} has {len(df_predict)} rows"
        )

    # Built as a single chain so every step returns a fresh frame; assigning
    # columns on the result of dropna() could otherwise raise SettingWithCopyWarning.
    df = (
        pd.concat(
            [
                df_data[['date', 'p', 'newspaper_publisher']],
                df_predict[['prediction', 'confidence']],
            ],
            axis=1,
        )
        .rename(columns={'p': 'text', 'newspaper_publisher': 'publisher_title'})
        # Unparseable dates become NaT and are dropped on the next line.
        .assign(date=lambda d: pd.to_datetime(d['date'], errors='coerce'))
        .dropna(subset=['date'])
        .assign(year=lambda d: d['date'].dt.year, energy_type=energy_type)
    )

    all_dfs.append(df)

df_fossil_fuel = pd.concat(all_dfs, ignore_index=True)
# Disabled: optional relabelling of prediction 0 as -1.
#df_fossil_fuel['prediction'] = df_fossil_fuel['prediction'].replace(0, -1)


In [14]:
# Inspect the combined fossil-fuel frame (the notebook renders a truncated head/tail view).
df_fossil_fuel

Unnamed: 0,date,text,publisher_title,prediction,confidence,year,energy_type
0,1963-02-23,Een van da felste uitdagingen uo het verenigd ...,N.V. Drukkerĳ De Tĳd,0,0.705488,1963,coal
1,1963-02-23,V oor oio mtfnea sou eea gecoördineerd beleid ...,N.V. Drukkerĳ De Tĳd,0,0.758977,1963,coal
2,1963-02-23,Noodlijdend zijn onze mijnen nog niet . Het gr...,N.V. Drukkerĳ De Tĳd,0,0.750726,1963,coal
3,1963-02-23,"D 1 , grot - aardgasvondstea in om , Und schep...",N.V. Drukkerĳ De Tĳd,1,0.768789,1963,coal
4,1963-09-16,"DEN HAAG , 16 sept. Als er geen abnormale ding...",N.V. Drukkerĳ De Tĳd,0,0.705253,1963,coal
...,...,...,...,...,...,...,...
796104,1992-08-29,Veertig jaar lief en leed . Publiciteit rond h...,Nieuwenhuis,0,0.676541,1992,oil
796105,1991-01-15,Inmiddels is Jacobs al twintig jaar werkzaam i...,Amigoe,0,0.879638,1991,oil
796106,1991-01-15,"aldus Juan Jacobs , voorzitter van de AFBW . ,...",Amigoe,0,0.556419,1991,oil
796107,1991-01-15,"KRALENDIJK De kritiek , die afgelopen week doo...",Amigoe,0,0.731135,1991,oil


### Oil

In [17]:
# Oil, 1960-1966: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/oil_60_66_0.csv'),
    (1, '../../data/explain/oil_60_66_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'oil', 1960, 1966, pred_class)
    results.to_csv(file_path, index=False)

In [18]:
# Oil, 1972-1975: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/oil_72_75_0.csv'),
    (1, '../../data/explain/oil_72_75_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'oil', 1972, 1975, pred_class)
    results.to_csv(file_path, index=False)

In [19]:
# Oil, 1985-1990: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/oil_85_90_0.csv'),
    (1, '../../data/explain/oil_85_90_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'oil', 1985, 1990, pred_class)
    results.to_csv(file_path, index=False)

### Gas

In [20]:
# Gas, 1960-1966: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/gas_60_66_0.csv'),
    (1, '../../data/explain/gas_60_66_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'gas', 1960, 1966, pred_class)
    results.to_csv(file_path, index=False)

In [21]:
# Gas, 1972-1975: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/gas_72_75_0.csv'),
    (1, '../../data/explain/gas_72_75_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'gas', 1972, 1975, pred_class)
    results.to_csv(file_path, index=False)

In [22]:
# Gas, 1985-1990: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/gas_85_90_0.csv'),
    (1, '../../data/explain/gas_85_90_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'gas', 1985, 1990, pred_class)
    results.to_csv(file_path, index=False)

### Coal

In [23]:
# Coal, 1970-1975: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/coal_70_75_0.csv'),
    (1, '../../data/explain/coal_70_75_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'coal', 1970, 1975, pred_class)
    results.to_csv(file_path, index=False)

In [24]:
# Coal, 1977-1983: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/coal_77_83_0.csv'),
    (1, '../../data/explain/coal_77_83_1.csv'),
):
    results = get_data_points(df_fossil_fuel, 'coal', 1977, 1983, pred_class)
    results.to_csv(file_path, index=False)

### Nuclear energy

In [25]:
# Article data and the model predictions for it; the two files are paired row by row.
fp_data = '../../data/nuclear/all_data/combined_files.csv'
fp_predict = '../../output/nuclear/predict_all/unlabeled_predictions_checkpoint_epochs_10_batch_size_32_lr_8.177834244164004e-05_freeze_false_weight_decay_0.01_patience_3_hidden_dropout_0.0_attention_dropout_0.3_class2.csv'

df_all = pd.read_csv(fp_data)
df_predict = pd.read_csv(fp_predict)

# Rows are matched purely by position below. If the two files differ in length,
# predictions would be attached to the wrong rows or padded with NaN.
if len(df_all) != len(df_predict):
    raise ValueError(
        f"Row count mismatch: {fp_data} has {len(df_all)} rows, "
        f"{fp_predict} has {len(df_predict)} rows"
    )

merged_df = pd.concat([df_all, df_predict], axis=1)
columns = ['date', 'body', 'publisher_title', 'prediction', 'confidence']

# Built as a single chain so every step returns a fresh frame; assigning columns
# on a filtered slice could otherwise raise SettingWithCopyWarning.
df_nuclear = (
    merged_df[columns]
    .rename(columns={'body': 'text'})
    # Rows whose prediction equals 2 are excluded.
    .loc[lambda d: d['prediction'] != 2]
    # Unparseable dates become NaT and are dropped on the next line.
    .assign(date=lambda d: pd.to_datetime(d['date'], errors='coerce'))
    .dropna(subset=['date'])
    .assign(year=lambda d: d['date'].dt.year, energy_type='nuclear')
)
# Disabled: optional relabelling of prediction 0 as -1.
#df_nuclear['prediction'] = df_nuclear['prediction'].replace(0, -1)

In [33]:
# Nuclear, 1965-1971: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/nuclear_65_71_0.csv'),
    (1, '../../data/explain/nuclear_65_71_1.csv'),
):
    results = get_data_points(df_nuclear, 'nuclear', 1965, 1971, pred_class)
    results.to_csv(file_path, index=False)

In [34]:
# Nuclear, 1972-1975: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/nuclear_72_75_0.csv'),
    (1, '../../data/explain/nuclear_72_75_1.csv'),
):
    results = get_data_points(df_nuclear, 'nuclear', 1972, 1975, pred_class)
    results.to_csv(file_path, index=False)

In [35]:
# Nuclear, 1981-1987: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/nuclear_81_87_0.csv'),
    (1, '../../data/explain/nuclear_81_87_1.csv'),
):
    results = get_data_points(df_nuclear, 'nuclear', 1981, 1987, pred_class)
    results.to_csv(file_path, index=False)

In [36]:
# Nuclear, 1988-1992: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/nuclear_88_92_0.csv'),
    (1, '../../data/explain/nuclear_88_92_1.csv'),
):
    results = get_data_points(df_nuclear, 'nuclear', 1988, 1992, pred_class)
    results.to_csv(file_path, index=False)

### Renewable energy

In [37]:
# Article data and the model predictions for it; the two files are paired row by row.
fp_data = '../../data/renewable/all_data/combined_files.csv'
fp_predict = '../../output/renewable/predict_all/unlabeled_predictions_checkpoint_epochs_10_batch_size_16_lr_2.2226468679191218e-05_freeze_true_weight_decay_0.001_patience_2_hidden_dropout_0.0_attention_dropout_0.4_robbert_class2.csv'
df_all = pd.read_csv(fp_data)
df_predict = pd.read_csv(fp_predict)

# Rows are matched purely by position below. If the two files differ in length,
# predictions would be attached to the wrong rows or padded with NaN.
if len(df_all) != len(df_predict):
    raise ValueError(
        f"Row count mismatch: {fp_data} has {len(df_all)} rows, "
        f"{fp_predict} has {len(df_predict)} rows"
    )

merged_df = pd.concat([df_all, df_predict], axis=1)
columns = ['date', 'body', 'publisher_title', 'prediction', 'confidence']

# Built as a single chain so every step returns a fresh frame; assigning columns
# on a filtered slice could otherwise raise SettingWithCopyWarning.
df_renewable = (
    merged_df[columns]
    .rename(columns={'body': 'text'})
    # Rows whose prediction equals 2 are excluded.
    .loc[lambda d: d['prediction'] != 2]
    # Unparseable dates become NaT and are dropped on the next line.
    .assign(date=lambda d: pd.to_datetime(d['date'], errors='coerce'))
    .dropna(subset=['date'])
    .assign(year=lambda d: d['date'].dt.year, energy_type='renewable')
)
# Disabled: optional relabelling of prediction 0 as -1.
# df_renewable['prediction'] = df_renewable['prediction'].replace(0, -1)

In [38]:
# Renewable, 1972-1973: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/renewable_72_73_0.csv'),
    (1, '../../data/explain/renewable_72_73_1.csv'),
):
    results = get_data_points(df_renewable, 'renewable', 1972, 1973, pred_class)
    results.to_csv(file_path, index=False)

In [39]:
# Renewable, 1979-1984: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/renewable_79_84_0.csv'),
    (1, '../../data/explain/renewable_79_84_1.csv'),
):
    results = get_data_points(df_renewable, 'renewable', 1979, 1984, pred_class)
    results.to_csv(file_path, index=False)

In [40]:
# Renewable, 1988-1992: export one CSV per prediction value (0 and 1).
for pred_class, file_path in (
    (0, '../../data/explain/renewable_88_92_0.csv'),
    (1, '../../data/explain/renewable_88_92_1.csv'),
):
    results = get_data_points(df_renewable, 'renewable', 1988, 1992, pred_class)
    results.to_csv(file_path, index=False)