# 0. Packages & Functions

In [90]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
import pyarrow.parquet as pq
import pyarrow as pyarrow
import numpy as np
from itertools import combinations
import time
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [91]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    # remove blank spaces
    text = ' '.join(text.split())
    # remove newline characters
    text = text.replace('\n', '')
    return text

In [92]:
def count_lexicon_words(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    frequency = []
    present = []
    rfrequency = []
    rpresent = []

    for text in text_df["Text"]:
        lexicon_counts = 0
        present_count = 0
        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
            if(text.lower().count(word.lower()) > 0):
                present_count += 1
        
        word_list = text.split() 
        word_count = len(word_list)

        frequency.append(lexicon_counts)
        present.append(present_count)
        rfrequency.append((lexicon_counts/word_count)*100)
        rpresent.append((present_count/word_count)*100)
        
        
    text_df["Absolute Frequency"] = frequency
    text_df["Absolute Present"] = present
    text_df["Relative Frequency"] = rfrequency
    text_df["Relative Present"] = rpresent
    
    return(text_df)

def get_metrics(df, colname, threshold):
    target = []
    values = df[colname]
    
    for v in values:
        if v >= threshold:
            target.append("Yes")
        else:
            target.append("No")
    
    df["Estimate"] = target
    
    cross_table = pd.crosstab(df['Target'], df['Estimate'], margins=True)
    accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
    precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
    recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    return([accuracy, precision, recall, f1_score])

def get_gross_table_data(df, colname, threshold, binary):
    target = []
    values = df[colname]
    
    for v in values:
        if v >= threshold:
            target.append("Yes")
        else:
            target.append("No")
    
    df["Estimate"] = target
    
    if(binary):
        cross_table = pd.crosstab(df['Target'], df['Estimate'], margins=True)
    else:
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Estimate'], margins=True)
    return(cross_table)
    
def find_optimal_threshold(df, lexicon, lexicon_name):
    df = count_lexicon_words(df, lexicon)
    
    #absolute frequency
    af_accuracy = 0
    af_th = 0
    found = False
    while(found == False):
        metrics = get_metrics(df, "Absolute Frequency", af_th)
        if metrics[0] > af_accuracy:
            af_accuracy = metrics[0]
            af_precision = metrics[1]
            af_recall = metrics[2]
            af_f1 = metrics[3]
            af_th += 1
        else:
            found = True
            
    #absolute present
    ap_accuracy = 0
    ap_th = 0
    found = False
    while(found == False):
        metrics = get_metrics(df, "Absolute Present", ap_th)
        if metrics[0] > ap_accuracy:
            ap_accuracy = metrics[0]
            ap_precision = metrics[1]
            ap_recall = metrics[2]
            ap_f1 = metrics[3]
            ap_th += 1
        else:
            found = True
            
    #relative frequency
    rf_accuracy = 0
    rf_th = 0
    found = False
    while(found == False):
        metrics = get_metrics(df, "Relative Frequency", rf_th)
        if metrics[0] > rf_accuracy:
            rf_accuracy = metrics[0]
            rf_precision = metrics[1]
            rf_recall = metrics[2]
            rf_f1 = metrics[3]
            rf_th += 0.1
        else:
            found = True
            
    #relative present
    rp_accuracy = 0
    rp_th = 0
    found = False
    while(found == False):
        metrics = get_metrics(df, "Relative Present", rp_th)
        if metrics[0] > rp_accuracy:
            rp_accuracy = metrics[0]
            rp_precision = metrics[1]
            rp_recall = metrics[2]
            rp_f1 = metrics[3]
            rp_th += 0.1
        else:
            found = True
    
    return(pd.DataFrame({"Lexicon" : [lexicon_name] * 4, 
                         "Technique" : ["Absolute Frequency", "Absolute Present", "Relative Frequency", "Relative Present"],
                         "Threshold" : [af_th - 1, ap_th - 1, rf_th - 0.1, rp_th - 0.1], 
                 "Accuracy" : [af_accuracy, ap_accuracy, rf_accuracy, rp_accuracy], 
                        "Precision" : [af_precision, ap_precision, rf_precision, rp_precision], 
                        "Recall" : [af_recall, ap_recall, rf_recall, rp_recall], 
                        "F1 Score" : [af_f1, ap_f1, rf_f1, rp_f1]}))
    
def test_lexicon(test_df, results_df, lexicon, lexicon_name):
    techniques = ["Absolute Frequency", "Absolute Present", "Relative Frequency", "Relative Present"]
    df = count_lexicon_words(test_df, lexicon)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    th_df = []
    for t in range(len(techniques)):
        th = results_df[results_df["Technique"] == techniques[t]]["Threshold"].iloc[0]
        th_df.append(th)
        accuracy.append(get_metrics(df, techniques[t], th)[0])
        precision.append(get_metrics(df, techniques[t], th)[1])
        recall.append(get_metrics(df, techniques[t], th)[2])
        f1.append(get_metrics(df, techniques[t], th)[3])
    
    return(pd.DataFrame({"Lexicon" : [lexicon_name] * 4, "Technique" : techniques, "Threshold" : th_df ,
                         "Test Accuracy" : accuracy, 
                        "Test Precision" : precision, 
                        "Test Recall" : recall, 
                        "Test F1 Score" : f1}))



def get_cross_table(text_df, lexicon, threshold, colname, binary):
    df = count_lexicon_words(text_df, lexicon)
    return(get_gross_table_data(df, colname, threshold, binary))

In [93]:
def summary(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_sum = AutoTokenizer.from_pretrained(model_name)
    model_sum = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline('summarization', model=model_sum, tokenizer = tokenizer_sum) 

    if max_lenght_input>=0:
        df_with_text['summary'] = summarizer(data_in_list, max_length=max_lenght_input)

    else:
        df_with_text['summary'] = summarizer(data_in_list)

def classification(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_clas = AutoTokenizer.from_pretrained(model_name)
    model_clas = AutoModelForSequenceClassification.from_pretrained(model_name)
    classification = pipeline('text-classification', model=model_clas, tokenizer = tokenizer_clas) 

    if max_lenght_input>=0:
        df_with_text['classification'] = classification(data_in_list, max_length=max_lenght_input, truncation=True)

    else:
        df_with_text['classification'] = classification(data_in_list)
        
    return(df_with_text)

def get_metrics_hugging_face(text_df, text_column, model, tokens):
    
    df = classification(text_df, text_column, model, tokens)
    label_list = list(df["classification"])
    labels = [entry['label'] for entry in label_list]
    df["Label_Hugging"] = labels
    
    cross_table = pd.crosstab(df['Target'], df['Label_Hugging'], margins=True)
    
    # calculate classification metrics using scikit-learn
    accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
    precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
    recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0


    # print the metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1_score)
    print(cross_table)
    print("\n")
    print(pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Label_Hugging'], margins=True))
    

def get_metrics_df_hugging_face(text_df, text_column, model, tokens, model_name):
    df = classification(text_df, text_column, model, tokens)
    label_list = list(df["classification"])
    labels = [entry['label'] for entry in label_list]
    df["Label_Hugging"] = labels
    
    cross_table = pd.crosstab(df['Target'], df['Label_Hugging'], margins=True)
    
    # calculate classification metrics using scikit-learn
    accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
    precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
    recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    
    return(pd.DataFrame({"Model" : [model_name], "Accuracy" : [accuracy], "Precision" : [precision], "Recall" : [recall],
                         "F1 Score" : [f1_score]}))

# 1. Labels

In [94]:
tag_climate_df = pd.read_parquet("Climate_Labels_Dataset.parquet")
tag_climate_df.head(5)
#Only keep the required columns
tag_climate_df = tag_climate_df[["Text", "Final_Climate_Change_Level_Label"]]
#Clean the tabel
tag_climate_df['Final_Climate_Change_Level_Label'] = tag_climate_df['Final_Climate_Change_Level_Label'].str.strip()
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "NA", "Final_Climate_Change_Level_Label"] = "Na"
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "0", "Final_Climate_Change_Level_Label"] = "Na"
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "Na", "Final_Climate_Change_Level_Label"] = "No Climate"
tag_climate_df["Target"] = tag_climate_df["Final_Climate_Change_Level_Label"].apply(lambda x: "Yes" if x in ["High", "Medium"] else "No")
tag_climate_df

Unnamed: 0,Text,Final_Climate_Change_Level_Label,Target
0,More than a dozen state attorneys general gath...,Medium,Yes
1,Sen. Jeff Merkley of Oregon endorsed Bernie S...,Small,No
2,When Carmen Luna moved to a neighborhood on t...,Medium,Yes
3,As ocean warming continues to trigger widespre...,High,Yes
4,PG&E Corp. told California regulators that it...,Medium,Yes
...,...,...,...
295,"U.S. government bond prices swung Wednesday, u...",No Climate,No
296,Japan’s corporate governance reforms are start...,No Climate,No
297,While President Trump is out there wheezing hi...,No Climate,No
298,The South is home to three schools ranked four...,No Climate,No


In [95]:
overview_labels_hms = tag_climate_df.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()
overview_labels_hms

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,65
1,Medium,35
2,No Climate,109
3,Small,91


In [96]:
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()
overview_labels

Unnamed: 0,Target,Text
0,No,200
1,Yes,100


## Splits in Train en Test set

In [97]:
# Split the dataframe into two sets
df_train, df_test = train_test_split(tag_climate_df, test_size = 1/3, random_state = 23)
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [98]:
df_train.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,46
1,Medium,18
2,No Climate,75
3,Small,61


In [99]:
df_train.groupby("Target")["Text"].count().reset_index()

Unnamed: 0,Target,Text
0,No,136
1,Yes,64


In [100]:
df_test.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,19
1,Medium,17
2,No Climate,34
3,Small,30


In [101]:
df_test.groupby("Target")["Text"].count().reset_index()

Unnamed: 0,Target,Text
0,No,64
1,Yes,36


# 2. Lexicons

In [157]:
#Load the lexicon
UNDP_Lexicon = pd.read_csv("Lexicons/UNDP_Lexicon")
UNDP_Lexicon = UNDP_Lexicon.drop_duplicates().reset_index(drop = True)
UNDP_Lexicon["Lexicon"] = UNDP_Lexicon["Lexicon"].str.lower()
UNDP_Lexicon = UNDP_Lexicon[UNDP_Lexicon["Keep"] == "Yes"]
UNDP_Lexicon = pd.DataFrame(UNDP_Lexicon["Lexicon"])

In [160]:
#Load the lexicon
IPCC_Lexicon = pd.read_csv("Lexicons/IPCC_Lexicon")
IPCC_Lexicon = IPCC_Lexicon.drop_duplicates().reset_index(drop = True)
IPCC_Lexicon["Lexicon"] = IPCC_Lexicon["Lexicon"].str.lower()
IPCC_Lexicon = IPCC_Lexicon[IPCC_Lexicon["Keep"] == "Yes"]
IPCC_Lexicon = pd.DataFrame(IPCC_Lexicon["Lexicon"])
IPCC_Lexicon

Unnamed: 0,Lexicon
0,acceptability of policy or system change
1,adaptability
2,adaptation
3,adaptation behaviour
4,adaptation limits
...,...
383,for climate change mitigation and adaptation
388,sea level rise
389,sea level fall
397,social


In [162]:
#Load the lexicon
EPA_Lexicon = pd.read_csv("Lexicons/EPA_Lexicon")
EPA_Lexicon = EPA_Lexicon.drop_duplicates().reset_index(drop = True)
EPA_Lexicon["Lexicon"] = EPA_Lexicon["Lexicon"].str.lower()
EPA_Lexicon = EPA_Lexicon[EPA_Lexicon["Afkorting"] == "No"]

EPA_Lexicon = pd.DataFrame(EPA_Lexicon["Lexicon"])
EPA_Lexicon

Unnamed: 0,Lexicon
0,abrupt climate change
1,adaptation
2,adaptive capacity
3,aerosols
4,afforestation
...,...
150,wastewater
151,water vapor
152,weather
153,100-year flood levels


In [164]:
#Load the lexicon
Wikipedia_Lexicon = pd.read_csv("Lexicons/Wikipedia_Lexicon")
Wikipedia_Lexicon = Wikipedia_Lexicon.drop_duplicates().reset_index(drop = True)
Wikipedia_Lexicon["Lexicon"] = Wikipedia_Lexicon["Lexicon"].str.lower()

In [None]:
Wikipedi

In [106]:
#Load the lexicon
Global_Change_Lexicon = pd.read_csv("Lexicons/Global_Change_Lexicon")
Global_Change_Lexicon = Global_Change_Lexicon.drop_duplicates().reset_index(drop = True)
Global_Change_Lexicon["Lexicon"] = Global_Change_Lexicon["Lexicon"].str.lower()

In [107]:
#Load the lexicon
BBC_Lexicon = pd.read_csv("Lexicons/BBC_Lexicon")
BBC_Lexicon = BBC_Lexicon.drop_duplicates().reset_index(drop = True)
BBC_Lexicon["Lexicon"] = BBC_Lexicon["Lexicon"].str.lower()
BBC_Lexicon = BBC_Lexicon[BBC_Lexicon["Afkorting"] == "No"]

BBC_Lexicon = pd.DataFrame(BBC_Lexicon["Lexicon"])
BBC_Lexicon

Unnamed: 0,Lexicon
0,adaptation
1,adaptation fund
2,annex i countries
3,annex ii
4,anthropogenic climate change
...,...
67,technology transfer
68,tipping point
69,twenty-twenty-twenty
77,350/450


In [108]:
#Create an empty dataframe and write a function to fill with the values

common_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                               "Global Change": [0, 0, 0, 0, 0, 0], "IPCC" : [0, 0, 0, 0, 0, 0], 
                               "Wikipedia" : [0, 0, 0, 0, 0, 0], "EPA" : [0, 0, 0, 0, 0, 0], 
                               "BBC" : [0, 0, 0, 0, 0, 0], "UNDP" : [0, 0, 0, 0, 0, 0]})

dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]

for r in range(0, len(dfs)):
    for c in range(0, len(dfs)):
        # Get the common values between the two columns
        common_words = set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))
        common_words_df.loc[r, common_words_df.columns[c +1]] = len(common_words)/len(dfs[c]['Lexicon'])

common_words_df

Unnamed: 0,Lexicon,Global Change,IPCC,Wikipedia,EPA,BBC,UNDP
0,Global Change,1.0,0.110787,0.073171,0.129032,0.177419,0.148936
1,IPCC,0.361905,1.0,0.170732,0.258065,0.306452,0.382979
2,Wikipedia,0.114286,0.081633,1.0,0.212903,0.241935,0.212766
3,EPA,0.190476,0.116618,0.20122,1.0,0.225806,0.276596
4,BBC,0.104762,0.055394,0.091463,0.090323,1.0,0.212766
5,UNDP,0.066667,0.052478,0.060976,0.083871,0.16129,1.0


In [109]:
#Create an empty dataframe and write a function to fill with the values

common_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                               "Global Change": [0, 0, 0, 0, 0, 0], "IPCC" : [0, 0, 0, 0, 0, 0], 
                               "Wikipedia" : [0, 0, 0, 0, 0, 0], "EPA" : [0, 0, 0, 0, 0, 0], 
                               "BBC" : [0, 0, 0, 0, 0, 0], "UNDP" : [0, 0, 0, 0, 0, 0]})

dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]

for r in range(0, len(dfs)):
    for c in range(0, len(dfs)):
        # Get the common values between the two columns
        common_words = set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))
        common_words_df.loc[r, common_words_df.columns[c +1]] = len(common_words)

common_words_df

Unnamed: 0,Lexicon,Global Change,IPCC,Wikipedia,EPA,BBC,UNDP
0,Global Change,105,38,12,20,11,7
1,IPCC,38,343,28,40,19,18
2,Wikipedia,12,28,164,33,15,10
3,EPA,20,40,33,155,14,13
4,BBC,11,19,15,14,62,10
5,UNDP,7,18,10,13,10,47


In [110]:
dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]
non_unique_words = []
unique_words = []
total_words = []
for r in range(len(dfs)):
    common_words = []
    for c in range(len(dfs)):
        if c != r:
            # Get the common values between the two columns
            common_words.extend(list(set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))))
    common_words = list(set(common_words))  # Remove duplicates by converting to a set and back to a list
    total_words.append(len(dfs[r]["Lexicon"]))
    unique_words.append(len(dfs[r]["Lexicon"]) - len(common_words))
    non_unique_words.append(len(common_words))

non_unique_words

unique_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                                "Non unique words" : non_unique_words, "unique words" : unique_words, 
                               "total_words" : total_words})

unique_words_df["Richness"] = unique_words_df["unique words"] / unique_words_df["total_words"]

unique_words_df

Unnamed: 0,Lexicon,Non unique words,unique words,total_words,Richness
0,Global Change,45,60,105,0.571429
1,IPCC,80,263,343,0.766764
2,Wikipedia,47,117,164,0.713415
3,EPA,59,96,155,0.619355
4,BBC,30,32,62,0.516129
5,UNDP,25,22,47,0.468085


In [111]:
Lexicon_names = ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"]
Lexicons = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]
Lexicon_name = []
Topic = []
for i in range(len(Lexicon_names)):

    #Get the topic of each Lexicon
    # Concatenate all the terms in the Lexicon column
    text = ' '.join(Lexicons[i]['Lexicon'].values)
    Lexicon_name.extend([Lexicon_names[i]] * 5)
    # Create a new dataframe with a single row containing the concatenated text
    df = pd.DataFrame({'Text': [text]})

    # Create the document-term matrix
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(df['Text'])

    # Apply Latent Dirichlet Allocation (LDA)
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)

    # Extract the topics and associated words
    feature_names = vectorizer.get_feature_names()
    num_top_words = 10  # Number of words per topic to display

    for topic_idx, topic in enumerate(lda.components_):
        #print(f"Topic #{topic_idx + 1}:")
        Topic.append(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

Lexicon_Topics_df = pd.DataFrame({"Lexicon" : Lexicon_name, "Topic" : Topic})
Lexicon_Topics_df = Lexicon_Topics_df.drop_duplicates().reset_index(drop = True)

In [112]:
pd.set_option('display.max_colwidth', None)
Lexicon_Topics_df
pd.reset_option('display.max_colwidth')

# 3. Classification Models

## 3.1. Lexicons

### 3.1.1. Train Lexicons

In [113]:
Text_df = df_train.copy()
Text_df["Text"] = Text_df["Text"].apply(preprocess_text) 

In [114]:
start = time.time()

names = ["IPCC", "Global_Change", "UNDP", "EPA", "Wikipedia", "BBC"]
All_names = []
# Generate all combinations
all_combinations = []
for r in range(1, len(names) + 1):
    combinations_r = combinations(names, r)
    all_combinations.extend(combinations_r)

# Print all combinations
for combination in all_combinations:
    combined_string = "_".join(combination)
    All_names.append(combined_string)
    
Lexicons = [IPCC_Lexicon, Global_Change_Lexicon, UNDP_Lexicon, EPA_Lexicon, Wikipedia_Lexicon, BBC_Lexicon]
All_Lexicons = []
# Generate all combinations
all_combinations = []
for r in range(1, len(Lexicons) + 1):
    combinations_r = combinations(Lexicons, r)
    all_combinations.extend(combinations_r)

# Concatenate and print all combinations
for combination in all_combinations:
    combined_df = pd.concat(combination, axis=0).drop_duplicates().reset_index()
    All_Lexicons.append(combined_df)
    
results_df = pd.DataFrame()

start_results = time.time()

print("--- %s seconds ---" % (start_results - start))

for i in range(len(All_Lexicons)):
    r_df = find_optimal_threshold(Text_df, All_Lexicons[i], All_names[i])
    results_df = pd.concat([results_df, r_df])

results_df = results_df.reset_index(drop = True)

print("--- %s seconds ---" % (time.time() - start_results))

--- 0.4037930965423584 seconds ---
--- 1368.5836322307587 seconds ---


In [115]:
results_df.sort_values(by = "Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Threshold,Accuracy,Precision,Recall,F1 Score
82,Wikipedia_BBC,Relative Frequency,0.6,0.910,0.838235,0.890625,0.863636
128,Global_Change_UNDP_Wikipedia,Absolute Frequency,11.0,0.905,0.909091,0.781250,0.840336
214,Global_Change_UNDP_Wikipedia_BBC,Relative Frequency,1.1,0.905,0.800000,0.937500,0.863309
212,Global_Change_UNDP_Wikipedia_BBC,Absolute Frequency,11.0,0.905,0.909091,0.781250,0.840336
126,Global_Change_UNDP_EPA,Relative Frequency,1.1,0.900,0.782051,0.953125,0.859155
...,...,...,...,...,...,...,...
156,UNDP_Wikipedia_BBC,Absolute Frequency,3.0,0.645,0.474074,1.000000,0.643216
249,IPCC_Global_Change_UNDP_EPA_Wikipedia_BBC,Absolute Present,5.0,0.610,0.449275,0.968750,0.613861
165,IPCC_Global_Change_UNDP_EPA,Absolute Present,5.0,0.610,0.449275,0.968750,0.613861
225,IPCC_Global_Change_UNDP_EPA_Wikipedia,Absolute Present,5.0,0.610,0.449275,0.968750,0.613861


In [116]:
results_df.to_parquet("Lexicon_Tagging_Train_Results.parquet")

### 3.1.2. Test Lexicons

In [117]:
Text_df_test = df_test.copy()
Text_df_test["Text"] = Text_df_test["Text"].apply(preprocess_text) 

In [118]:
start = time.time()

names = ["IPCC", "Global_Change", "UNDP", "EPA", "Wikipedia", "BBC"]
All_names = []
# Generate all combinations
all_combinations = []
for r in range(1, len(names) + 1):
    combinations_r = combinations(names, r)
    all_combinations.extend(combinations_r)

# Print all combinations
for combination in all_combinations:
    combined_string = "_".join(combination)
    All_names.append(combined_string)
    
Lexicons = [IPCC_Lexicon, Global_Change_Lexicon, UNDP_Lexicon, EPA_Lexicon, Wikipedia_Lexicon, BBC_Lexicon]
All_Lexicons = []
# Generate all combinations
all_combinations = []
for r in range(1, len(Lexicons) + 1):
    combinations_r = combinations(Lexicons, r)
    all_combinations.extend(combinations_r)

# Concatenate and print all combinations
for combination in all_combinations:
    combined_df = pd.concat(combination, axis=0).drop_duplicates().reset_index()
    All_Lexicons.append(combined_df)
    
test_result_df = pd.DataFrame()

start_results = time.time()

print("--- %s seconds ---" % (start_results - start))

for n in range(0, len(All_names)):
    test_lexicon_result = test_lexicon(Text_df_test, results_df[results_df["Lexicon"] == All_names[n]], All_Lexicons[All_names.index(All_names[n])], All_names[n])
    test_result_df = pd.concat([test_result_df, test_lexicon_result])
    
print("--- %s seconds ---" % (time.time() - start_results))

--- 0.9570615291595459 seconds ---
--- 599.5314538478851 seconds ---


In [119]:
test_result_df.sort_values(by = "Test Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Threshold,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,UNDP_EPA_Wikipedia,Absolute Frequency,9.0,0.89,0.903226,0.777778,0.835821
2,EPA,Relative Frequency,0.6,0.89,0.857143,0.833333,0.845070
2,UNDP_EPA,Relative Frequency,0.9,0.89,0.804878,0.916667,0.857143
2,UNDP_EPA_BBC,Relative Frequency,0.9,0.89,0.804878,0.916667,0.857143
2,UNDP_EPA_Wikipedia,Relative Frequency,0.9,0.89,0.804878,0.916667,0.857143
...,...,...,...,...,...,...,...
0,UNDP_Wikipedia_BBC,Absolute Frequency,3.0,0.66,0.514286,1.000000,0.679245
1,IPCC_Global_Change_UNDP_EPA_BBC,Absolute Present,5.0,0.64,0.500000,0.972222,0.660377
1,IPCC_Global_Change_UNDP_EPA_Wikipedia,Absolute Present,5.0,0.64,0.500000,0.972222,0.660377
1,IPCC_Global_Change_UNDP_EPA_Wikipedia_BBC,Absolute Present,5.0,0.64,0.500000,0.972222,0.660377


In [120]:
test_result_df.to_parquet("Lexicon_Tagging_Test_Results.parquet")

## 3.2. Bert

In [121]:
bert_test_df = df_test.copy()

In [122]:
bert1 = get_metrics_df_hugging_face(bert_test_df, 'Text',"climatebert/environmental-claims",512, "climatebert/environmental-claims")
bert2 = get_metrics_df_hugging_face(bert_test_df, "Text", "climatebert/distilroberta-base-climate-detector", 512, "climatebert/distilroberta-base-climate-detector")
bert_df = pd.concat([bert1, bert2]).reset_index(drop = True)

In [123]:
bert_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,climatebert/environmental-claims,0.36,0.0,0.0,0.0
1,climatebert/distilroberta-base-climate-detector,0.79,0.647059,0.916667,0.758621


In [124]:
bert_df.to_parquet("Bert_Tagging_Test_Results.parquet")

# 4. Comparing Results

In [148]:
Lexicon_train_result_df = pd.read_parquet("Lexicon_Tagging_Train_Results.parquet")
Lexicon_test_result_df = pd.read_parquet("Lexicon_Tagging_Test_Results.parquet")
Bert_test_result_df = pd.read_parquet("Bert_Tagging_Test_Results.parquet")

In [149]:
Lexicon_train_result_df.sort_values(by = "Accuracy", ascending = False).head(10).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Threshold,Accuracy,Precision,Recall,F1 Score
0,Wikipedia_BBC,Relative Frequency,0.6,0.91,0.838235,0.890625,0.863636
1,Global_Change_UNDP_Wikipedia,Absolute Frequency,11.0,0.905,0.909091,0.78125,0.840336
2,Global_Change_UNDP_Wikipedia_BBC,Relative Frequency,1.1,0.905,0.8,0.9375,0.863309
3,Global_Change_UNDP_Wikipedia_BBC,Absolute Frequency,11.0,0.905,0.909091,0.78125,0.840336
4,Global_Change_UNDP_EPA,Relative Frequency,1.1,0.9,0.782051,0.953125,0.859155
5,Global_Change_UNDP_EPA_BBC,Relative Frequency,1.1,0.9,0.782051,0.953125,0.859155
6,Global_Change_UNDP_EPA_BBC,Absolute Frequency,11.0,0.9,0.87931,0.796875,0.836066
7,Global_Change_UNDP_EPA_Wikipedia,Relative Frequency,1.1,0.9,0.782051,0.953125,0.859155
8,Global_Change_UNDP_Wikipedia,Relative Frequency,1.2,0.9,0.814286,0.890625,0.850746
9,UNDP_Wikipedia_BBC,Relative Frequency,0.9,0.9,0.782051,0.953125,0.859155


In [156]:
Lexicon_test_result_df.sort_values(by = ["Test Accuracy", "Test F1 Score"], ascending = False).head(10).reset_index(drop = True)[["Lexicon", "Technique", "Threshold", "Test Accuracy", "Test F1 Score"]]

Unnamed: 0,Lexicon,Technique,Threshold,Test Accuracy,Test F1 Score
0,UNDP_EPA,Relative Frequency,0.9,0.89,0.857143
1,UNDP_EPA_Wikipedia,Relative Frequency,0.9,0.89,0.857143
2,UNDP_EPA_BBC,Relative Frequency,0.9,0.89,0.857143
3,UNDP_EPA_Wikipedia_BBC,Relative Frequency,0.9,0.89,0.857143
4,Global_Change_EPA,Relative Frequency,0.8,0.89,0.853333
5,EPA,Relative Frequency,0.6,0.89,0.84507
6,EPA_BBC,Relative Frequency,0.6,0.89,0.84507
7,UNDP_EPA_Wikipedia,Absolute Frequency,9.0,0.89,0.835821
8,IPCC_UNDP_Wikipedia_BBC,Relative Frequency,1.6,0.88,0.846154
9,Global_Change_EPA_Wikipedia,Relative Frequency,0.8,0.88,0.842105


In [154]:
Lexicon_test_result_df

Unnamed: 0,Lexicon,Technique,Threshold,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,IPCC,Absolute Frequency,10.0,0.80,0.700000,0.777778,0.736842
1,IPCC,Absolute Present,6.0,0.74,0.608696,0.777778,0.682927
2,IPCC,Relative Frequency,1.1,0.82,0.695652,0.888889,0.780488
3,IPCC,Relative Present,0.8,0.74,0.647059,0.611111,0.628571
0,Global_Change,Absolute Frequency,5.0,0.73,0.736842,0.388889,0.509091
...,...,...,...,...,...,...,...
3,Global_Change_UNDP_EPA_Wikipedia_BBC,Relative Present,0.6,0.81,0.660377,0.972222,0.786517
0,IPCC_Global_Change_UNDP_EPA_Wikipedia_BBC,Absolute Frequency,12.0,0.77,0.638298,0.833333,0.722892
1,IPCC_Global_Change_UNDP_EPA_Wikipedia_BBC,Absolute Present,5.0,0.64,0.500000,0.972222,0.660377
2,IPCC_Global_Change_UNDP_EPA_Wikipedia_BBC,Relative Frequency,1.5,0.83,0.693878,0.944444,0.800000


In [128]:
Lexicon_test_result_df["Lexicon"].nunique()

63

In [129]:
Bert_test_result_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,climatebert/environmental-claims,0.36,0.0,0.0,0.0
1,climatebert/distilroberta-base-climate-detector,0.79,0.647059,0.916667,0.758621


In [131]:
Lexicon_cross = pd.concat([EPA_Lexicon]).drop_duplicates()
get_cross_table(Text_df_test, Lexicon_cross, 0.6, "Relative Frequency", False)

Estimate,No,Yes,All
Final_Climate_Change_Level_Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,0,19,19
Medium,6,11,17
No Climate,34,0,34
Small,25,5,30
All,65,35,100


# 5. Classify Articles

In [132]:
WP = pd.read_parquet("C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Final/WP_Final_Articles.parquet")
WP_clean = WP.copy()
WP_clean["Text"] = WP_clean["Text"].apply(preprocess_text) 

In [133]:
WP_clean

Unnamed: 0,Title,Text,Date,News Paper,Link,Year
0,Full text: Clinton testifies before House comm...,2015 clintons benghazi hearing in three minute...,2016-09-23,Washington_Post,https://www.washingtonpost.com/news/post-polit...,2016
1,Full transcript: FBI Director James Comey test...,comey rogers testify on alleged russian interf...,2017-03-21,Washington_Post,https://www.washingtonpost.com/news/post-polit...,2017
2,Transcript of Zuckerberg’s appearance before H...,top takeaways from mark zuckerberg’s hearings ...,2018-04-11,Washington_Post,https://www.washingtonpost.com/news/the-switch...,2018
3,Transcript of Mark Zuckerberg’s Senate hearing,facebook ceo mark zuckerberg sat down before l...,2018-04-10,Washington_Post,https://www.washingtonpost.com/news/the-switch...,2018
4,The October Democratic debate transcript,sen kamala harris dcalif sen bernie sanders iv...,2019-10-16,Washington_Post,https://www.washingtonpost.com/politics/2019/1...,2019
...,...,...,...,...,...,...
143891,"By Martin Weil\nJuly 25, 2016\nA man was shot ...",a man was shot and wounded late monday on nort...,2016-07-26,Washington_Post,https://www.washingtonpost.com/local/public-sa...,2016
143892,Short Circuit: A roundup of recent federal cou...,here is the latest edition of the institute fo...,2017-01-04,Washington_Post,https://www.washingtonpost.com/news/volokh-con...,2017
143893,"Juno spacecraft slips into safe mode, putting ...",wednesday was meant to be a momentous day for ...,2016-10-20,Washington_Post,https://www.washingtonpost.com/news/speaking-o...,2016
143894,The down-ballot GOP candidate’s guide to survi...,when you run for office you tend to get a lot ...,2016-10-28,Washington_Post,https://www.washingtonpost.com/blogs/post-part...,2016


In [134]:
test = count_lexicon_words(WP_clean, EPA_Lexicon)

In [135]:
test[test["Relative Frequency"] >= 0.6]

Unnamed: 0,Title,Text,Date,News Paper,Link,Year,Absolute Frequency,Absolute Present,Relative Frequency,Relative Present
48,"The issues 2020 Democrats are running\non, acc...",politics analysis the issues 2020 democrats ar...,2019-04-08,Washington_Post,https://www.washingtonpost.com/graphics/politi...,2019,67,6,0.605458,0.054220
346,Treacherous freezing rain tonight with dangero...,a winter storm warning for montgomery fairfax ...,2016-02-16,Washington_Post,https://www.washingtonpost.com/news/capital-we...,2016,41,1,0.986288,0.024056
481,Hurricane Michael forecast updates: Historic s...,in the wake of hurricane michael panama city r...,2018-10-10,Washington_Post,https://www.washingtonpost.com/weather/2018/10...,2018,25,3,0.687380,0.082486
557,Opinion Change is afoot in Trump country,add to your saved stories save gift article sh...,2018-01-11,Washington_Post,https://www.washingtonpost.com/news/theworldpo...,2018,97,16,2.788158,0.459902
583,Radical warming in Siberia leaves millions on ...,2°c beyond the limit extreme climate change ha...,2019-10-04,Washington_Post,https://www.washingtonpost.com/graphics/2019/n...,2019,74,11,2.167545,0.322203
...,...,...,...,...,...,...,...,...,...,...
143826,"PM Update: Dense fog and drizzle tonight, then...",1045 pm fog has formed across the area this ev...,2018-02-23,Washington_Post,https://www.washingtonpost.com/news/capital-we...,2018,3,1,1.986755,0.662252
143836,Don’t mind those rumbling jets overhead. It’s ...,the north american aerospace defense command —...,2017-11-14,Washington_Post,https://www.washingtonpost.com/news/dr-gridloc...,2017,2,1,1.324503,0.662252
143837,Ryan’s tax hype falls flat,house speaker paul d ryan rwis perhaps to calm...,2017-06-21,Washington_Post,https://www.washingtonpost.com/blogs/right-tur...,2017,1,1,0.662252,0.662252
143845,"By Martin Weil\nJanuary 25, 2016\nMotorists an...",motorists and pedestrians across the washingto...,2016-01-26,Washington_Post,https://www.washingtonpost.com/local/streets-b...,2016,3,1,1.986755,0.662252


In [136]:
test["Climate"] = "No"
test.loc[test["Relative Frequency"] >= 0.6, "Climate"] = "Yes"

In [139]:
WP["Climate"] = test["Climate"]

In [141]:
WP[WP["Climate"] == "Yes"]

Unnamed: 0,Title,Text,Date,News Paper,Link,Year,Climate
48,"The issues 2020 Democrats are running\non, acc...",Politics Analysis\nThe issues 2020 Democrats a...,2019-04-08,Washington_Post,https://www.washingtonpost.com/graphics/politi...,2019,Yes
346,Treacherous freezing rain tonight with dangero...,"*A winter storm warning for Montgomery, Fairfa...",2016-02-16,Washington_Post,https://www.washingtonpost.com/news/capital-we...,2016,Yes
481,Hurricane Michael forecast updates: Historic s...,"In the wake of Hurricane Michael, Panama City ...",2018-10-10,Washington_Post,https://www.washingtonpost.com/weather/2018/10...,2018,Yes
557,Opinion Change is afoot in Trump country,Add to your saved stories\nSave\nGift Article\...,2018-01-11,Washington_Post,https://www.washingtonpost.com/news/theworldpo...,2018,Yes
583,Radical warming in Siberia leaves millions on ...,2°C: Beyond the limit\nExtreme climate change ...,2019-10-04,Washington_Post,https://www.washingtonpost.com/graphics/2019/n...,2019,Yes
...,...,...,...,...,...,...,...
143826,"PM Update: Dense fog and drizzle tonight, then...",10:45 p.m.: Fog has formed across the area thi...,2018-02-23,Washington_Post,https://www.washingtonpost.com/news/capital-we...,2018,Yes
143836,Don’t mind those rumbling jets overhead. It’s ...,The North American Aerospace Defense Command —...,2017-11-14,Washington_Post,https://www.washingtonpost.com/news/dr-gridloc...,2017,Yes
143837,Ryan’s tax hype falls flat,"House Speaker Paul D. Ryan (R-Wis.), perhaps t...",2017-06-21,Washington_Post,https://www.washingtonpost.com/blogs/right-tur...,2017,Yes
143845,"By Martin Weil\nJanuary 25, 2016\nMotorists an...",Motorists and pedestrians across the Washingto...,2016-01-26,Washington_Post,https://www.washingtonpost.com/local/streets-b...,2016,Yes
