# 0. Packages and Functions

## 0.1. Packages

In [182]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
import pyarrow.parquet as pq
import pyarrow as pyarrow
import numpy as np

## 0.2. Functions

In [200]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    # remove blank spaces
    text = ' '.join(text.split())
    # remove newline characters
    text = text.replace('\n', '')
    return text

### Absolute Count with Frequency

In [201]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_1(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
            
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_1(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_1(text_df, lexicon, treshold):
    df = count_lexicon_words_1(text_df, lexicon)
    
    return(lexicon_target_classifier_1(df, treshold))

def threshold_metrics_1(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_1(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    

def get_nsmh_crosstab_1(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
        print("\n")
        print("\n")

### Relative Count with Frequency

In [202]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_2(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []
    
    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        word_list = text.split() 
        word_count = len(word_list)
        count.append((lexicon_counts/word_count)*100)

    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_2(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_2(text_df, lexicon, treshold):
    df = count_lexicon_words_2(text_df, lexicon)
    
    return(lexicon_target_classifier_2(df, treshold))

def threshold_metrics_2(df_text, lexicon, min_treshhold, max_treshhold, jump):
    
    if(jump == 0):
        df = lexicon_climate_classifier_2(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", min_treshhold)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")

    else:   
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_2(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

            # calculate classification metrics using scikit-learn
            accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
            precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
            recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

            # print the metrics
            print("Threshhold:", i)
            print("Accuracy:", accuracy)
            print("Precision:", precision)
            print("Recall:", recall)
            print("F1 score:", f1_score)
            print(cross_table)
            print("\n")

def get_metrics_df_2(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))

def get_nsmh_crosstab_2(df_text, lexicon, min_treshhold, max_treshhold, jump):
    if(jump == 0):
        df = lexicon_climate_classifier_2(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
    else:   
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_2(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
            print(cross_table)
            print("\n")
            print("\n")

### Absolute Term Presences

In [203]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_3(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_3(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_3(text_df, lexicon, treshold):
    df = count_lexicon_words_3(text_df, lexicon)
    
    return(lexicon_target_classifier_3(df, treshold))

def threshold_metrics_3(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_3(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    
def get_nsmh_crosstab_2(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
        print("\n")
        print("\n")

### Relative Term Presences

In [204]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_4(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        word_list = text.split() 
        word_count = len(word_list)
        count.append((lexicon_counts/word_count)*100)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_4(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_4(text_df, lexicon, treshold):
    df = count_lexicon_words_4(text_df, lexicon)
    
    return(lexicon_target_classifier_4(df, treshold))

def threshold_metrics_4(df_text, lexicon, min_treshhold, max_treshhold, jump):
    
    if(jump == 0):
        df = lexicon_climate_classifier_4(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
    else:
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_4(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

            # calculate classification metrics using scikit-learn
            accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
            precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
            recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

            # print the metrics
            print("Threshhold:", i)
            print("Accuracy:", accuracy)
            print("Precision:", precision)
            print("Recall:", recall)
            print("F1 score:", f1_score)
            print(cross_table)
            print("\n")

def get_metrics_df_4(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_4(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))

def get_nsmh_crosstab_4(df_text, lexicon, min_treshhold, max_treshhold, jump):
    if(jump == 0):
        df = lexicon_climate_classifier_4(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
    else:   
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_4(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
            print(cross_table)
            print("\n")
            print("\n")

### Hugging Face

In [205]:
def summary(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_sum = AutoTokenizer.from_pretrained(model_name)
    model_sum = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline('summarization', model=model_sum, tokenizer = tokenizer_sum) 

    if max_lenght_input>=0:
        df_with_text['summary'] = summarizer(data_in_list, max_length=max_lenght_input)

    else:
        df_with_text['summary'] = summarizer(data_in_list)

def classification(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_clas = AutoTokenizer.from_pretrained(model_name)
    model_clas = AutoModelForSequenceClassification.from_pretrained(model_name)
    classification = pipeline('text-classification', model=model_clas, tokenizer = tokenizer_clas) 

    if max_lenght_input>=0:
        df_with_text['classification'] = classification(data_in_list, max_length=max_lenght_input, truncation=True)

    else:
        df_with_text['classification'] = classification(data_in_list)
        
    return(df_with_text)

def get_metrics_hugging_face(text_df, text_column, model, tokens):
    
    df = classification(text_df, text_column, model, tokens)
    label_list = list(df["classification"])
    labels = [entry['label'] for entry in label_list]
    df["Label_Hugging"] = labels
    
    cross_table = pd.crosstab(df['Target'], df['Label_Hugging'], margins=True)
    
    # calculate classification metrics using scikit-learn
    accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
    precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
    recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0


    # print the metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1_score)
    print(cross_table)
    print("\n")
    print(pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Label_Hugging'], margins=True))

Accuracy: This metric measures the overall performance of a model. It is defined as the number of correct predictions divided by the total number of predictions. Accuracy is a good metric to use when the classes are roughly balanced, meaning there are about the same number of positive and negative examples in the dataset.

Precision: This metric measures how many of the positive predictions made by a model are actually correct. It is defined as the number of true positives divided by the total number of positive predictions. Precision is a good metric to use when we care more about avoiding false positives than false negatives.

Recall: This metric measures how many of the positive examples in the dataset are correctly predicted by the model. It is defined as the number of true positives divided by the total number of actual positive examples. Recall is a good metric to use when we care more about avoiding false negatives than false positives.

F1 score: This metric is a weighted average of precision and recall, where the weight is determined by the beta parameter. The most common value for beta is 1, which gives equal weight to precision and recall. The F1 score is a good metric to use when we want to balance precision and recall, and when the classes are imbalanced.

# 1. Import Label Dataset

In [206]:
tag_climate_df = pd.read_parquet("Climate_Labels_Dataset.parquet")
tag_climate_df.head(5)

Unnamed: 0,Text,Link,Sentiment_Label,Sentiment_Label_R,Level_Climate_Change_Topic,Level_Climate_Change_Topic_R,Final_Climate_Change_Level_Label,Final_Sentiment_Label,was_I_retarded?
0,More than a dozen state attorneys general gath...,https://www.washingtonpost.com/news/energy-env...,1,-1,Medium,Medium,Medium,-1,No
1,Sen. Jeff Merkley of Oregon endorsed Bernie S...,http://www.wsj.com/articles/campaign-wire-1460...,0,0,Small,Small,Small,0,Yes
2,When Carmen Luna moved to a neighborhood on t...,https://www.wsj.com/articles/mexico-city-strug...,-1,-1,Medium,Medium,Medium,-1,Yes
3,As ocean warming continues to trigger widespre...,https://www.washingtonpost.com/national/health...,1,-1,High,High,High,-1,No
4,PG&E Corp. told California regulators that it...,https://www.wsj.com/articles/pg-e-equipment-mi...,-1,-1,Medium,Medium,Medium,-1,Yes


In [207]:
#Only keep the required columns
tag_climate_df = tag_climate_df[["Text", "Final_Climate_Change_Level_Label"]]

In [208]:
tag_climate_df

Unnamed: 0,Text,Final_Climate_Change_Level_Label
0,More than a dozen state attorneys general gath...,Medium
1,Sen. Jeff Merkley of Oregon endorsed Bernie S...,Small
2,When Carmen Luna moved to a neighborhood on t...,Medium
3,As ocean warming continues to trigger widespre...,High
4,PG&E Corp. told California regulators that it...,Medium
...,...,...
295,"U.S. government bond prices swung Wednesday, u...",Na
296,Japan’s corporate governance reforms are start...,Na
297,While President Trump is out there wheezing hi...,Na
298,The South is home to three schools ranked four...,Na


In [209]:
#Clean the tabel
tag_climate_df['Final_Climate_Change_Level_Label'] = tag_climate_df['Final_Climate_Change_Level_Label'].str.strip()
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "NA", "Final_Climate_Change_Level_Label"] = "Na"
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "0", "Final_Climate_Change_Level_Label"] = "Na"
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "Na", "Final_Climate_Change_Level_Label"] = "No Climate"
tag_climate_df["Target"] = tag_climate_df["Final_Climate_Change_Level_Label"].apply(lambda x: "Yes" if x in ["High", "Medium"] else "No")

In [210]:
overview_labels_hms = tag_climate_df.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

In [211]:
overview_labels_hms

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,65
1,Medium,35
2,No Climate,109
3,Small,91


In [212]:
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()

In [213]:
overview_labels

Unnamed: 0,Target,Text
0,No,200
1,Yes,100


In [127]:
#overview_labels_hms.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels_hms", index = False)
#overview_labels.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels", index = False)

# 2. Taggers

## 2.1. Lexicon

### Global Change Lexicon

Uitleg

In [None]:
#Load the lexicon
Global_Change_Lexicon = pd.read_csv("Global_Change_Lexicon")
Global_Change_Lexicon = Global_Change_Lexicon.drop_duplicates().reset_index(drop = True)
Global_Change_Lexicon["Lexicon"] = Global_Change_Lexicon["Lexicon"].str.lower()

Global_Change_Lexicon

### IPCC Lexicon

Uitleg

In [None]:
#Load the lexicon
IPCC_Lexicon = pd.read_csv("IPCC_Lexicon")
IPCC_Lexicon = IPCC_Lexicon.drop_duplicates().reset_index(drop = True)
IPCC_Lexicon["Lexicon"] = IPCC_Lexicon["Lexicon"].str.lower()

IPCC_Lexicon

### Wikipedia Lexicon

Uitleg

In [None]:
#Load the lexicon
Wikipedia_Lexicon = pd.read_csv("Wikipedia_Lexicon")
Wikipedia_Lexicon = Wikipedia_Lexicon.drop_duplicates().reset_index(drop = True)
Wikipedia_Lexicon["Lexicon"] = Wikipedia_Lexicon["Lexicon"].str.lower()

Wikipedia_Lexicon

### EPA Lexicon

Uitleg

In [None]:
#Load the lexicon
EPA_Lexicon = pd.read_csv("EPA_Lexicon")
EPA_Lexicon = EPA_Lexicon.drop_duplicates().reset_index(drop = True)
EPA_Lexicon["Lexicon"] = EPA_Lexicon["Lexicon"].str.lower()

EPA_Lexicon

EPA_Lexicon

### BBC Lexicon

Uitleg

In [None]:
#Load the lexicon
BBC_Lexicon = pd.read_csv("BBC_Lexicon")
BBC_Lexicon = BBC_Lexicon.drop_duplicates().reset_index(drop = True)
BBC_Lexicon["Lexicon"] = BBC_Lexicon["Lexicon"].str.lower()

BBC_Lexicon

### UNDP Lexicon

Uitleg

In [None]:
#Load the lexicon
UNDP_Lexicon = pd.read_csv("UNDP_Lexicon")
UNDP_Lexicon = UNDP_Lexicon.drop_duplicates().reset_index(drop = True)
UNDP_Lexicon["Lexicon"] = UNDP_Lexicon["Lexicon"].str.lower()

UNDP_Lexicon

### Compare the lexicons to each other

In [74]:
#Create an empty dataframe and write a function to fill with the values

common_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                               "Global Change": [0, 0, 0, 0, 0, 0], "IPCC" : [0, 0, 0, 0, 0, 0], 
                               "Wikipedia" : [0, 0, 0, 0, 0, 0], "EPA" : [0, 0, 0, 0, 0, 0], 
                               "BBC" : [0, 0, 0, 0, 0, 0], "UNDP" : [0, 0, 0, 0, 0, 0]})

dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]

for r in range(0, len(dfs)):
    for c in range(0, len(dfs)):
        # Get the common values between the two columns
        common_words = set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))
        common_words_df.loc[r, common_words_df.columns[c +1]] = len(common_words)

common_words_df

Unnamed: 0,Lexicon,Global Change,IPCC,Wikipedia,EPA,BBC,UNDP
0,Global Change,105,38,12,20,11,7
1,IPCC,38,405,28,46,22,19
2,Wikipedia,12,28,164,33,15,10
3,EPA,20,46,33,176,16,14
4,BBC,11,22,15,16,79,11
5,UNDP,7,19,10,14,11,47


In [81]:
dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]
non_unique_words = []
unique_words = []
total_words = []
for r in range(len(dfs)):
    common_words = []
    for c in range(len(dfs)):
        if c != r:
            # Get the common values between the two columns
            common_words.extend(list(set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))))
    common_words = list(set(common_words))  # Remove duplicates by converting to a set and back to a list
    total_words.append(len(dfs[r]["Lexicon"]))
    unique_words.append(len(dfs[r]["Lexicon"]) - len(common_words))
    non_unique_words.append(len(common_words))

non_unique_words

unique_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                                "Non unique words" : non_unique_words, "unique words" : unique_words, 
                               "total_words" : total_words})

unique_words_df["Richness"] = unique_words_df["unique words"] / unique_words_df["total_words"]

unique_words_df

Unnamed: 0,Lexicon,Non unique words,unique words,total_words,Richness
0,Global Change,45,60,105,0.571429
1,IPCC,88,317,405,0.782716
2,Wikipedia,47,117,164,0.713415
3,EPA,66,110,176,0.625
4,BBC,34,45,79,0.56962
5,UNDP,26,21,47,0.446809


# 3. Testen Taggers

## 3.1. Lexicon

In [189]:
#create a separate df with the specific cleaning for the lexicons
Lexicon_df = tag_climate_df.copy()
Lexicon_df["Text"] = Lexicon_df["Text"].apply(preprocess_text)

In [195]:
get_metrics_df_2(Lexicon_df, EPA_Lexicon, 0, 2, 0.1, "EPA")

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
0,EPA,Relative Frequency,0.0,0.333333,0.0,0.0,0.0
1,EPA,Relative Frequency,0.1,0.64,0.480769,1.0,0.649351
2,EPA,Relative Frequency,0.2,0.733333,0.555556,1.0,0.714286
3,EPA,Relative Frequency,0.3,0.81,0.640523,0.98,0.774704
4,EPA,Relative Frequency,0.4,0.873333,0.731343,0.98,0.837607
5,EPA,Relative Frequency,0.5,0.883333,0.773109,0.92,0.840183
6,EPA,Relative Frequency,0.6,0.896667,0.810811,0.9,0.853081
7,EPA,Relative Frequency,0.7,0.893333,0.826923,0.86,0.843137
8,EPA,Relative Frequency,0.8,0.89,0.838384,0.83,0.834171
9,EPA,Relative Frequency,0.9,0.883333,0.842105,0.8,0.820513


### 3.1.1. One Lexicon

In [214]:
#Global Change Lexicon
Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_Lexicon, 1, 20, "Global Change")
Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_Lexicon, 0.1, 2, 0.1, "Global Change")
Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_Lexicon, 1, 20, "Global Change")
Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_Lexicon, 0.1, 2, 0.1, "Global Change")

In [215]:
Global_Change_df = pd.concat([Global_Change_df_1, Global_Change_df_2, Global_Change_df_3, Global_Change_df_4])

In [216]:
#IPCC Lexicon
IPCC_df_1 = get_metrics_df_1(Lexicon_df, IPCC_Lexicon, 1, 20, "IPCC")
IPCC_df_2 = get_metrics_df_2(Lexicon_df, IPCC_Lexicon, 0.1, 2, 0.1, "IPCC")
IPCC_df_3 = get_metrics_df_3(Lexicon_df, IPCC_Lexicon, 1, 20, "IPCC")
IPCC_df_4 = get_metrics_df_4(Lexicon_df, IPCC_Lexicon, 0.1, 2, 0.1, "IPCC")

In [217]:
IPCC_df = pd.concat([IPCC_df_1, IPCC_df_2, IPCC_df_3, IPCC_df_4])

In [218]:
#Wikipedia Lexicon
Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Lexicon, 1, 20, "Wikipedia")
Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Lexicon, 0.1, 2, 0.1, "Wikipedia")
Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Lexicon, 1, 20, "Wikipedia")
Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Lexicon, 0.1, 2, 0.1, "Wikipedia")

In [219]:
Wikipedia_df = pd.concat([Wikipedia_df_1, Wikipedia_df_2, Wikipedia_df_3, Wikipedia_df_4])

In [220]:
#EPA Lexicon
EPA_df_1 = get_metrics_df_1(Lexicon_df, EPA_Lexicon, 1, 20, "EPA")
EPA_df_2 = get_metrics_df_2(Lexicon_df, EPA_Lexicon, 0.1, 2, 0.1, "EPA")
EPA_df_3 = get_metrics_df_3(Lexicon_df, EPA_Lexicon, 1, 20, "EPA")
EPA_df_4 = get_metrics_df_4(Lexicon_df, EPA_Lexicon, 0.1, 2, 0.1, "EPA")

In [221]:
EPA_df = pd.concat([EPA_df_1, EPA_df_2, EPA_df_3, EPA_df_4])

In [222]:
#BBC Lexicon
BBC_df_1 = get_metrics_df_1(Lexicon_df, BBC_Lexicon, 1, 20, "BBC")
BBC_df_2 = get_metrics_df_2(Lexicon_df, BBC_Lexicon, 0.1, 2, 0.1, "BBC")
BBC_df_3 = get_metrics_df_3(Lexicon_df, BBC_Lexicon, 1, 20, "BBC")
BBC_df_4 = get_metrics_df_4(Lexicon_df, BBC_Lexicon, 0.1, 2, 0.1, "BBC")

In [223]:
BBC_df = pd.concat([BBC_df_1, BBC_df_2, BBC_df_3, BBC_df_4])

In [224]:
#UNDP Lexicon
UNDP_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Lexicon, 1, 20, "UNDP")
UNDP_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Lexicon, 0.1, 2, 0.1, "UNDP")
UNDP_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Lexicon, 1, 20, "UNDP")
UNDP_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Lexicon, 0.1, 2, 0.1, "UNDP")

In [225]:
UNDP_df = pd.concat([UNDP_df_1, UNDP_df_2, UNDP_df_3, UNDP_df_4])

In [226]:
#Get all lexicons together
Lexicon_df_1 = pd.concat([Global_Change_df, IPCC_df, Wikipedia_df, EPA_df, BBC_df, UNDP_df])

In [227]:
Lexicon_df_1.sort_values("Accuracy", ascending = False).reset_index(drop = True).head(20)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
0,EPA,Relative Frequency,0.6,0.896667,0.810811,0.9,0.853081
1,EPA,Relative Frequency,0.7,0.893333,0.826923,0.86,0.843137
2,BBC,Relative Frequency,0.2,0.89,0.786325,0.92,0.847926
3,EPA,Relative Frequency,0.8,0.89,0.838384,0.83,0.834171
4,UNDP,Absolute Frequency,4.0,0.886667,0.836735,0.82,0.828283
5,EPA,Relative Frequency,0.9,0.883333,0.842105,0.8,0.820513
6,EPA,Relative Frequency,0.5,0.883333,0.773109,0.92,0.840183
7,Wikipedia,Relative Frequency,0.4,0.883333,0.773109,0.92,0.840183
8,Wikipedia,Relative Frequency,0.5,0.883333,0.798165,0.87,0.832536
9,EPA,Relative Frequency,1.0,0.883333,0.865169,0.77,0.814815


### 3.1.2. Two Lexicons Combined

In [228]:
#UNDP and EPA
EPA_UNDP_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Lexicon, 1, 20, "EPA_UDNP")
EPA_UNDP_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Lexicon, 0.1, 2, 0.1, "EPA_UDNP")
EPA_UNDP_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Lexicon, 1, 20, "EPA_UDNP")
EPA_UNDP_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Lexicon, 0.1, 2, 0.1, "EPA_UDNP")

In [229]:
EPA_UNDP_df = pd.concat([EPA_UNDP_df_1, EPA_UNDP_df_2, EPA_UNDP_df_3, EPA_UNDP_df_4])

In [230]:
#UNDP and BBC
BBC_UNDP_Lexicon = pd.concat([BBC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_UNDP_df_1 = get_metrics_df_1(Lexicon_df, BBC_UNDP_Lexicon, 1, 20, "BBC_UNDP")
BBC_UNDP_df_2 = get_metrics_df_2(Lexicon_df, BBC_UNDP_Lexicon, 0.1, 2, 0.1, "BBC_UNDP")
BBC_UNDP_df_3 = get_metrics_df_3(Lexicon_df, BBC_UNDP_Lexicon, 1, 20, "BBC_UNDP")
BBC_UNDP_df_4 = get_metrics_df_4(Lexicon_df, BBC_UNDP_Lexicon, 0.1, 2, 0.1, "BBC_UNDP")

In [231]:
BBC_UNDP_df = pd.concat([BBC_UNDP_df_1, BBC_UNDP_df_2, BBC_UNDP_df_3, BBC_UNDP_df_4])

In [232]:
#UNDP and Global Change 
UNDP_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Global_Change_Lexicon, 1, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_Global_Change")
UNDP_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Global_Change_Lexicon, 1, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_Global_Change")

In [233]:
Global_Change_UNDP_df = pd.concat([UNDP_Global_Change_df_1, UNDP_Global_Change_df_2, UNDP_Global_Change_df_3, UNDP_Global_Change_df_4])

In [234]:
#UNDP and IPCC
UNDP_IPCC_Lexicon = pd.concat([IPCC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_IPCC_Lexicon, 1, 20, "UNDP_IPCC")
UNDP_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_IPCC")
UNDP_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_IPCC_Lexicon, 1, 20, "UNDP_IPCC")
UNDP_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_IPCC")

In [235]:
IPCC_UNDP_df = pd.concat([UNDP_IPCC_df_1, UNDP_IPCC_df_2, UNDP_IPCC_df_3, UNDP_IPCC_df_4])

In [236]:
#UNDP and Wikipedia
UNDP_Wikipedia_Lexicon = pd.concat([Wikipedia_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Wikipedia_Lexicon, 1, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Wikipedia_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia")
UNDP_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Wikipedia_Lexicon, 1, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Wikipedia_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia")

In [237]:
Wikipedia_UNDP_df = pd.concat([UNDP_Wikipedia_df_1, UNDP_Wikipedia_df_2, UNDP_Wikipedia_df_3, UNDP_Wikipedia_df_4])

In [238]:
Lexicon_df_2 = pd.concat([EPA_UNDP_df, BBC_UNDP_df, Global_Change_UNDP_df, IPCC_UNDP_df, Wikipedia_UNDP_df]).reset_index(drop = True)

In [239]:
#EPA and BBC
EPA_BBC_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Lexicon, 1, 20, "BBC_EPA")
EPA_BBC_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Lexicon, 0.1, 2, 0.1, "BBC_EPA")
EPA_BBC_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Lexicon, 1, 20, "BBC_EPA")
EPA_BBC_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Lexicon, 0.1, 2, 0.1, "BBC_EPA")

In [240]:
EPA_BBC_df = pd.concat([EPA_BBC_df_1, EPA_BBC_df_2, EPA_BBC_df_3, EPA_BBC_df_4])

In [242]:
#EPA and Global Change
EPA_Global_Change_Lexicon = pd.concat([EPA_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_Global_Change_Lexicon, 1, 20, "EPA_GLobal_Change")
EPA_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change")
EPA_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_Global_Change_Lexicon, 1, 20, "BBC_Global_Change")
EPA_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change")

In [243]:
EPA_Global_Change_df = pd.concat([EPA_Global_Change_df_1, EPA_Global_Change_df_2, EPA_Global_Change_df_3, EPA_Global_Change_df_4])

In [244]:
EPA_Global_Change_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
9,BBC_Global_Change,Relative Frequency,1.0,0.886667,0.794643,0.89,0.839623
10,BBC_Global_Change,Relative Frequency,1.1,0.886667,0.817308,0.85,0.833333
11,BBC_Global_Change,Relative Frequency,1.2,0.880000,0.820000,0.82,0.820000
12,BBC_Global_Change,Relative Frequency,1.3,0.880000,0.826531,0.81,0.818182
8,BBC_Global_Change,Relative Frequency,0.9,0.876667,0.764706,0.91,0.831050
...,...,...,...,...,...,...,...
1,EPA_GLobal_Change,Absolute Frequency,2.0,0.556667,0.429185,1.00,0.600601
0,BBC_Global_Change,Relative Presences,0.1,0.500000,0.400000,1.00,0.571429
0,BBC_Global_Change,Relative Frequency,0.1,0.500000,0.400000,1.00,0.571429
0,EPA_GLobal_Change,Absolute Frequency,1.0,0.493333,0.396825,1.00,0.568182


In [245]:
#EPA and IPCC
EPA_IPCC_Lexicon = pd.concat([EPA_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_IPCC_Lexicon, 1, 20, "EPA_IPCC")
EPA_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_IPCC")
EPA_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_IPCC_Lexicon, 1, 20, "EPA_IPCC")
EPA_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_IPCC")

In [246]:
EPA_IPCC_df = pd.concat([EPA_IPCC_df_1, EPA_IPCC_df_2, EPA_IPCC_df_3, EPA_IPCC_df_4])

In [247]:
EPA_IPCC_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
15,EPA_IPCC,Relative Frequency,1.6,0.876667,0.805825,0.83,0.817734
14,EPA_IPCC,Relative Frequency,1.5,0.876667,0.788991,0.86,0.822967
13,EPA_IPCC,Relative Frequency,1.4,0.870000,0.765217,0.88,0.818605
12,EPA_IPCC,Relative Frequency,1.3,0.863333,0.743802,0.90,0.814480
17,EPA_IPCC,Relative Frequency,1.8,0.863333,0.810526,0.77,0.789744
...,...,...,...,...,...,...,...
1,EPA_IPCC,Relative Frequency,0.2,0.463333,0.383142,1.00,0.554017
0,EPA_IPCC,Relative Presences,0.1,0.416667,0.363636,1.00,0.533333
0,EPA_IPCC,Relative Frequency,0.1,0.416667,0.363636,1.00,0.533333
0,EPA_IPCC,Absolute Frequency,1.0,0.403333,0.358423,1.00,0.527704


In [248]:
#EPA and Wikipedia
EPA_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_Wikipedia_Lexicon, 1, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia")
EPA_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_Wikipedia_Lexicon, 1, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia")

In [249]:
EPA_Wikipedia_df = pd.concat([EPA_Wikipedia_df_1, EPA_Wikipedia_df_2, EPA_Wikipedia_df_3, EPA_Wikipedia_df_4])

In [250]:
EPA_Wikipedia_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
13,EPA_Wikipedia,Relative Frequency,1.4,0.896667,0.848485,0.84,0.844221
9,EPA_Wikipedia,Relative Frequency,1.0,0.896667,0.800000,0.92,0.855814
10,EPA_Wikipedia,Relative Frequency,1.1,0.893333,0.809091,0.89,0.847619
14,EPA_Wikipedia,Relative Frequency,1.5,0.893333,0.854167,0.82,0.836735
11,EPA_Wikipedia,Relative Frequency,1.2,0.893333,0.826923,0.86,0.843137
...,...,...,...,...,...,...,...
0,EPA_Wikipedia,Relative Presences,0.1,0.626667,0.471698,1.00,0.641026
1,EPA_Wikipedia,Absolute Frequency,2.0,0.623333,0.469484,1.00,0.638978
0,EPA_Wikipedia,Relative Frequency,0.1,0.620000,0.467290,1.00,0.636943
0,EPA_Wikipedia,Absolute Frequency,1.0,0.603333,0.456621,1.00,0.626959


In [251]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, EPA_BBC_df, EPA_Global_Change_df, EPA_IPCC_df, EPA_Wikipedia_df]).reset_index(drop = True)

In [252]:
#BBC and Global Change
BBC_Global_Change_Lexicon = pd.concat([BBC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, BBC_Global_Change_Lexicon, 1, 20, "BBC_Global_Change")
BBC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change")
BBC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, BBC_Global_Change_Lexicon, 1, 20, "BBC_Global_Change")
BBC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change")

In [253]:
BBC_Global_Change_df = pd.concat([BBC_Global_Change_df_1, BBC_Global_Change_df_2, BBC_Global_Change_df_3, BBC_Global_Change_df_4])

In [254]:
#BBC and IPCC
BBC_IPCC_Lexicon = pd.concat([BBC_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_IPCC_df_1 = get_metrics_df_1(Lexicon_df, BBC_IPCC_Lexicon, 1, 20, "BBC_IPCC")
BBC_IPCC_df_2 = get_metrics_df_2(Lexicon_df, BBC_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_IPCC")
BBC_IPCC_df_3 = get_metrics_df_3(Lexicon_df, BBC_IPCC_Lexicon, 1, 20, "BBC_IPCC")
BBC_IPCC_df_4 = get_metrics_df_4(Lexicon_df, BBC_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_IPCC")

In [255]:
BBC_IPCC_df = pd.concat([BBC_IPCC_df_1, BBC_IPCC_df_2, BBC_IPCC_df_3, BBC_IPCC_df_4])

In [256]:
#BBC and Wikipedia
BBC_Wikipedia_Lexicon = pd.concat([BBC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, BBC_Wikipedia_Lexicon, 1, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia")
BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, BBC_Wikipedia_Lexicon, 1, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia")

In [257]:
BBC_Wikpedia_df = pd.concat([BBC_Wikipedia_df_1, BBC_Wikipedia_df_2, BBC_Wikipedia_df_3, BBC_Wikipedia_df_4])

In [258]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, BBC_Global_Change_df, BBC_IPCC_df, BBC_Wikpedia_df]).reset_index(drop = True)

In [259]:
#Wikipedia and Global Change
Wikipedia_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Global_Change_Lexicon, 1, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Global_Change_Lexicon, 1, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "Wikipedia_Global_Change")

In [260]:
Wikpedia_Global_Change_df = pd.concat([Wikipedia_Global_Change_df_1, Wikipedia_Global_Change_df_2, Wikipedia_Global_Change_df_3, Wikipedia_Global_Change_df_4])

In [261]:
#Wikipedia and IPCC
Wikipedia_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_IPCC_Lexicon, 1, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "Wikipedia_IPCC")
Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_IPCC_Lexicon, 1, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "Wikipedia_IPCC")

In [262]:
Wikpedia_IPCC_df = pd.concat([Wikipedia_IPCC_df_1, Wikipedia_IPCC_df_2, Wikipedia_IPCC_df_3, Wikipedia_IPCC_df_4])

In [263]:
#IPCC and Global Change
Global_Change_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_IPCC_Lexicon, 1, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "Global_Change_IPCC")
Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_IPCC_Lexicon, 1, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "Global_Change_IPCC")

In [264]:
Global_Change_IPCC_df = pd.concat([Global_Change_IPCC_df_1, Global_Change_IPCC_df_2, Global_Change_IPCC_df_3, Global_Change_IPCC_df_4])

In [265]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, Wikpedia_Global_Change_df, Wikpedia_IPCC_df, Global_Change_IPCC_df]).reset_index(drop = True)

In [269]:
Lexicon_df_2.sort_values("F1 Score", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
25,EPA_UDNP,Relative Frequency,0.6,0.900000,0.812500,0.91,0.858491
653,EPA_Wikipedia,Relative Frequency,1.0,0.896667,0.800000,0.92,0.855814
415,BBC_EPA,Relative Frequency,0.6,0.896667,0.805310,0.91,0.854460
81,BBC_UNDP,Absolute Frequency,4.0,0.900000,0.843137,0.86,0.851485
652,EPA_Wikipedia,Relative Frequency,0.9,0.890000,0.776860,0.94,0.850679
...,...,...,...,...,...,...,...
58,EPA_UDNP,Absolute Presences,20.0,0.333333,0.000000,0.00,0.000000
57,EPA_UDNP,Absolute Presences,19.0,0.333333,0.000000,0.00,0.000000
56,EPA_UDNP,Absolute Presences,18.0,0.333333,0.000000,0.00,0.000000
760,BBC_Global_Change,Absolute Presences,20.0,0.333333,0.000000,0.00,0.000000


In [267]:
Lexicon_Metrics = pd.concat([Lexicon_df_1, Lexicon_df_2]).reset_index(drop = True)

In [268]:
Lexicon_Metrics.sort_values(by="Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
493,EPA_UDNP,Relative Frequency,0.6,0.900000,0.812500,0.91,0.858491
549,BBC_UNDP,Absolute Frequency,4.0,0.900000,0.843137,0.86,0.851485
259,EPA,Relative Frequency,0.6,0.896667,0.810811,0.90,0.853081
883,BBC_EPA,Relative Frequency,0.6,0.896667,0.805310,0.91,0.854460
1121,EPA_Wikipedia,Relative Frequency,1.0,0.896667,0.800000,0.92,0.855814
...,...,...,...,...,...,...,...
360,BBC,Absolute Presences,10.0,0.333333,0.000000,0.00,0.000000
361,BBC,Absolute Presences,11.0,0.333333,0.000000,0.00,0.000000
526,EPA_UDNP,Absolute Presences,20.0,0.333333,0.000000,0.00,0.000000
363,BBC,Absolute Presences,13.0,0.333333,0.000000,0.00,0.000000


In [270]:
Lexicon_Metrics.to_parquet("Lexicon_Tagging_Metrics.parquet")

### Three Lexicon Combined

First, we will only look at the ones that showed great performance in the previous steps

In [None]:
#EPA, UNDP and BBC
EPA_UNDP_BBC_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_Lexicon, 0, 20, "EPA_UNDP_BBC")
EPA_UNDP_BBC_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC")
EPA_UNDP_BBC_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_Lexicon, 0, 20, "EPA_UNDP_BBC")
EPA_UNDP_BBC_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC")

In [None]:
EPA_UNDP_BBC_df = pd.concat([EPA_UNDP_BBC_df_1, EPA_UNDP_BBC_df_2, EPA_UNDP_BBC_df_3, EPA_UNDP_BBC_df_4])

In [None]:
EPA_UNDP_BBC_df.sort_values(by="Accuracy", ascending = False)

In [None]:
#EPA, UNDP and Wikipedia
EPA_UNDP_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 0, 20, "EPA_UNDP_Wikipedia")
EPA_UNDP_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia")
EPA_UNDP_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 0, 20, "EPA_UNDP_Wikipedia")
EPA_UNDP_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia")

In [None]:
EPA_UNDP_Wikipedia_df = pd.concat([EPA_UNDP_Wikipedia_df_1, EPA_UNDP_Wikipedia_df_2, EPA_UNDP_Wikipedia_df_3, EPA_UNDP_Wikipedia_df_4])

In [None]:
EPA_UNDP_Wikipedia_df.sort_values(by="F1 Score", ascending = False)

In [None]:
get_nsmh_crosstab_2(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 1, 1, 0)

In [None]:
#EPA, UNDP, BBC and Wikipedia
EPA_UNDP_BBC_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, Wikipedia_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_BBC")
EPA_UNDP_BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_BBC")
EPA_UNDP_BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_BBC")
EPA_UNDP_BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_BBC")

In [None]:
EPA_UNDP_BBC_Wikipedia_df = pd.concat([EPA_UNDP_BBC_Wikipedia_df_1, EPA_UNDP_BBC_Wikipedia_df_2, EPA_UNDP_BBC_Wikipedia_df_3, EPA_UNDP_BBC_Wikipedia_df_4])

In [None]:
EPA_UNDP_BBC_Wikipedia_df.sort_values(by="F1 Score", ascending = False)

### Combining All Six Lexicons

In [None]:
Full_Lexicon = pd.concat([Global_Change_Lexicon, IPCC_Lexicon, EPA_Lexicon, Wikipedia_Lexicon, BBC_Lexicon, UNDP_Lexicon]).drop_duplicates(subset = ["Lexicon"])
get_metrics_df_2(Lexicon_df, Full_Lexicon, 1, 2, 0.2, "Full")

## 3.2. Hugging Face

In [90]:
get_metrics_hugging_face(Lexicon_df, 'Text',"climatebert/environmental-claims",512)

Accuracy: 0.3333333333333333
Precision: 0.5
Recall: 1.0
F1 score: 0.6666666666666666
Label_Hugging   no  All
Target                 
No             200  200
Yes            100  100
All            300  300


Label_Hugging                      no  All
Final_Climate_Change_Level_Label          
High                               65   65
Medium                             35   35
No Climate                        109  109
Small                              91   91
All                               300  300


In [129]:
get_metrics_hugging_face(Lexicon_df, "Text", "climatebert/distilroberta-base-climate-detector", 512)

Accuracy: 0.7666666666666667
Precision: 1.4615384615384615
Recall: 19.0
F1 score: 2.7142857142857144
Label_Hugging   no  yes  All
Target                      
No             135   65  200
Yes              5   95  100
All            140  160  300


Label_Hugging                      no  yes  All
Final_Climate_Change_Level_Label               
High                                0   65   65
Medium                              5   30   35
No Climate                         99   10  109
Small                              36   55   91
All                               140  160  300


# 4 Final Selection

In [272]:
pd.read_parquet("Lexicon_Tagging_Metrics.parquet")

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
0,Global Change,Absolute Frequency,1.0,0.506667,0.403226,1.00,0.574713
1,Global Change,Absolute Frequency,2.0,0.666667,0.500000,0.86,0.632353
2,Global Change,Absolute Frequency,3.0,0.740000,0.596491,0.68,0.635514
3,Global Change,Absolute Frequency,4.0,0.776667,0.694118,0.59,0.637838
4,Global Change,Absolute Frequency,5.0,0.773333,0.766667,0.46,0.575000
...,...,...,...,...,...,...,...
1633,Global_Change_IPCC,Relative Presences,1.5,0.770000,0.754098,0.46,0.571429
1634,Global_Change_IPCC,Relative Presences,1.6,0.760000,0.769231,0.40,0.526316
1635,Global_Change_IPCC,Relative Presences,1.7,0.753333,0.770833,0.37,0.500000
1636,Global_Change_IPCC,Relative Presences,1.8,0.750000,0.777778,0.35,0.482759
