# 0. Packages and Functions

## 0.1. Packages

In [33]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
import pyarrow.parquet as pq
import pyarrow as pyarrow
import numpy as np

## 0.2. Functions

In [34]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    # remove blank spaces
    text = ' '.join(text.split())
    # remove newline characters
    text = text.replace('\n', '')
    return text

### Absolute Count with Frequency

In [414]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_1(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
            
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_1(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c >= treshold:
            target.append("Yes")
        else:
            target.append("No")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_1(text_df, lexicon, treshold):
    df = count_lexicon_words_1(text_df, lexicon)
    
    return(lexicon_target_classifier_1(df, treshold))

def threshold_metrics_1(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_1(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    

def get_nsmh_crosstab_1(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
        print("\n")
        print("\n")

### Relative Count with Frequency

In [359]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_2(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []
    
    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        word_list = text.split() 
        word_count = len(word_list)
        count.append((lexicon_counts/word_count)*100)

    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_2(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_2(text_df, lexicon, treshold):
    df = count_lexicon_words_2(text_df, lexicon)
    
    return(lexicon_target_classifier_2(df, treshold))

def threshold_metrics_2(df_text, lexicon, min_treshhold, max_treshhold, jump):
    
    if(jump == 0):
        df = lexicon_climate_classifier_2(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", min_treshhold)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")

    else:   
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_2(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

            # calculate classification metrics using scikit-learn
            accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
            precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
            recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

            # print the metrics
            print("Threshhold:", i)
            print("Accuracy:", accuracy)
            print("Precision:", precision)
            print("Recall:", recall)
            print("F1 score:", f1_score)
            print(cross_table)
            print("\n")

def get_metrics_df_2(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))

def get_nsmh_crosstab_2(df_text, lexicon, min_treshhold, max_treshhold, jump):
    if(jump == 0):
        df = lexicon_climate_classifier_2(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
    else:   
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_2(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
            print(cross_table)
            print("\n")
            print("\n")

### Absolute Term Presences

In [358]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_3(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_3(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_3(text_df, lexicon, treshold):
    df = count_lexicon_words_3(text_df, lexicon)
    
    return(lexicon_target_classifier_3(df, treshold))

def threshold_metrics_3(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_3(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    
def get_nsmh_crosstab_3(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
        print("\n")
        print("\n")

### Relative Term Presences

In [38]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_4(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        word_list = text.split() 
        word_count = len(word_list)
        count.append((lexicon_counts/word_count)*100)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_4(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_4(text_df, lexicon, treshold):
    df = count_lexicon_words_4(text_df, lexicon)
    
    return(lexicon_target_classifier_4(df, treshold))

def threshold_metrics_4(df_text, lexicon, min_treshhold, max_treshhold, jump):
    
    if(jump == 0):
        df = lexicon_climate_classifier_4(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
    else:
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_4(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

            # calculate classification metrics using scikit-learn
            accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
            precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
            recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

            # print the metrics
            print("Threshhold:", i)
            print("Accuracy:", accuracy)
            print("Precision:", precision)
            print("Recall:", recall)
            print("F1 score:", f1_score)
            print(cross_table)
            print("\n")

def get_metrics_df_4(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_4(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
        recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))

def get_nsmh_crosstab_4(df_text, lexicon, min_treshhold, max_treshhold, jump):
    if(jump == 0):
        df = lexicon_climate_classifier_4(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
        print(cross_table)
    else:   
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_4(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Target Lexicon'], margins=True)
            print(cross_table)
            print("\n")
            print("\n")

### Hugging Face

In [335]:
def summary(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_sum = AutoTokenizer.from_pretrained(model_name)
    model_sum = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline('summarization', model=model_sum, tokenizer = tokenizer_sum) 

    if max_lenght_input>=0:
        df_with_text['summary'] = summarizer(data_in_list, max_length=max_lenght_input)

    else:
        df_with_text['summary'] = summarizer(data_in_list)

def classification(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_clas = AutoTokenizer.from_pretrained(model_name)
    model_clas = AutoModelForSequenceClassification.from_pretrained(model_name)
    classification = pipeline('text-classification', model=model_clas, tokenizer = tokenizer_clas) 

    if max_lenght_input>=0:
        df_with_text['classification'] = classification(data_in_list, max_length=max_lenght_input, truncation=True)

    else:
        df_with_text['classification'] = classification(data_in_list)
        
    return(df_with_text)

def get_metrics_hugging_face(text_df, text_column, model, tokens):
    
    df = classification(text_df, text_column, model, tokens)
    label_list = list(df["classification"])
    labels = [entry['label'] for entry in label_list]
    df["Label_Hugging"] = labels
    
    cross_table = pd.crosstab(df['Target'], df['Label_Hugging'], margins=True)
    
    # calculate classification metrics using scikit-learn
    accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
    precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
    recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0


    # print the metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1_score)
    print(cross_table)
    print("\n")
    print(pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Label_Hugging'], margins=True))
    

def get_metrics_df_hugging_face(text_df, text_column, model, tokens, model_name):
    df = classification(text_df, text_column, model, tokens)
    label_list = list(df["classification"])
    labels = [entry['label'] for entry in label_list]
    df["Label_Hugging"] = labels
    
    cross_table = pd.crosstab(df['Target'], df['Label_Hugging'], margins=True)
    
    # calculate classification metrics using scikit-learn
    accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
    precision = cross_table.iloc[1,1] / (cross_table.iloc[0,1] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) else 0
    recall = cross_table.iloc[1,1] / (cross_table.iloc[1,0] + cross_table.iloc[1,1]) if cross_table.shape == (3,3) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    
    return(pd.DataFrame({"Model" : [model_name], "Accuracy" : [accuracy], "Precision" : [precision], "Recall" : [recall],
                         "F1 Score" : [f1_score]}))

Accuracy: This metric measures the overall performance of a model. It is defined as the number of correct predictions divided by the total number of predictions. Accuracy is a good metric to use when the classes are roughly balanced, meaning there are about the same number of positive and negative examples in the dataset.

Precision: This metric measures how many of the positive predictions made by a model are actually correct. It is defined as the number of true positives divided by the total number of positive predictions. Precision is a good metric to use when we care more about avoiding false positives than false negatives.

Recall: This metric measures how many of the positive examples in the dataset are correctly predicted by the model. It is defined as the number of true positives divided by the total number of actual positive examples. Recall is a good metric to use when we care more about avoiding false negatives than false positives.

F1 score: This metric is a weighted average of precision and recall, where the weight is determined by the beta parameter. The most common value for beta is 1, which gives equal weight to precision and recall. The F1 score is a good metric to use when we want to balance precision and recall, and when the classes are imbalanced.

# 1. Import Label Dataset

In [40]:
tag_climate_df = pd.read_parquet("Climate_Labels_Dataset.parquet")
tag_climate_df.head(5)

Unnamed: 0,Text,Link,Sentiment_Label,Sentiment_Label_R,Level_Climate_Change_Topic,Level_Climate_Change_Topic_R,Final_Climate_Change_Level_Label,Final_Sentiment_Label,was_I_retarded?
0,More than a dozen state attorneys general gath...,https://www.washingtonpost.com/news/energy-env...,1,-1,Medium,Medium,Medium,-1,No
1,Sen. Jeff Merkley of Oregon endorsed Bernie S...,http://www.wsj.com/articles/campaign-wire-1460...,0,0,Small,Small,Small,0,Yes
2,When Carmen Luna moved to a neighborhood on t...,https://www.wsj.com/articles/mexico-city-strug...,-1,-1,Medium,Medium,Medium,-1,Yes
3,As ocean warming continues to trigger widespre...,https://www.washingtonpost.com/national/health...,1,-1,High,High,High,-1,No
4,PG&E Corp. told California regulators that it...,https://www.wsj.com/articles/pg-e-equipment-mi...,-1,-1,Medium,Medium,Medium,-1,Yes


In [41]:
#Only keep the required columns
tag_climate_df = tag_climate_df[["Text", "Final_Climate_Change_Level_Label"]]

In [42]:
tag_climate_df

Unnamed: 0,Text,Final_Climate_Change_Level_Label
0,More than a dozen state attorneys general gath...,Medium
1,Sen. Jeff Merkley of Oregon endorsed Bernie S...,Small
2,When Carmen Luna moved to a neighborhood on t...,Medium
3,As ocean warming continues to trigger widespre...,High
4,PG&E Corp. told California regulators that it...,Medium
...,...,...
295,"U.S. government bond prices swung Wednesday, u...",Na
296,Japan’s corporate governance reforms are start...,Na
297,While President Trump is out there wheezing hi...,Na
298,The South is home to three schools ranked four...,Na


In [43]:
#Clean the tabel
tag_climate_df['Final_Climate_Change_Level_Label'] = tag_climate_df['Final_Climate_Change_Level_Label'].str.strip()
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "NA", "Final_Climate_Change_Level_Label"] = "Na"
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "0", "Final_Climate_Change_Level_Label"] = "Na"
tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "Na", "Final_Climate_Change_Level_Label"] = "No Climate"
tag_climate_df["Target"] = tag_climate_df["Final_Climate_Change_Level_Label"].apply(lambda x: "Yes" if x in ["High", "Medium"] else "No")

In [44]:
overview_labels_hms = tag_climate_df.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

In [45]:
overview_labels_hms

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,65
1,Medium,35
2,No Climate,109
3,Small,91


In [46]:
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()

In [47]:
overview_labels

Unnamed: 0,Target,Text
0,No,200
1,Yes,100


In [48]:
#overview_labels_hms.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels_hms", index = False)
#overview_labels.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels", index = False)

# 2. Taggers

## 2.1. Lexicon

### Global Change Lexicon

Uitleg

In [49]:
#Load the lexicon
Global_Change_Lexicon = pd.read_csv("Lexicons/Global_Change_Lexicon")
Global_Change_Lexicon = Global_Change_Lexicon.drop_duplicates().reset_index(drop = True)
Global_Change_Lexicon["Lexicon"] = Global_Change_Lexicon["Lexicon"].str.lower()

Global_Change_Lexicon

Unnamed: 0,Lexicon
0,100-year flood
1,emissions scenario
2,adaptation
3,adaptation science
4,adaptive capacity
...,...
100,vector
101,vulnerability
102,vulnerability assessment
103,water security


### IPCC Lexicon

Uitleg

In [390]:
#Load the lexicon
IPCC_Lexicon = pd.read_csv("Lexicons/IPCC_Lexicon")
IPCC_Lexicon = IPCC_Lexicon.drop_duplicates().reset_index(drop = True)
IPCC_Lexicon["Lexicon"] = IPCC_Lexicon["Lexicon"].str.lower()

IPCC_Lexicon = pd.DataFrame(IPCC_Lexicon["Lexicon"])
list(IPCC_Lexicon["Lexicon"])

['acceptability of policy or system change',
 'adaptability',
 'adaptation',
 'adaptation behaviour',
 'adaptation limits',
 'adaptation options',
 'adaptation pathways',
 'adaptive capacity',
 'adaptive governance',
 'aerosol',
 'afforestation',
 'agreement',
 'air pollution',
 'albedo',
 'ambient persuasive technology',
 'anomaly',
 'anthropocene',
 'anthropogenic',
 'anthropogenic emissions',
 'anthropogenic removals',
 'artificial intelligence',
 'atmosphere',
 'atmosphere–ocean general circulation model',
 'attribution',
 'baseline scenario',
 'battery electric vehicle',
 ' bev ',
 'biochar',
 'biodiversity',
 'bioenergy',
 'bioenergy with carbon dioxide capture and storage',
 ' beccs ',
 'biofuel',
 'biomass',
 'biophilic urbanism',
 'black carbon',
 ' bc ',
 'blue carbon',
 'burden sharing',
 'effort sharing',
 'business as usual',
 'carbon budget',
 'carbon cycle',
 'carbon dioxide',
 ' co2 ',
 'carbon dioxide capture and storage',
 ' ccs ',
 'carbon dioxide capture and utilisa

### Wikipedia Lexicon

Uitleg

In [51]:
#Load the lexicon
Wikipedia_Lexicon = pd.read_csv("Lexicons/Wikipedia_Lexicon")
Wikipedia_Lexicon = Wikipedia_Lexicon.drop_duplicates().reset_index(drop = True)
Wikipedia_Lexicon["Lexicon"] = Wikipedia_Lexicon["Lexicon"].str.lower()

Wikipedia_Lexicon

Unnamed: 0,Lexicon
0,"100,000-year problem"
1,adaptation
2,additionality
3,albedo
4,anoxic event
...,...
159,volcanism
160,water vapor
161,weather
162,world climate report


### EPA Lexicon

Uitleg

In [52]:
#Load the lexicon
EPA_Lexicon = pd.read_csv("Lexicons/EPA_Lexicon")
EPA_Lexicon = EPA_Lexicon.drop_duplicates().reset_index(drop = True)
EPA_Lexicon["Lexicon"] = EPA_Lexicon["Lexicon"].str.lower()

EPA_Lexicon = pd.DataFrame(EPA_Lexicon["Lexicon"])
EPA_Lexicon

Unnamed: 0,Lexicon
0,abrupt climate change
1,adaptation
2,adaptive capacity
3,aerosols
4,afforestation
...,...
171,pfcs
172,sf6
173,o3
174,uv


### BBC Lexicon

Uitleg

In [53]:
#Load the lexicon
BBC_Lexicon = pd.read_csv("Lexicons/BBC_Lexicon")
BBC_Lexicon = BBC_Lexicon.drop_duplicates().reset_index(drop = True)
BBC_Lexicon["Lexicon"] = BBC_Lexicon["Lexicon"].str.lower()

BBC_Lexicon = pd.DataFrame(BBC_Lexicon["Lexicon"])
BBC_Lexicon

Unnamed: 0,Lexicon
0,adaptation
1,adaptation fund
2,annex i countries
3,annex ii
4,anthropogenic climate change
...,...
74,gwp
75,ghgs
76,ji
77,350/450


### UNDP Lexicon

Uitleg

In [54]:
#Load the lexicon
UNDP_Lexicon = pd.read_csv("Lexicons/UNDP_Lexicon")
UNDP_Lexicon = UNDP_Lexicon.drop_duplicates().reset_index(drop = True)
UNDP_Lexicon["Lexicon"] = UNDP_Lexicon["Lexicon"].str.lower()

UNDP_Lexicon = pd.DataFrame(UNDP_Lexicon["Lexicon"])
UNDP_Lexicon

Unnamed: 0,Lexicon
0,weather
1,climate
2,greenhouse gases
3,greenhouse gas emmisions
4,global warming
5,climate change
6,climate crisis
7,feedback loop
8,tipping point
9,climate overshoot


### Compare the lexicons to each other

In [55]:
#Create an empty dataframe and write a function to fill with the values

common_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                               "Global Change": [0, 0, 0, 0, 0, 0], "IPCC" : [0, 0, 0, 0, 0, 0], 
                               "Wikipedia" : [0, 0, 0, 0, 0, 0], "EPA" : [0, 0, 0, 0, 0, 0], 
                               "BBC" : [0, 0, 0, 0, 0, 0], "UNDP" : [0, 0, 0, 0, 0, 0]})

dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]

for r in range(0, len(dfs)):
    for c in range(0, len(dfs)):
        # Get the common values between the two columns
        common_words = set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))
        common_words_df.loc[r, common_words_df.columns[c +1]] = len(common_words)

common_words_df

Unnamed: 0,Lexicon,Global Change,IPCC,Wikipedia,EPA,BBC,UNDP
0,Global Change,105,38,12,20,11,7
1,IPCC,38,405,28,46,22,19
2,Wikipedia,12,28,164,33,15,10
3,EPA,20,46,33,176,16,14
4,BBC,11,22,15,16,79,11
5,UNDP,7,19,10,14,11,47


In [56]:
dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]
non_unique_words = []
unique_words = []
total_words = []
for r in range(len(dfs)):
    common_words = []
    for c in range(len(dfs)):
        if c != r:
            # Get the common values between the two columns
            common_words.extend(list(set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))))
    common_words = list(set(common_words))  # Remove duplicates by converting to a set and back to a list
    total_words.append(len(dfs[r]["Lexicon"]))
    unique_words.append(len(dfs[r]["Lexicon"]) - len(common_words))
    non_unique_words.append(len(common_words))

non_unique_words

unique_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                                "Non unique words" : non_unique_words, "unique words" : unique_words, 
                               "total_words" : total_words})

unique_words_df["Richness"] = unique_words_df["unique words"] / unique_words_df["total_words"]

unique_words_df

Unnamed: 0,Lexicon,Non unique words,unique words,total_words,Richness
0,Global Change,45,60,105,0.571429
1,IPCC,88,317,405,0.782716
2,Wikipedia,47,117,164,0.713415
3,EPA,66,110,176,0.625
4,BBC,34,45,79,0.56962
5,UNDP,26,21,47,0.446809


# 3. Testen Taggers

## 3.1. Lexicon

In [57]:
#create a separate df with the specific cleaning for the lexicons
Lexicon_df = tag_climate_df.copy()
Lexicon_df["Text"] = Lexicon_df["Text"].apply(preprocess_text)

### 3.1.1. One Lexicon

In [58]:
#Global Change Lexicon
Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_Lexicon, 1, 20, "Global Change")
Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_Lexicon, 0.1, 2, 0.1, "Global Change")
Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_Lexicon, 1, 20, "Global Change")
Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_Lexicon, 0.1, 2, 0.1, "Global Change")

In [59]:
Global_Change_df = pd.concat([Global_Change_df_1, Global_Change_df_2, Global_Change_df_3, Global_Change_df_4])

In [60]:
#IPCC Lexicon
IPCC_df_1 = get_metrics_df_1(Lexicon_df, IPCC_Lexicon, 1, 20, "IPCC")
IPCC_df_2 = get_metrics_df_2(Lexicon_df, IPCC_Lexicon, 0.1, 2, 0.1, "IPCC")
IPCC_df_3 = get_metrics_df_3(Lexicon_df, IPCC_Lexicon, 1, 20, "IPCC")
IPCC_df_4 = get_metrics_df_4(Lexicon_df, IPCC_Lexicon, 0.1, 2, 0.1, "IPCC")

In [61]:
IPCC_df = pd.concat([IPCC_df_1, IPCC_df_2, IPCC_df_3, IPCC_df_4])

In [62]:
#Wikipedia Lexicon
Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Lexicon, 1, 20, "Wikipedia")
Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Lexicon, 0.1, 2, 0.1, "Wikipedia")
Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Lexicon, 1, 20, "Wikipedia")
Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Lexicon, 0.1, 2, 0.1, "Wikipedia")

In [63]:
Wikipedia_df = pd.concat([Wikipedia_df_1, Wikipedia_df_2, Wikipedia_df_3, Wikipedia_df_4])

In [64]:
#EPA Lexicon
EPA_df_1 = get_metrics_df_1(Lexicon_df, EPA_Lexicon, 1, 20, "EPA")
EPA_df_2 = get_metrics_df_2(Lexicon_df, EPA_Lexicon, 0.1, 2, 0.1, "EPA")
EPA_df_3 = get_metrics_df_3(Lexicon_df, EPA_Lexicon, 1, 20, "EPA")
EPA_df_4 = get_metrics_df_4(Lexicon_df, EPA_Lexicon, 0.1, 2, 0.1, "EPA")

In [65]:
EPA_df = pd.concat([EPA_df_1, EPA_df_2, EPA_df_3, EPA_df_4])

In [66]:
#BBC Lexicon
BBC_df_1 = get_metrics_df_1(Lexicon_df, BBC_Lexicon, 1, 20, "BBC")
BBC_df_2 = get_metrics_df_2(Lexicon_df, BBC_Lexicon, 0.1, 2, 0.1, "BBC")
BBC_df_3 = get_metrics_df_3(Lexicon_df, BBC_Lexicon, 1, 20, "BBC")
BBC_df_4 = get_metrics_df_4(Lexicon_df, BBC_Lexicon, 0.1, 2, 0.1, "BBC")

In [67]:
BBC_df = pd.concat([BBC_df_1, BBC_df_2, BBC_df_3, BBC_df_4])

In [68]:
#UNDP Lexicon
UNDP_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Lexicon, 1, 20, "UNDP")
UNDP_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Lexicon, 0.1, 2, 0.1, "UNDP")
UNDP_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Lexicon, 1, 20, "UNDP")
UNDP_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Lexicon, 0.1, 2, 0.1, "UNDP")

In [69]:
UNDP_df = pd.concat([UNDP_df_1, UNDP_df_2, UNDP_df_3, UNDP_df_4])

In [70]:
#Get all lexicons together
Lexicon_df_1 = pd.concat([Global_Change_df, IPCC_df, Wikipedia_df, EPA_df, BBC_df, UNDP_df])

In [71]:
Lexicon_df_1["Nr"] = 1

### 3.1.2. Two Lexicons Combined

In [72]:
#UNDP and EPA
EPA_UNDP_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Lexicon, 1, 20, "EPA_UDNP")
EPA_UNDP_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Lexicon, 0.1, 2, 0.1, "EPA_UDNP")
EPA_UNDP_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Lexicon, 1, 20, "EPA_UDNP")
EPA_UNDP_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Lexicon, 0.1, 2, 0.1, "EPA_UDNP")

In [73]:
EPA_UNDP_df = pd.concat([EPA_UNDP_df_1, EPA_UNDP_df_2, EPA_UNDP_df_3, EPA_UNDP_df_4])

In [74]:
#UNDP and BBC
BBC_UNDP_Lexicon = pd.concat([BBC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_UNDP_df_1 = get_metrics_df_1(Lexicon_df, BBC_UNDP_Lexicon, 1, 20, "BBC_UNDP")
BBC_UNDP_df_2 = get_metrics_df_2(Lexicon_df, BBC_UNDP_Lexicon, 0.1, 2, 0.1, "BBC_UNDP")
BBC_UNDP_df_3 = get_metrics_df_3(Lexicon_df, BBC_UNDP_Lexicon, 1, 20, "BBC_UNDP")
BBC_UNDP_df_4 = get_metrics_df_4(Lexicon_df, BBC_UNDP_Lexicon, 0.1, 2, 0.1, "BBC_UNDP")

In [75]:
BBC_UNDP_df = pd.concat([BBC_UNDP_df_1, BBC_UNDP_df_2, BBC_UNDP_df_3, BBC_UNDP_df_4])

In [76]:
#UNDP and Global Change 
UNDP_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Global_Change_Lexicon, 1, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_Global_Change")
UNDP_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Global_Change_Lexicon, 1, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_Global_Change")

In [77]:
Global_Change_UNDP_df = pd.concat([UNDP_Global_Change_df_1, UNDP_Global_Change_df_2, UNDP_Global_Change_df_3, UNDP_Global_Change_df_4])

In [78]:
#UNDP and IPCC
UNDP_IPCC_Lexicon = pd.concat([IPCC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_IPCC_Lexicon, 1, 20, "UNDP_IPCC")
UNDP_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_IPCC")
UNDP_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_IPCC_Lexicon, 1, 20, "UNDP_IPCC")
UNDP_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_IPCC")

In [79]:
IPCC_UNDP_df = pd.concat([UNDP_IPCC_df_1, UNDP_IPCC_df_2, UNDP_IPCC_df_3, UNDP_IPCC_df_4])

In [80]:
#UNDP and Wikipedia
UNDP_Wikipedia_Lexicon = pd.concat([Wikipedia_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Wikipedia_Lexicon, 1, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Wikipedia_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia")
UNDP_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Wikipedia_Lexicon, 1, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Wikipedia_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia")

In [81]:
Wikipedia_UNDP_df = pd.concat([UNDP_Wikipedia_df_1, UNDP_Wikipedia_df_2, UNDP_Wikipedia_df_3, UNDP_Wikipedia_df_4])

In [82]:
Lexicon_df_2 = pd.concat([EPA_UNDP_df, BBC_UNDP_df, Global_Change_UNDP_df, IPCC_UNDP_df, Wikipedia_UNDP_df]).reset_index(drop = True)

In [83]:
#EPA and BBC
EPA_BBC_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Lexicon, 1, 20, "BBC_EPA")
EPA_BBC_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Lexicon, 0.1, 2, 0.1, "BBC_EPA")
EPA_BBC_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Lexicon, 1, 20, "BBC_EPA")
EPA_BBC_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Lexicon, 0.1, 2, 0.1, "BBC_EPA")

In [84]:
EPA_BBC_df = pd.concat([EPA_BBC_df_1, EPA_BBC_df_2, EPA_BBC_df_3, EPA_BBC_df_4])

In [85]:
#EPA and Global Change
EPA_Global_Change_Lexicon = pd.concat([EPA_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_Global_Change_Lexicon, 1, 20, "EPA_GLobal_Change")
EPA_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_GLobal_Change")
EPA_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_Global_Change_Lexicon, 1, 20, "EPA_GLobal_Change")
EPA_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_GLobal_Change")

In [86]:
EPA_Global_Change_df = pd.concat([EPA_Global_Change_df_1, EPA_Global_Change_df_2, EPA_Global_Change_df_3, EPA_Global_Change_df_4])

In [87]:
EPA_Global_Change_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
7,BBC_Global_Change,Relative Frequency,0.8,0.890000,0.801802,0.89,0.843602
10,BBC_Global_Change,Relative Frequency,1.1,0.883333,0.849462,0.79,0.818653
9,BBC_Global_Change,Relative Frequency,1.0,0.880000,0.833333,0.80,0.816327
11,BBC_Global_Change,Relative Frequency,1.2,0.880000,0.847826,0.78,0.812500
8,BBC_Global_Change,Relative Frequency,0.9,0.876667,0.800000,0.84,0.819512
...,...,...,...,...,...,...,...
0,BBC_Global_Change,Relative Presences,0.1,0.510000,0.404858,1.00,0.576369
0,BBC_Global_Change,Relative Frequency,0.1,0.503333,0.401606,1.00,0.573066
0,EPA_GLobal_Change,Absolute Frequency,1.0,0.493333,0.396825,1.00,0.568182
0,BBC_Global_Change,Absolute Presences,1.0,0.493333,0.396825,1.00,0.568182


In [88]:
#EPA and IPCC
EPA_IPCC_Lexicon = pd.concat([EPA_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_IPCC_Lexicon, 1, 20, "EPA_IPCC")
EPA_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_IPCC")
EPA_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_IPCC_Lexicon, 1, 20, "EPA_IPCC")
EPA_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_IPCC")

In [89]:
EPA_IPCC_df = pd.concat([EPA_IPCC_df_1, EPA_IPCC_df_2, EPA_IPCC_df_3, EPA_IPCC_df_4])

In [90]:
EPA_IPCC_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
15,EPA_IPCC,Relative Frequency,1.6,0.876667,0.805825,0.83,0.817734
14,EPA_IPCC,Relative Frequency,1.5,0.876667,0.788991,0.86,0.822967
13,EPA_IPCC,Relative Frequency,1.4,0.870000,0.765217,0.88,0.818605
12,EPA_IPCC,Relative Frequency,1.3,0.863333,0.743802,0.90,0.814480
17,EPA_IPCC,Relative Frequency,1.8,0.863333,0.810526,0.77,0.789744
...,...,...,...,...,...,...,...
1,EPA_IPCC,Relative Frequency,0.2,0.463333,0.383142,1.00,0.554017
0,EPA_IPCC,Relative Presences,0.1,0.416667,0.363636,1.00,0.533333
0,EPA_IPCC,Relative Frequency,0.1,0.416667,0.363636,1.00,0.533333
0,EPA_IPCC,Absolute Frequency,1.0,0.403333,0.358423,1.00,0.527704


In [91]:
#EPA and Wikipedia
EPA_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_Wikipedia_Lexicon, 1, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia")
EPA_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_Wikipedia_Lexicon, 1, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia")

In [92]:
EPA_Wikipedia_df = pd.concat([EPA_Wikipedia_df_1, EPA_Wikipedia_df_2, EPA_Wikipedia_df_3, EPA_Wikipedia_df_4])

In [93]:
EPA_Wikipedia_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
6,EPA_Wikipedia,Relative Frequency,0.7,0.893333,0.826923,0.86,0.843137
5,EPA_Wikipedia,Relative Frequency,0.6,0.893333,0.803571,0.90,0.849057
7,EPA_Wikipedia,Relative Frequency,0.8,0.890000,0.838384,0.83,0.834171
8,EPA_Wikipedia,Relative Frequency,0.9,0.883333,0.842105,0.80,0.820513
9,EPA_Wikipedia,Relative Frequency,1.0,0.880000,0.855556,0.77,0.810526
...,...,...,...,...,...,...,...
0,EPA_Wikipedia,Absolute Presences,1.0,0.603333,0.456621,1.00,0.626959
19,EPA_Wikipedia,Absolute Presences,20.0,0.333333,0.000000,0.00,0.000000
18,EPA_Wikipedia,Absolute Presences,19.0,0.333333,0.000000,0.00,0.000000
17,EPA_Wikipedia,Absolute Presences,18.0,0.333333,0.000000,0.00,0.000000


In [94]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, EPA_BBC_df, EPA_Global_Change_df, EPA_IPCC_df, EPA_Wikipedia_df]).reset_index(drop = True)

In [95]:
#BBC and Global Change
BBC_Global_Change_Lexicon = pd.concat([BBC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, BBC_Global_Change_Lexicon, 1, 20, "BBC_Global_Change")
BBC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change")
BBC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, BBC_Global_Change_Lexicon, 1, 20, "BBC_Global_Change")
BBC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change")

In [96]:
BBC_Global_Change_df = pd.concat([BBC_Global_Change_df_1, BBC_Global_Change_df_2, BBC_Global_Change_df_3, BBC_Global_Change_df_4])

In [97]:
#BBC and IPCC
BBC_IPCC_Lexicon = pd.concat([BBC_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_IPCC_df_1 = get_metrics_df_1(Lexicon_df, BBC_IPCC_Lexicon, 1, 20, "BBC_IPCC")
BBC_IPCC_df_2 = get_metrics_df_2(Lexicon_df, BBC_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_IPCC")
BBC_IPCC_df_3 = get_metrics_df_3(Lexicon_df, BBC_IPCC_Lexicon, 1, 20, "BBC_IPCC")
BBC_IPCC_df_4 = get_metrics_df_4(Lexicon_df, BBC_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_IPCC")

In [98]:
BBC_IPCC_df = pd.concat([BBC_IPCC_df_1, BBC_IPCC_df_2, BBC_IPCC_df_3, BBC_IPCC_df_4])

In [99]:
#BBC and Wikipedia
BBC_Wikipedia_Lexicon = pd.concat([BBC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, BBC_Wikipedia_Lexicon, 1, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia")
BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, BBC_Wikipedia_Lexicon, 1, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia")

In [100]:
BBC_Wikpedia_df = pd.concat([BBC_Wikipedia_df_1, BBC_Wikipedia_df_2, BBC_Wikipedia_df_3, BBC_Wikipedia_df_4])

In [101]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, BBC_Global_Change_df, BBC_IPCC_df, BBC_Wikpedia_df]).reset_index(drop = True)

In [102]:
#Wikipedia and Global Change
Wikipedia_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Global_Change_Lexicon, 1, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Global_Change_Lexicon, 1, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "Wikipedia_Global_Change")

In [103]:
Wikpedia_Global_Change_df = pd.concat([Wikipedia_Global_Change_df_1, Wikipedia_Global_Change_df_2, Wikipedia_Global_Change_df_3, Wikipedia_Global_Change_df_4])

In [104]:
#Wikipedia and IPCC
Wikipedia_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_IPCC_Lexicon, 1, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "Wikipedia_IPCC")
Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_IPCC_Lexicon, 1, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "Wikipedia_IPCC")

In [105]:
Wikpedia_IPCC_df = pd.concat([Wikipedia_IPCC_df_1, Wikipedia_IPCC_df_2, Wikipedia_IPCC_df_3, Wikipedia_IPCC_df_4])

In [106]:
#IPCC and Global Change
Global_Change_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_IPCC_Lexicon, 1, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "Global_Change_IPCC")
Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_IPCC_Lexicon, 1, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "Global_Change_IPCC")

In [107]:
Global_Change_IPCC_df = pd.concat([Global_Change_IPCC_df_1, Global_Change_IPCC_df_2, Global_Change_IPCC_df_3, Global_Change_IPCC_df_4])

In [108]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, Wikpedia_Global_Change_df, Wikpedia_IPCC_df, Global_Change_IPCC_df]).reset_index(drop = True)

In [109]:
Lexicon_df_2["Nr"] = 2

### Three Lexicon Combined

First, we will only look at the ones that showed great performance in the previous steps

In [110]:
#EPA, UNDP and BBC
EPA_UNDP_BBC_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_Lexicon, 1, 20, "EPA_UNDP_BBC")
EPA_UNDP_BBC_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_BBC")
EPA_UNDP_BBC_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_Lexicon, 1, 20, "EPA_UNDP_BBC")
EPA_UNDP_BBC_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_BBC")

In [111]:
EPA_UNDP_BBC_df = pd.concat([EPA_UNDP_BBC_df_1, EPA_UNDP_BBC_df_2, EPA_UNDP_BBC_df_3, EPA_UNDP_BBC_df_4])

In [112]:
#EPA, UNDP and Wikipedia
EPA_UNDP_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 1, 20, "EPA_UNDP_Wikipedia")
EPA_UNDP_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_Wikipedia")
EPA_UNDP_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 1, 20, "EPA_UNDP_Wikipedia")
EPA_UNDP_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_Wikipedia")

In [113]:
EPA_UNDP_Wikipedia_df = pd.concat([EPA_UNDP_Wikipedia_df_1, EPA_UNDP_Wikipedia_df_2, EPA_UNDP_Wikipedia_df_3, EPA_UNDP_Wikipedia_df_4])

In [114]:
#EPA, UNDP and IPCC
EPA_UNDP_IPCC_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_IPCC_Lexicon, 1, 20, "EPA_UNDP_IPCC")
EPA_UNDP_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_IPCC")
EPA_UNDP_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_IPCC_Lexicon, 1, 20, "EPA_UNDP_IPCC")
EPA_UNDP_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_IPCC")

In [115]:
EPA_UNDP_IPCC_df = pd.concat([EPA_UNDP_IPCC_df_1, EPA_UNDP_IPCC_df_2, EPA_UNDP_IPCC_df_3, EPA_UNDP_IPCC_df_4])

In [116]:
#EPA, UNDP and Global Change
EPA_UNDP_Global_Change_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Global_Change_Lexicon, 1, 20, "EPA_UNDP_Global_Change")
EPA_UNDP_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_Global_Change")
EPA_UNDP_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Global_Change_Lexicon, 1, 20, "EPA_UNDP_Global_Change")
EPA_UNDP_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_UNDP_Global_Change")

In [117]:
EPA_UNDP_Global_Change_df = pd.concat([EPA_UNDP_Global_Change_df_1, EPA_UNDP_Global_Change_df_2, EPA_UNDP_Global_Change_df_3, EPA_UNDP_Global_Change_df_4])

In [118]:
Lexicon_df_3 = pd.concat([EPA_UNDP_BBC_df, EPA_UNDP_Wikipedia_df, EPA_UNDP_IPCC_df, EPA_UNDP_Global_Change_df])

In [119]:
#EPA, BBC and Wikipedia
EPA_BBC_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Wikipedia_Lexicon, 1, 20, "EPA_BBC_Wikipedia")
EPA_BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_BBC_Wikipedia")
EPA_BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Wikipedia_Lexicon, 1, 20, "EPA_BBC_Wikipedia")
EPA_BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "EPA_BBC_Wikipedia")

In [267]:
EPA_BBC_Wikipedia_df = pd.concat([EPA_BBC_Wikipedia_df_1, EPA_BBC_Wikipedia_df_2, EPA_BBC_Wikipedia_df_3, EPA_BBC_Wikipedia_df_4])

In [121]:
#EPA, BBC and IPCC
EPA_BBC_IPCC_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_IPCC_Lexicon, 1, 20, "EPA_BBC_IPCC")
EPA_BBC_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_BBC_IPCC")
EPA_BBC_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_IPCC_Lexicon, 1, 20, "EPA_BBC_IPCC")
EPA_BBC_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_BBC_IPCC")

In [122]:
EPA_BBC_IPCC_df = pd.concat([EPA_BBC_IPCC_df_1, EPA_BBC_IPCC_df_2, EPA_BBC_IPCC_df_3, EPA_BBC_IPCC_df_4])

In [123]:
#EPA, BBC and Global Change
EPA_BBC_Global_Change_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Global_Change_Lexicon, 1, 20, "EPA_BBC_Global_Change")
EPA_BBC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_BBC_Global_Change")
EPA_BBC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Global_Change_Lexicon, 1, 20, "EPA_BBC_Global_Change")
EPA_BBC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_BBC_Global_Change")

In [124]:
EPA_BBC_Global_Change_df = pd.concat([EPA_BBC_Global_Change_df_1, EPA_BBC_Global_Change_df_2, EPA_BBC_Global_Change_df_3, EPA_BBC_Global_Change_df_4])

In [125]:
Lexicon_df_3 = pd.concat([Lexicon_df_3, EPA_UNDP_IPCC_df, EPA_BBC_IPCC_df, EPA_BBC_Global_Change_df])

In [126]:
#EPA, Wikipedia and Global Change
EPA_Wikipedia_Global_Change_Lexicon = pd.concat([EPA_Lexicon, Wikipedia_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_Wikipedia_Global_Change_Lexicon, 1, 20, "EPA_Wikipedia_Global_Change")
EPA_Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia_Global_Change")
EPA_Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_Wikipedia_Global_Change_Lexicon, 1, 20, "EPA_Wikipedia_Global_Change")
EPA_Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia_Global_Change")

In [127]:
EPA_Wikipedia_Global_Change_df = pd.concat([EPA_Wikipedia_Global_Change_df_1, EPA_Wikipedia_Global_Change_df_2, EPA_Wikipedia_Global_Change_df_3, EPA_Wikipedia_Global_Change_df_4])

In [128]:
#EPA, Wikipedia and IPCC
EPA_Wikipedia_IPCC_Lexicon = pd.concat([EPA_Lexicon, Wikipedia_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_Wikipedia_IPCC_Lexicon, 1, 20, "EPA_Wikipedia_IPCC")
EPA_Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia_IPCC")
EPA_Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_Wikipedia_IPCC_Lexicon, 1, 20, "EPA_Wikipedia_IPCC")
EPA_Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "EPA_Wikipedia_IPCC")

In [129]:
EPA_Wikipedia_IPCC_df = pd.concat([EPA_Wikipedia_IPCC_df_1, EPA_Wikipedia_IPCC_df_2, EPA_Wikipedia_IPCC_df_3, EPA_Wikipedia_IPCC_df_4])

In [130]:
#EPA, IPCC en Global Change
EPA_IPCC_Global_Change_Lexicon = pd.concat([EPA_Lexicon, Global_Change_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_IPCC_Global_Change_Lexicon, 1, 20, "EPA_Global_Change_IPCC")
EPA_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_IPCC_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_Global_Change_IPCC")
EPA_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_IPCC_Global_Change_Lexicon, 1, 20, "EPA_Global_Change_IPCC")
EPA_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_IPCC_Global_Change_Lexicon, 0.1, 2, 0.1, "EPA_Global_Change_IPCC")

In [131]:
EPA_Global_Change_IPCC_df = pd.concat([EPA_Global_Change_IPCC_df_1, EPA_Global_Change_IPCC_df_2, EPA_Global_Change_IPCC_df_3, EPA_Global_Change_IPCC_df_4])

In [132]:
Lexicon_df_3 = pd.concat([Lexicon_df_3, EPA_Wikipedia_Global_Change_df, EPA_Wikipedia_IPCC_df, EPA_Global_Change_IPCC_df])

In [133]:
#UNDP, BBC and Wikipedia
UNDP_BBC_Wikipedia_Lexicon = pd.concat([UNDP_Lexicon, BBC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, UNDP_BBC_Wikipedia_Lexicon, 1, 20, "UNDP_BBC_Wikipedia")
UNDP_BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, UNDP_BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "UNDP_BBC_Wikipedia")
UNDP_BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, UNDP_BBC_Wikipedia_Lexicon, 1, 20, "UNDP_BBC_Wikipedia")
UNDP_BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, UNDP_BBC_Wikipedia_Lexicon, 0.1, 2, 0.1, "UNDP_BBC_Wikipedia")

In [134]:
UNDP_BBC_Wikipedia_df = pd.concat([UNDP_BBC_Wikipedia_df_1, UNDP_BBC_Wikipedia_df_2, UNDP_BBC_Wikipedia_df_3, UNDP_BBC_Wikipedia_df_4])

In [135]:
#UNDP, BBC and IPCC
UNDP_BBC_IPCC_Lexicon = pd.concat([UNDP_Lexicon, BBC_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_BBC_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_BBC_IPCC_Lexicon, 1, 20, "UNDP_BBC_IPCC")
UNDP_BBC_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_BBC_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_BBC_IPCC")
UNDP_BBC_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_BBC_IPCC_Lexicon, 1, 20, "UNDP_BBC_IPCC")
UNDP_BBC_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_BBC_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_BBC_IPCC")

In [136]:
UNDP_BBC_IPCC_df = pd.concat([UNDP_BBC_IPCC_df_1, UNDP_BBC_IPCC_df_2, UNDP_BBC_IPCC_df_3, UNDP_BBC_IPCC_df_4])

In [137]:
#UNDP, BBC and Global Change
UNDP_BBC_Global_Change_Lexicon = pd.concat([UNDP_Lexicon, BBC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_BBC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_BBC_Global_Change_Lexicon, 1, 20, "UNDP_BBC_Global_Change")
UNDP_BBC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_BBC_Global_Change")
UNDP_BBC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_BBC_Global_Change_Lexicon, 1, 20, "UNDP_BBC_Global_Change")
UNDP_BBC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_BBC_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_BBC_Global_Change")

In [138]:
UNDP_BBC_Global_Change_df = pd.concat([UNDP_BBC_Global_Change_df_1, UNDP_BBC_Global_Change_df_2, UNDP_BBC_Global_Change_df_3, UNDP_BBC_Global_Change_df_4])

In [139]:
Lexicon_df_3 = pd.concat([Lexicon_df_3, UNDP_BBC_Wikipedia_df, UNDP_BBC_IPCC_df, UNDP_BBC_Global_Change_df])

In [140]:
#UNDP, Wikipedia and Global Change
UNDP_Wikipedia_Global_Change_Lexicon = pd.concat([UNDP_Lexicon, Wikipedia_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Wikipedia_Global_Change_Lexicon, 1, 20, "UNDP_Wikipedia_Global_Change")
UNDP_Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia_Global_Change")
UNDP_Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Wikipedia_Global_Change_Lexicon, 1, 20, "UNDP_Wikipedia_Global_Change")
UNDP_Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia_Global_Change")

In [141]:
UNDP_Wikipedia_Global_Change_df = pd.concat([UNDP_Wikipedia_Global_Change_df_1, UNDP_Wikipedia_Global_Change_df_2, UNDP_Wikipedia_Global_Change_df_3, UNDP_Wikipedia_Global_Change_df_4])

In [142]:
#UNDP, Wikipedia and IPCC
UNDP_Wikipedia_IPCC_Lexicon = pd.concat([UNDP_Lexicon, Wikipedia_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Wikipedia_IPCC_Lexicon, 1, 20, "UNDP_Wikipedia_IPCC")
UNDP_Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia_IPCC")
UNDP_Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Wikipedia_IPCC_Lexicon, 1, 20, "UNDP_Wikipedia_IPCC")
UNDP_Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_Wikipedia_IPCC")

In [143]:
UNDP_Wikipedia_IPCC_df = pd.concat([UNDP_Wikipedia_IPCC_df_1, UNDP_Wikipedia_IPCC_df_2, UNDP_Wikipedia_IPCC_df_3, UNDP_Wikipedia_IPCC_df_4])

In [144]:
#UNDP, Global_Change and IPCC
UNDP_Global_Change_IPCC_Lexicon = pd.concat([UNDP_Lexicon, Global_Change_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Global_Change_IPCC_Lexicon, 1, 20, "UNDP_Global_Change_IPCC")
UNDP_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_Global_Change_IPCC")
UNDP_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Global_Change_IPCC_Lexicon, 1, 20, "UNDP_Global_Change_IPCC")
UNDP_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "UNDP_Global_Change_IPCC")

In [145]:
UNDP_Global_Change_IPCC_df = pd.concat([UNDP_Global_Change_IPCC_df_1, UNDP_Global_Change_IPCC_df_2, UNDP_Global_Change_IPCC_df_3, UNDP_Global_Change_IPCC_df_4])

In [146]:
Lexicon_df_3 = pd.concat([Lexicon_df_3, UNDP_Wikipedia_Global_Change_df, UNDP_Wikipedia_IPCC_df, UNDP_Global_Change_IPCC_df])

In [147]:
#BBC, Wikipedia and Global Change
BBC_Wikipedia_Global_Change_Lexicon = pd.concat([BBC_Lexicon, Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, BBC_Wikipedia_Global_Change_Lexicon, 1, 20, "BBC_Wikipedia_Global_Change")
BBC_Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, BBC_Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia_Global_Change")
BBC_Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, BBC_Wikipedia_Global_Change_Lexicon, 1, 20, "BBC_Wikipedia_Global_Change")
BBC_Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, BBC_Wikipedia_Global_Change_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia_Global_Change")

In [148]:
BBC_Wikipedia_Global_Change_df = pd.concat([BBC_Wikipedia_Global_Change_df_1, BBC_Wikipedia_Global_Change_df_2, BBC_Wikipedia_Global_Change_df_3, BBC_Wikipedia_Global_Change_df_4])

In [149]:
#BBC, Wikipedia and IPCC
BBC_Wikipedia_IPCC_Lexicon = pd.concat([BBC_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, BBC_Wikipedia_IPCC_Lexicon, 1, 20, "BBC_Wikipedia_IPCC")
BBC_Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, BBC_Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia_IPCC")
BBC_Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, BBC_Wikipedia_IPCC_Lexicon, 1, 20, "BBC_Wikipedia_IPCC")
BBC_Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, BBC_Wikipedia_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_Wikipedia_IPCC")

In [150]:
BBC_Wikipedia_IPCC_df = pd.concat([BBC_Wikipedia_IPCC_df_1, BBC_Wikipedia_IPCC_df_2, BBC_Wikipedia_IPCC_df_3, BBC_Wikipedia_IPCC_df_4])

In [151]:
#BBC, Global Change and IPCC
BBC_Global_Change_IPCC_Lexicon = pd.concat([BBC_Lexicon, IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, BBC_Global_Change_IPCC_Lexicon, 1, 20, "BBC_Global_Change_IPCC")
BBC_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, BBC_Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change_IPCC")
BBC_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, BBC_Global_Change_IPCC_Lexicon, 1, 20, "BBC_Global_Change_IPCC")
BBC_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, BBC_Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "BBC_Global_Change_IPCC")

In [152]:
BBC_Global_Change_IPCC_df = pd.concat([BBC_Global_Change_IPCC_df_1, BBC_Global_Change_IPCC_df_2, BBC_Global_Change_IPCC_df_3, BBC_Global_Change_IPCC_df_4])

In [153]:
Lexicon_df_3 = pd.concat([Lexicon_df_3, BBC_Wikipedia_Global_Change_df, BBC_Wikipedia_IPCC_df, BBC_Global_Change_IPCC_df])

In [154]:
#Wikipedia, Global Change and IPCC
Wikipedia_Global_Change_IPCC_Lexicon = pd.concat([Wikipedia_Lexicon, IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Global_Change_IPCC_Lexicon, 1, 20, "Wikipedia_Global_Change_IPCC")
Wikipedia_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "Wikipedia_Global_Change_IPCC")
Wikipedia_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Global_Change_IPCC_Lexicon, 1, 20, "Wikipedia_Global_Change_IPCC")
Wikipedia_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Global_Change_IPCC_Lexicon, 0.1, 2, 0.1, "Wikipedia_Global_Change_IPCC")

In [155]:
Wikipedia_Global_Change_IPCC_df = pd.concat([Wikipedia_Global_Change_IPCC_df_1, Wikipedia_Global_Change_IPCC_df_2, Wikipedia_Global_Change_IPCC_df_3, Wikipedia_Global_Change_IPCC_df_4])

In [156]:
Lexicon_df_3 = pd.concat([Lexicon_df_3, Wikipedia_Global_Change_IPCC_df])

In [157]:
Lexicon_df_3["Nr"] = 3

### Combine Four Lexicons

In [158]:
#EPA, UNDP, BBC and Wikipedia
EPA_UNDP_BBC_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, Wikipedia_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_BBC")
EPA_UNDP_BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_BBC")
EPA_UNDP_BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_BBC")
EPA_UNDP_BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_BBC")

In [159]:
EPA_UNDP_BBC_Wikipedia_df = pd.concat([EPA_UNDP_BBC_Wikipedia_df_1, EPA_UNDP_BBC_Wikipedia_df_2, EPA_UNDP_BBC_Wikipedia_df_3, EPA_UNDP_BBC_Wikipedia_df_4])

In [160]:
#EPA, UNDP, BBC and IPCC
EPA_UNDP_BBC_IPCC_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, IPCC_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_IPCC_Lexicon, 0, 20, "EPA_UNDP_BBC_IPCC")
EPA_UNDP_BBC_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_IPCC")
EPA_UNDP_BBC_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_IPCC_Lexicon, 0, 20, "EPA_UNDP_BBC_IPCC")
EPA_UNDP_BBC_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_IPCC")

In [161]:
EPA_UNDP_BBC_IPCC_df = pd.concat([EPA_UNDP_BBC_IPCC_df_1, EPA_UNDP_BBC_IPCC_df_2, EPA_UNDP_BBC_IPCC_df_3, EPA_UNDP_BBC_IPCC_df_4])

In [162]:
#EPA, UNDP, BBC and Global Change
EPA_UNDP_BBC_Global_Change_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, Global_Change_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_Global_Change_Lexicon, 0, 20, "EPA_UNDP_BBC_Global_Change")
EPA_UNDP_BBC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_Global_Change")
EPA_UNDP_BBC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_Global_Change_Lexicon, 0, 20, "EPA_UNDP_BBC_Global_Change")
EPA_UNDP_BBC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_Global_Change")

In [163]:
EPA_UNDP_BBC_Global_Change_df = pd.concat([EPA_UNDP_BBC_Global_Change_df_1, EPA_UNDP_BBC_Global_Change_df_2, EPA_UNDP_BBC_Global_Change_df_3, EPA_UNDP_BBC_Global_Change_df_4])

In [164]:
Lexicon_df_4 = pd.concat([EPA_UNDP_BBC_Wikipedia_df, EPA_UNDP_BBC_IPCC_df, EPA_UNDP_BBC_Global_Change_df])

In [165]:
#EPA, UNDP, Wikipedia and Global Change
EPA_UNDP_Wikipedia_Global_Change_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Wikipedia_Global_Change_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_Global_Change")
EPA_UNDP_Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_Global_Change")
EPA_UNDP_Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Wikipedia_Global_Change_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_Global_Change")
EPA_UNDP_Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_Global_Change")

In [166]:
EPA_UNDP_Wikipedia_Global_Change_df = pd.concat([EPA_UNDP_Wikipedia_Global_Change_df_1, EPA_UNDP_Wikipedia_Global_Change_df_2, EPA_UNDP_Wikipedia_Global_Change_df_3, EPA_UNDP_Wikipedia_Global_Change_df_4])

In [167]:
#EPA, UNDP, Wikipedia and IPCC
EPA_UNDP_Wikipedia_IPCC_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_IPCC")
EPA_UNDP_Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_IPCC")
EPA_UNDP_Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_IPCC")
EPA_UNDP_Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_IPCC")

In [168]:
EPA_UNDP_Wikipedia_IPCC_df = pd.concat([EPA_UNDP_Wikipedia_IPCC_df_1, EPA_UNDP_Wikipedia_IPCC_df_2, EPA_UNDP_Wikipedia_IPCC_df_3, EPA_UNDP_Wikipedia_IPCC_df_4])

In [169]:
#EPA, UNDP, Global Change and IPCC
EPA_UNDP_Global_Change_IPCC_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon, IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Global_Change_IPCC_Lexicon, 0, 20, "EPA_UNDP_Global_Change_IPCC")
EPA_UNDP_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_Global_Change_IPCC")
EPA_UNDP_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Global_Change_IPCC_Lexicon, 0, 20, "EPA_UNDP_Global_Change_IPCC")
EPA_UNDP_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_Global_Change_IPCC")

In [170]:
EPA_UNDP_Global_Change_IPCC_df = pd.concat([EPA_UNDP_Global_Change_IPCC_df_1, EPA_UNDP_Global_Change_IPCC_df_2, EPA_UNDP_Global_Change_IPCC_df_3, EPA_UNDP_Global_Change_IPCC_df_4])

In [171]:
Lexicon_df_4 = pd.concat([Lexicon_df_4, EPA_UNDP_Wikipedia_Global_Change_df, EPA_UNDP_Wikipedia_IPCC_df, EPA_UNDP_Global_Change_IPCC_df])

In [172]:
#EPA, BBC, Wikipedia and IPCC
EPA_BBC_Wikipedia_IPCC_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Lexicon, 0, 20, "EPA_BBC_Wikipedia_IPCC")
EPA_BBC_Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "EPA_BBC_Wikipedia_IPCC")
EPA_BBC_Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Lexicon, 0, 20, "EPA_BBC_Wikipedia_IPCC")
EPA_BBC_Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "EPA_BBC_Wikipedia_IPCC")

In [173]:
EPA_BBC_Wikipedia_IPCC_df = pd.concat([EPA_BBC_Wikipedia_IPCC_df_1, EPA_BBC_Wikipedia_IPCC_df_2, EPA_BBC_Wikipedia_IPCC_df_3, EPA_BBC_Wikipedia_IPCC_df_4])

In [174]:
#EPA, BBC, Wikipedia and Global Change
EPA_BBC_Wikipedia_Global_Change_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon, Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Wikipedia_Global_Change_Lexicon, 0, 20, "EPA_BBC_Wikipedia_Global_Change")
EPA_BBC_Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "EPA_BBC_Wikipedia_Global_Change")
EPA_BBC_Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Wikipedia_Global_Change_Lexicon, 0, 20, "EPA_BBC_Wikipedia_Global_Change")
EPA_BBC_Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "EPA_BBC_Wikipedia_Global_Change")

In [175]:
EPA_BBC_Wikipedia_Global_Change_df = pd.concat([EPA_BBC_Wikipedia_Global_Change_df_1, EPA_BBC_Wikipedia_Global_Change_df_2, EPA_BBC_Wikipedia_Global_Change_df_3, EPA_BBC_Wikipedia_Global_Change_df_4])

In [176]:
#EPA, Wikipedia, IPCC and Global Change
EPA_Wikipedia_Global_Change_IPCC_Lexicon = pd.concat([EPA_Lexicon, IPCC_Lexicon, Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Wikipedia_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_Wikipedia_Global_Change_IPCC_Lexicon, 0, 20, "EPA_Wikipedia_Global_Change_IPCC")
EPA_Wikipedia_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_Wikipedia_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "EPA_Wikipedia_Global_Change_IPCC")
EPA_Wikipedia_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_Wikipedia_Global_Change_IPCC_Lexicon, 0, 20, "EPA_Wikipedia_Global_Change_IPCC")
EPA_Wikipedia_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_Wikipedia_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "EPA_Wikipedia_Global_Change_IPCC")

In [177]:
EPA_Wikipedia_Global_Change_IPCC_df = pd.concat([EPA_Wikipedia_Global_Change_IPCC_df_1, EPA_Wikipedia_Global_Change_IPCC_df_2, EPA_Wikipedia_Global_Change_IPCC_df_3, EPA_Wikipedia_Global_Change_IPCC_df_4])

In [178]:
Lexicon_df_4 = pd.concat([Lexicon_df_4, EPA_BBC_Wikipedia_IPCC_df, EPA_BBC_Wikipedia_Global_Change_df, EPA_Wikipedia_Global_Change_IPCC_df])

In [179]:
#UNDP, BBC, Wikipedia and Global Change
UNDP_BBC_Wikipedia_Global_Change_Lexicon = pd.concat([UNDP_Lexicon, BBC_Lexicon, Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_BBC_Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 20, "UNDP_BBC_Wikipedia_Global_Change")
UNDP_BBC_Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_BBC_Wikipedia_Global_Change")
UNDP_BBC_Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 20, "UNDP_BBC_Wikipedia_Global_Change")
UNDP_BBC_Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_BBC_Wikipedia_Global_Change")

In [180]:
UNDP_BBC_Wikipedia_Global_Change_df = pd.concat([UNDP_BBC_Wikipedia_Global_Change_df_1, UNDP_BBC_Wikipedia_Global_Change_df_2, UNDP_BBC_Wikipedia_Global_Change_df_3, UNDP_BBC_Wikipedia_Global_Change_df_4])

In [181]:
#UNDP, BBC, Wikipedia and IPCC
UNDP_BBC_Wikipedia_IPCC_Lexicon = pd.concat([UNDP_Lexicon, BBC_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_BBC_Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 20, "UNDP_BBC_Wikipedia_IPCC")
UNDP_BBC_Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "UNDP_BBC_Wikipedia_IPCC")
UNDP_BBC_Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 20, "UNDP_BBC_Wikipedia_IPCC")
UNDP_BBC_Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "UNDP_BBC_Wikipedia_IPCC")

In [182]:
UNDP_BBC_Wikipedia_IPCC_df = pd.concat([UNDP_BBC_Wikipedia_IPCC_df_1, UNDP_BBC_Wikipedia_IPCC_df_2, UNDP_BBC_Wikipedia_IPCC_df_3, UNDP_BBC_Wikipedia_IPCC_df_4])

In [183]:
Lexicon_df_4 = pd.concat([Lexicon_df_4, UNDP_BBC_Wikipedia_Global_Change_df, UNDP_BBC_Wikipedia_IPCC_df])

In [184]:
#UNDP, BBC, Global Change and IPCC
UNDP_BBC_Global_Change_IPCC_Lexicon = pd.concat([UNDP_Lexicon, BBC_Lexicon, IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_BBC_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_BBC_Global_Change_IPCC_Lexicon, 0, 20, "UNDP_BBC_Global_Change_IPCC")
UNDP_BBC_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_BBC_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "UNDP_BBC_Global_Change_IPCC")
UNDP_BBC_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_BBC_Global_Change_IPCC_Lexicon, 0, 20, "UNDP_BBC_Global_Change_IPCC")
UNDP_BBC_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_BBC_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "UNDP_BBC_Global_Change_IPCC")

In [185]:
UNDP_BBC_Global_Change_IPCC_df = pd.concat([UNDP_BBC_Global_Change_IPCC_df_1, UNDP_BBC_Global_Change_IPCC_df_2, UNDP_BBC_Global_Change_IPCC_df_3, UNDP_BBC_Global_Change_IPCC_df_4])

In [186]:
#UNDP, Wikipedia, Global Change and IPCC
UNDP_Wikipedia_Global_Change_IPCC_Lexicon = pd.concat([UNDP_Lexicon, Wikipedia_Lexicon, IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Wikipedia_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Wikipedia_Global_Change_IPCC_Lexicon, 0, 20, "UNDP_Wikipedia_Global_Change_IPCC")
UNDP_Wikipedia_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Wikipedia_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "UNDP_Wikipedia_Global_Change_IPCC")
UNDP_Wikipedia_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Wikipedia_Global_Change_IPCC_Lexicon, 0, 20, "UNDP_Wikipedia_Global_Change_IPCC")
UNDP_Wikipedia_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Wikipedia_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "UNDP_Wikipedia_Global_Change_IPCC")

In [187]:
UNDP_Wikipedia_Global_Change_IPCC_df = pd.concat([UNDP_Wikipedia_Global_Change_IPCC_df_1, UNDP_Wikipedia_Global_Change_IPCC_df_2, UNDP_Wikipedia_Global_Change_IPCC_df_3, UNDP_Wikipedia_Global_Change_IPCC_df_4])

In [188]:
#BBC, Wikipedia, Global Change and IPCC
BBC_Wikipedia_Global_Change_IPCC_Lexicon = pd.concat([BBC_Lexicon, Wikipedia_Lexicon, IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Wikipedia_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, BBC_Wikipedia_Global_Change_IPCC_Lexicon, 0, 20, "BBC_Wikipedia_Global_Change_IPCC")
BBC_Wikipedia_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, BBC_Wikipedia_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "BBC_Wikipedia_Global_Change_IPCC")
BBC_Wikipedia_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, BBC_Wikipedia_Global_Change_IPCC_Lexicon, 0, 20, "BBC_Wikipedia_Global_Change_IPCC")
BBC_Wikipedia_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, BBC_Wikipedia_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "BBC_Wikipedia_Global_Change_IPCC")

In [189]:
BBC_Wikipedia_Global_Change_IPCC_df = pd.concat([BBC_Wikipedia_Global_Change_IPCC_df_1, BBC_Wikipedia_Global_Change_IPCC_df_2, BBC_Wikipedia_Global_Change_IPCC_df_3, BBC_Wikipedia_Global_Change_IPCC_df_4])

In [190]:
Lexicon_df_4 = pd.concat([Lexicon_df_4, UNDP_BBC_Global_Change_IPCC_df, UNDP_Wikipedia_Global_Change_IPCC_df, BBC_Wikipedia_Global_Change_IPCC_df])

In [None]:
#EPA, BBC, Global Change and IPCC
EPA_BBC_Global_Change_IPCC_Lexicon = pd.concat([BBC_Lexicon, EPA_Lexicon, IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Global_Change_IPCC_Lexicon, 0, 20, "EPA_BBC_Global_Change_IPCC")
EPA_BBC_Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "EPA_BBC_Global_Change_IPCC")
EPA_BBC_Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Global_Change_IPCC_Lexicon, 0, 20, "EPA_BBC_Global_Change_IPCC")
EPA_BBC_Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Global_Change_IPCC_Lexicon, 0, 2, 0.1, "EPA_BBC_Global_Change_IPCC")

In [None]:
EPA_BBC_Global_Change_IPCC_df = pd.concat([EPA_BBC_Global_Change_IPCC_df_1, EPA_BBC_Global_Change_IPCC_df_2, EPA_BBC_Global_Change_IPCC_df_3, EPA_BBC_Global_Change_IPCC_df_4])

In [None]:
Lexicon_df_4 = pd.concat([Lexicon_df_4, EPA_BBC_Global_Change_IPCC_df]).reset_index(drop = True)

In [191]:
Lexicon_df_4["Nr"] = 4

### Combining Five Lexicons

In [193]:
#EPA, UNDP, BBC, Wikipedia, IPCC
EPA_UNDP_BBC_Wikipedia_IPCC_Lexicon = pd.concat([BBC_Lexicon, Wikipedia_Lexicon, IPCC_Lexicon, EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 20, "EPA_UNDP_BBC_Wikipedia_IPCC")
EPA_UNDP_BBC_Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_Wikipedia_IPCC")
EPA_UNDP_BBC_Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 20, "EPA_UNDP_BBC_Wikipedia_IPCC")
EPA_UNDP_BBC_Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_Wikipedia_IPCC")

In [194]:
EPA_UNDP_BBC_Wikipedia_IPCC_df = pd.concat([EPA_UNDP_BBC_Wikipedia_IPCC_df_1, EPA_UNDP_BBC_Wikipedia_IPCC_df_2, EPA_UNDP_BBC_Wikipedia_IPCC_df_3, EPA_UNDP_BBC_Wikipedia_IPCC_df_4])

In [196]:
#EPA, UNDP, BBC, Wikipedia, Global Change
EPA_UNDP_BBC_Wikipedia_Global_Change_Lexicon = pd.concat([BBC_Lexicon, Wikipedia_Lexicon, Global_Change_Lexicon, EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 20, "EPA_UNDP_BBC_Wikipedia_Global_Change")
EPA_UNDP_BBC_Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_Wikipedia_Global_Change")
EPA_UNDP_BBC_Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 20, "EPA_UNDP_BBC_Wikipedia_Global_Change")
EPA_UNDP_BBC_Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_Wikipedia_Global_Change")

In [197]:
EPA_UNDP_BBC_Wikipedia_Global_Change_df = pd.concat([EPA_UNDP_BBC_Wikipedia_Global_Change_df_1, EPA_UNDP_BBC_Wikipedia_Global_Change_df_2, EPA_UNDP_BBC_Wikipedia_Global_Change_df_3, EPA_UNDP_BBC_Wikipedia_Global_Change_df_4])

In [198]:
#EPA, UNDP, BBC, IPCC, Global Change
EPA_UNDP_BBC_IPCC_Global_Change_Lexicon = pd.concat([BBC_Lexicon, IPCC_Lexicon, Global_Change_Lexicon, EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_BBC_IPCC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_BBC_IPCC_Global_Change_Lexicon, 0, 20, "EPA_UNDP_BBC_IPCC_Global_Change")
EPA_UNDP_BBC_IPCC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_BBC_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_IPCC_Global_Change")
EPA_UNDP_BBC_IPCC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_BBC_IPCC_Global_Change_Lexicon, 0, 20, "EPA_UNDP_BBC_IPCC_Global_Change")
EPA_UNDP_BBC_IPCC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_BBC_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_BBC_IPCC_Global_Change")

In [199]:
EPA_UNDP_BBC_IPCC_Global_Change_df = pd.concat([EPA_UNDP_BBC_IPCC_Global_Change_df_1, EPA_UNDP_BBC_IPCC_Global_Change_df_2, EPA_UNDP_BBC_IPCC_Global_Change_df_3, EPA_UNDP_BBC_IPCC_Global_Change_df_4])

In [200]:
Lexicon_df_5 = pd.concat([EPA_UNDP_BBC_Wikipedia_IPCC_df, EPA_UNDP_BBC_Wikipedia_Global_Change_df, EPA_UNDP_BBC_IPCC_Global_Change_df])

In [201]:
#EPA, UNDP, Wikipedia, IPCC, Global Change
EPA_UNDP_Wikipedia_IPCC_Global_Change_Lexicon = pd.concat([Wikipedia_Lexicon, IPCC_Lexicon, Global_Change_Lexicon, EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_Wikipedia_IPCC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Global_Change_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_IPCC_Global_Change")
EPA_UNDP_Wikipedia_IPCC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_IPCC_Global_Change")
EPA_UNDP_Wikipedia_IPCC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Global_Change_Lexicon, 0, 20, "EPA_UNDP_Wikipedia_IPCC_Global_Change")
EPA_UNDP_Wikipedia_IPCC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Wikipedia_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_UNDP_Wikipedia_IPCC_Global_Change")

In [202]:
EPA_UNDP_Wikipedia_IPCC_Global_Change_df = pd.concat([EPA_UNDP_Wikipedia_IPCC_Global_Change_df_1, EPA_UNDP_Wikipedia_IPCC_Global_Change_df_2, EPA_UNDP_Wikipedia_IPCC_Global_Change_df_3, EPA_UNDP_Wikipedia_IPCC_Global_Change_df_4])

In [203]:
#EPA, BBC, Wikipedia, IPCC, Global Change
EPA_BBC_Wikipedia_IPCC_Global_Change_Lexicon = pd.concat([Wikipedia_Lexicon, IPCC_Lexicon, Global_Change_Lexicon, EPA_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_Wikipedia_IPCC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 20, "EPA_BBC_Wikipedia_IPCC_Global_Change")
EPA_BBC_Wikipedia_IPCC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_BBC_Wikipedia_IPCC_Global_Change")
EPA_BBC_Wikipedia_IPCC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 20, "EPA_BBC_Wikipedia_IPCC_Global_Change")
EPA_BBC_Wikipedia_IPCC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "EPA_BBC_Wikipedia_IPCC_Global_Change")

In [204]:
EPA_BBC_Wikipedia_IPCC_Global_Change_df = pd.concat([EPA_BBC_Wikipedia_IPCC_Global_Change_df_1, EPA_BBC_Wikipedia_IPCC_Global_Change_df_2, EPA_BBC_Wikipedia_IPCC_Global_Change_df_3, EPA_BBC_Wikipedia_IPCC_Global_Change_df_4])

In [205]:
#UNDP, BBC, Wikipedia, IPCC, Global Change
UNDP_BBC_Wikipedia_IPCC_Global_Change_Lexicon = pd.concat([Wikipedia_Lexicon, IPCC_Lexicon, Global_Change_Lexicon, UNDP_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_BBC_Wikipedia_IPCC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 20, "UNDP_BBC_Wikipedia_IPCC_Global_Change")
UNDP_BBC_Wikipedia_IPCC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_BBC_Wikipedia_IPCC_Global_Change")
UNDP_BBC_Wikipedia_IPCC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 20, "UNDP_BBC_Wikipedia_IPCC_Global_Change")
UNDP_BBC_Wikipedia_IPCC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_BBC_Wikipedia_IPCC_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_BBC_Wikipedia_IPCC_Global_Change")

In [206]:
UNDP_BBC_Wikipedia_IPCC_Global_Change_df = pd.concat([UNDP_BBC_Wikipedia_IPCC_Global_Change_df_1, UNDP_BBC_Wikipedia_IPCC_Global_Change_df_2, UNDP_BBC_Wikipedia_IPCC_Global_Change_df_3, UNDP_BBC_Wikipedia_IPCC_Global_Change_df_4])

In [207]:
Lexicon_df_5 = pd.concat([Lexicon_df_5, EPA_UNDP_Wikipedia_IPCC_Global_Change_df, EPA_BBC_Wikipedia_IPCC_Global_Change_df, UNDP_BBC_Wikipedia_IPCC_Global_Change_df])

In [208]:
Lexicon_df_5["Nr"] = 5

### Combining Six Lexicons

In [216]:
#Full
Full_Lexicon = pd.concat([Global_Change_Lexicon, IPCC_Lexicon, EPA_Lexicon, Wikipedia_Lexicon, BBC_Lexicon, UNDP_Lexicon]).drop_duplicates(subset = ["Lexicon"])
Full_df_1 = get_metrics_df_1(Lexicon_df, Full_Lexicon, 0, 20, "Full")
Full_df_2 = get_metrics_df_2(Lexicon_df, Full_Lexicon, 0, 2, 0.1, "Full")
Full_df_3 = get_metrics_df_3(Lexicon_df, Full_Lexicon, 0, 20, "Full")
Full_df_4 = get_metrics_df_4(Lexicon_df, Full_Lexicon, 0, 2, 0.1, "Full")

In [217]:
Full_df = pd.concat([Full_df_1, Full_df_2, Full_df_3, Full_df_4])

In [218]:
Full_df["Nr"] = 6

In [219]:
Lexicon_Metrics_df = pd.concat([Lexicon_df_1, Lexicon_df_2, Lexicon_df_3, Lexicon_df_4, Lexicon_df_5, Full_df])

In [220]:
Lexicon_Metrics_df.to_parquet("Lexicon_Tagging_Metrics.parquet")

## 3.2. Hugging Face

In [336]:
bert1 = get_metrics_df_hugging_face(Lexicon_df, 'Text',"climatebert/environmental-claims",512, "climatebert/environmental-claims")

In [337]:
bert2 = get_metrics_df_hugging_face(Lexicon_df, "Text", "climatebert/distilroberta-base-climate-detector", 512, "climatebert/distilroberta-base-climate-detector")

In [338]:
bert_df = pd.concat([bert1, bert2]).reset_index(drop = True)

# 4 Final Selection

In [313]:
metrics = pd.read_parquet("Lexicon_Tagging_Metrics.parquet")

In [314]:
#Check if all combinations are present
metrics.groupby("Nr")["Lexicon"].nunique()

Nr
1.0     6
2.0    15
3.0    20
4.0    15
5.0     6
6.0     1
Name: Lexicon, dtype: int64

In [399]:
#Check performance of top 10, for each nummer of combinations
#1
metrics[(metrics["Nr"] == 1) & (metrics["Lexicon"] == "UNDP") & (metrics["Technique"] == "Absolute Presences")].sort_values(by = "Accuracy", ascending = False).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
0,UNDP,Absolute Presences,3.0,0.833333,0.797619,0.67,0.728261,1.0
1,UNDP,Absolute Presences,4.0,0.75,0.931034,0.27,0.418605,1.0
2,UNDP,Absolute Presences,5.0,0.696667,1.0,0.09,0.165138,1.0
3,UNDP,Absolute Presences,6.0,0.686667,1.0,0.06,0.113208,1.0
4,UNDP,Absolute Presences,7.0,0.673333,1.0,0.02,0.039216,1.0
5,UNDP,Absolute Presences,8.0,0.673333,1.0,0.02,0.039216,1.0
6,UNDP,Absolute Presences,2.0,0.663333,0.497512,1.0,0.664452,1.0
7,UNDP,Absolute Presences,1.0,0.63,0.473934,1.0,0.643087,1.0
8,UNDP,Absolute Presences,15.0,0.333333,0.0,0.0,0.0,1.0
9,UNDP,Absolute Presences,19.0,0.333333,0.0,0.0,0.0,1.0


In [318]:
#2
metrics[metrics["Nr"] == 2].sort_values(by = "Accuracy", ascending = False).head(10).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
493,EPA_UDNP,Relative Frequency,0.6,0.9,0.8125,0.91,0.858491,2.0
549,BBC_UNDP,Absolute Frequency,4.0,0.9,0.843137,0.86,0.851485,2.0
883,BBC_EPA,Relative Frequency,0.6,0.896667,0.80531,0.91,0.85446,2.0
1195,BBC_Wikipedia,Relative Frequency,0.6,0.896667,0.841584,0.85,0.845771,2.0
885,BBC_EPA,Relative Frequency,0.8,0.896667,0.834951,0.86,0.847291,2.0
1039,EPA_Wikipedia,Relative Frequency,0.6,0.893333,0.803571,0.9,0.849057,2.0
570,BBC_UNDP,Relative Frequency,0.5,0.893333,0.809091,0.89,0.847619,2.0
494,EPA_UDNP,Relative Frequency,0.7,0.893333,0.826923,0.86,0.843137,2.0
1040,EPA_Wikipedia,Relative Frequency,0.7,0.893333,0.826923,0.86,0.843137,2.0
884,BBC_EPA,Relative Frequency,0.7,0.893333,0.820755,0.87,0.84466,2.0


In [319]:
#3
metrics[metrics["Nr"] == 3].sort_values(by = "Accuracy", ascending = False).head(10).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
1507,EPA_UNDP_BBC,Relative Frequency,0.6,0.9,0.807018,0.92,0.859813,3.0
1509,EPA_UNDP_BBC,Relative Frequency,0.8,0.896667,0.834951,0.86,0.847291,3.0
2131,UNDP_BBC_Wikipedia,Relative Frequency,0.6,0.896667,0.834951,0.86,0.847291,3.0
4755,EPA_BBC_Wikipedia,Relative Frequency,0.8,0.896667,0.834951,0.86,0.847291,3.0
1585,EPA_UNDP_Wikipedia,Relative Frequency,0.6,0.896667,0.80531,0.91,0.85446,3.0
4753,EPA_BBC_Wikipedia,Relative Frequency,0.6,0.893333,0.798246,0.91,0.850467,3.0
1508,EPA_UNDP_BBC,Relative Frequency,0.7,0.893333,0.820755,0.87,0.84466,3.0
1511,EPA_UNDP_BBC,Relative Frequency,1.0,0.893333,0.861702,0.81,0.835052,3.0
1586,EPA_UNDP_Wikipedia,Relative Frequency,0.7,0.893333,0.826923,0.86,0.843137,3.0
4754,EPA_BBC_Wikipedia,Relative Frequency,0.7,0.893333,0.820755,0.87,0.84466,3.0


In [320]:
#4
metrics[metrics["Nr"] == 4].sort_values(by = "Accuracy", ascending = False).head(10).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
2915,EPA_UNDP_Wikipedia_BBC,Relative Frequency,0.8,0.896667,0.834951,0.86,0.847291,4.0
2913,EPA_UNDP_Wikipedia_BBC,Relative Frequency,0.6,0.896667,0.8,0.92,0.855814,4.0
2914,EPA_UNDP_Wikipedia_BBC,Relative Frequency,0.7,0.893333,0.820755,0.87,0.84466,4.0
3086,EPA_UNDP_BBC_Global_Change,Relative Frequency,1.1,0.89,0.852632,0.81,0.830769,4.0
3085,EPA_UNDP_BBC_Global_Change,Relative Frequency,1.0,0.89,0.831683,0.84,0.835821,4.0
2917,EPA_UNDP_Wikipedia_BBC,Relative Frequency,1.0,0.89,0.852632,0.81,0.830769,4.0
2916,EPA_UNDP_Wikipedia_BBC,Relative Frequency,0.9,0.886667,0.836735,0.82,0.828283,4.0
3083,EPA_UNDP_BBC_Global_Change,Relative Frequency,0.8,0.886667,0.784483,0.91,0.842593,4.0
3507,EPA_BBC_Wikipedia_Global_Change,Relative Frequency,1.2,0.883333,0.849462,0.79,0.818653,4.0
3169,EPA_UNDP_Wikipedia_Global_Change,Relative Frequency,1.0,0.883333,0.828283,0.82,0.824121,4.0


In [321]:
#5
metrics[metrics["Nr"] == 5].sort_values(by = "Accuracy", ascending = False).head(10).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
4177,EPA_UNDP_BBC_Wikipedia_Global_Change,Relative Frequency,1.0,0.886667,0.823529,0.84,0.831683,5.0
4178,EPA_UNDP_BBC_Wikipedia_Global_Change,Relative Frequency,1.1,0.886667,0.84375,0.81,0.826531,5.0
4179,EPA_UNDP_BBC_Wikipedia_Global_Change,Relative Frequency,1.2,0.883333,0.849462,0.79,0.818653,5.0
4175,EPA_UNDP_BBC_Wikipedia_Global_Change,Relative Frequency,0.8,0.883333,0.777778,0.91,0.83871,5.0
4176,EPA_UNDP_BBC_Wikipedia_Global_Change,Relative Frequency,0.9,0.88,0.790909,0.87,0.828571,5.0
4099,EPA_UNDP_BBC_Wikipedia_IPCC,Relative Frequency,1.6,0.876667,0.805825,0.83,0.817734,5.0
4437,EPA_BBC_Wikipedia_IPCC_Global_Change,Relative Frequency,1.8,0.876667,0.818182,0.81,0.81407,5.0
4353,EPA_UNDP_Wikipedia_IPCC_Global_Change,Relative Frequency,1.8,0.876667,0.818182,0.81,0.81407,5.0
4098,EPA_UNDP_BBC_Wikipedia_IPCC,Relative Frequency,1.5,0.876667,0.788991,0.86,0.822967,5.0
4269,EPA_UNDP_BBC_IPCC_Global_Change,Relative Frequency,1.8,0.876667,0.818182,0.81,0.81407,5.0


In [322]:
#6
metrics[metrics["Nr"] == 6].sort_values(by = "Accuracy", ascending = False).head(10).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
4605,Full,Relative Frequency,1.8,0.876667,0.818182,0.81,0.81407,6.0
4603,Full,Relative Frequency,1.6,0.873333,0.798077,0.83,0.813725,6.0
4604,Full,Relative Frequency,1.7,0.873333,0.803922,0.82,0.811881,6.0
4606,Full,Relative Frequency,1.9,0.866667,0.8125,0.78,0.795918,6.0
4602,Full,Relative Frequency,1.5,0.866667,0.767857,0.86,0.811321,6.0
4607,Full,Relative Frequency,2.0,0.86,0.822222,0.74,0.778947,6.0
4601,Full,Relative Frequency,1.4,0.85,0.727273,0.88,0.79638,6.0
4600,Full,Relative Frequency,1.3,0.843333,0.705426,0.91,0.79476,6.0
4599,Full,Relative Frequency,1.2,0.816667,0.659574,0.93,0.771784,6.0
4581,Full,Absolute Frequency,15.0,0.81,0.765432,0.62,0.685083,6.0


In [324]:
#Overall Top 10
#1
metrics.sort_values(by = "Accuracy", ascending = False).head(10).reset_index(drop = True)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
0,BBC_UNDP,Absolute Frequency,4.0,0.9,0.843137,0.86,0.851485,2.0
1,EPA_UDNP,Relative Frequency,0.6,0.9,0.8125,0.91,0.858491,2.0
2,EPA_UNDP_BBC,Relative Frequency,0.6,0.9,0.807018,0.92,0.859813,3.0
3,EPA_BBC_Wikipedia,Relative Frequency,0.8,0.896667,0.834951,0.86,0.847291,3.0
4,UNDP_BBC_Wikipedia,Relative Frequency,0.6,0.896667,0.834951,0.86,0.847291,3.0
5,EPA_UNDP_BBC,Relative Frequency,0.8,0.896667,0.834951,0.86,0.847291,3.0
6,EPA,Relative Frequency,0.6,0.896667,0.810811,0.9,0.853081,1.0
7,BBC_Wikipedia,Relative Frequency,0.6,0.896667,0.841584,0.85,0.845771,2.0
8,BBC_EPA,Relative Frequency,0.8,0.896667,0.834951,0.86,0.847291,2.0
9,BBC_EPA,Relative Frequency,0.6,0.896667,0.80531,0.91,0.85446,2.0


In [364]:
Data = metrics.sort_values(by = ["Accuracy", "F1 Score"], ascending = False).head(10).reset_index(drop = True)

In [349]:
bert_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,climatebert/environmental-claims,0.666667,0.0,0.0,0.0
1,climatebert/distilroberta-base-climate-detector,0.766667,0.59375,0.95,0.730769


In [363]:
threshold_metrics_2(Lexicon_df, EPA_Lexicon, 0.60, 0.60, 0)

Threshhold: 0.6
Accuracy: 0.8966666666666666
Precision: 0.8108108108108109
Recall: 0.9
F1 score: 0.8530805687203792
Target Lexicon   No  Yes  All
Target                       
No              179   21  200
Yes              10   90  100
All             189  111  300




In [360]:
get_nsmh_crosstab_2(Lexicon_df, EPA_Lexicon, 0.6, 0.6, 0)

Target Lexicon                     No  Yes  All
Final_Climate_Change_Level_Label               
High                                1   64   65
Medium                              9   26   35
No Climate                        109    0  109
Small                              70   21   91
All                               189  111  300


In [397]:
Data = metrics.sort_values(by = ["Accuracy", "F1 Score"], ascending = False).reset_index(drop = True)
Data["Treshhold"] = round(Data["Treshhold"], 2)
Data.loc[Data["Treshhold"] < 1, "Treshhold"] = Data.loc[Data["Treshhold"] < 1, "Treshhold"].astype("string") + "%"
Data

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
0,EPA_UNDP_BBC,Relative Frequency,0.6%,0.900000,0.807018,0.92,0.859813,3.0
1,EPA_UDNP,Relative Frequency,0.6%,0.900000,0.812500,0.91,0.858491,2.0
2,BBC_UNDP,Absolute Frequency,4.0,0.900000,0.843137,0.86,0.851485,2.0
3,EPA_UNDP_Wikipedia_BBC,Relative Frequency,0.6%,0.896667,0.800000,0.92,0.855814,4.0
4,BBC_EPA,Relative Frequency,0.6%,0.896667,0.805310,0.91,0.854460,2.0
...,...,...,...,...,...,...,...,...
5040,EPA_GLobal_Change,Absolute Presences,20.0,0.333333,0.000000,0.00,0.000000,2.0
5041,EPA_BBC_Global_Change_IPCC,Absolute Frequency,0.0%,0.333333,0.000000,0.00,0.000000,4.0
5042,EPA_BBC_Global_Change_IPCC,Relative Frequency,0.0%,0.333333,0.000000,0.00,0.000000,4.0
5043,EPA_BBC_Global_Change_IPCC,Absolute Presences,0.0%,0.333333,0.000000,0.00,0.000000,4.0


In [402]:
max_a = metrics.loc[metrics.groupby(["Lexicon", "Technique"])["Accuracy"].idxmax()]

In [403]:
max_a[max_a["Lexicon"] == "IPCC"]

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
91,IPCC,Absolute Frequency,14.0,0.803333,0.80597,0.54,0.646707,1.0
124,IPCC,Absolute Presences,8.0,0.73,0.686275,0.35,0.463576,1.0
112,IPCC,Relative Frequency,1.5,0.85,0.789474,0.75,0.769231,1.0
143,IPCC,Relative Presences,0.7,0.773333,0.637931,0.74,0.685185,1.0


In [416]:
metrics[(metrics["Lexicon"] == "EPA") & (metrics["Technique"] == "Relative Frequency")].sort_values(by = "Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score,Nr
259,EPA,Relative Frequency,0.6,0.896667,0.810811,0.9,0.853081,1.0
260,EPA,Relative Frequency,0.7,0.893333,0.826923,0.86,0.843137,1.0
261,EPA,Relative Frequency,0.8,0.89,0.838384,0.83,0.834171,1.0
263,EPA,Relative Frequency,1.0,0.883333,0.865169,0.77,0.814815,1.0
258,EPA,Relative Frequency,0.5,0.883333,0.773109,0.92,0.840183,1.0
262,EPA,Relative Frequency,0.9,0.883333,0.842105,0.8,0.820513,1.0
257,EPA,Relative Frequency,0.4,0.873333,0.731343,0.98,0.837607,1.0
264,EPA,Relative Frequency,1.1,0.87,0.858824,0.73,0.789189,1.0
265,EPA,Relative Frequency,1.2,0.86,0.853659,0.7,0.769231,1.0
266,EPA,Relative Frequency,1.3,0.853333,0.858974,0.67,0.752809,1.0


In [415]:
threshold_metrics_1(Lexicon_df, EPA_Lexicon, 5, 5)

Threshhold: 5
Accuracy: 0.8766666666666667
Precision: 0.8247422680412371
Recall: 0.8
F1 score: 0.8121827411167513
Target Lexicon   No  Yes  All
Target                       
No              183   17  200
Yes              20   80  100
All             203   97  300


