# 0. Packages and Functions

## 0.1. Packages

In [None]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
import pyarrow.parquet as pq
import pyarrow as pyarrow
import numpy as np

## 0.2. Functions

In [None]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    # remove blank spaces
    text = ' '.join(text.split())
    # remove newline characters
    text = text.replace('\n', '')
    return text

### Absolute Count with Frequency

In [None]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_1(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
            
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_1(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_1(text_df, lexicon, treshold):
    df = count_lexicon_words_1(text_df, lexicon)
    
    return(lexicon_target_classifier_1(df, treshold))

def threshold_metrics_1(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_1(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    


In [None]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_1(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return(text_df)

In [None]:
count_lexicon_words_1(tag_climate_df, BBC_Lexicon)

### Relative Count with Frequency

In [None]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_2(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []
    
    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        word_list = text.split() 
        word_count = len(word_list)
        count.append((lexicon_counts/word_count)*100)

    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_2(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_2(text_df, lexicon, treshold):
    df = count_lexicon_words_2(text_df, lexicon)
    
    return(lexicon_target_classifier_2(df, treshold))

def threshold_metrics_2(df_text, lexicon, min_treshhold, max_treshhold, jump):
    
    if(jump == 0):
        df = lexicon_climate_classifier_2(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", min_treshhold)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")

    else:   
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_2(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

            # calculate classification metrics using scikit-learn
            accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
            precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
            recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

            # print the metrics
            print("Threshhold:", i)
            print("Accuracy:", accuracy)
            print("Precision:", precision)
            print("Recall:", recall)
            print("F1 score:", f1_score)
            print(cross_table)
            print("\n")

def get_metrics_df_2(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))


### Absolute Term Presences

In [None]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_3(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_3(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_3(text_df, lexicon, treshold):
    df = count_lexicon_words_3(text_df, lexicon)
    
    return(lexicon_target_classifier_3(df, treshold))

def threshold_metrics_3(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_3(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    


### Relative Term Presences

In [None]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_4(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_4(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_4(text_df, lexicon, treshold):
    df = count_lexicon_words_4(text_df, lexicon)
    
    return(lexicon_target_classifier_4(df, treshold))

def threshold_metrics_4(df_text, lexicon, min_treshhold, max_treshhold, jump):
    
    if(jump == 0):
        df = lexicon_climate_classifier_4(df_text, lexicon, min_treshhold)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
    else:
        for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
            i = min_treshhold + num * jump
            df = lexicon_climate_classifier_4(df_text, lexicon, i)
            cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

            # calculate classification metrics using scikit-learn
            accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
            precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
            recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

            # print the metrics
            print("Threshhold:", i)
            print("Accuracy:", accuracy)
            print("Precision:", precision)
            print("Recall:", recall)
            print("F1 score:", f1_score)
            print(cross_table)
            print("\n")

def get_metrics_df_4(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_4(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))


Accuracy: This metric measures the overall performance of a model. It is defined as the number of correct predictions divided by the total number of predictions. Accuracy is a good metric to use when the classes are roughly balanced, meaning there are about the same number of positive and negative examples in the dataset.

Precision: This metric measures how many of the positive predictions made by a model are actually correct. It is defined as the number of true positives divided by the total number of positive predictions. Precision is a good metric to use when we care more about avoiding false positives than false negatives.

Recall: This metric measures how many of the positive examples in the dataset are correctly predicted by the model. It is defined as the number of true positives divided by the total number of actual positive examples. Recall is a good metric to use when we care more about avoiding false negatives than false positives.

F1 score: This metric is a weighted average of precision and recall, where the weight is determined by the beta parameter. The most common value for beta is 1, which gives equal weight to precision and recall. The F1 score is a good metric to use when we want to balance precision and recall, and when the classes are imbalanced.

# 1. Import Label Dataset

In [None]:
tag_climate_df = pd.read_parquet("Climate_Labels_Dataset.parquet")
tag_climate_df.head(5)

In [None]:
#Only keep the required columns
tag_climate_df = tag_climate_df[["Text", "Final_Climate_Change_Level_Label"]]

In [None]:
#Clean the tabel
tag_climate_df['Final_Climate_Change_Level_Label'] = tag_climate_df['Final_Climate_Change_Level_Label'].str.strip()
tag_climate_df[tag_climate_df["Final_Climate_Change_Level_Label"] == "NA"] = "Na"
tag_climate_df[tag_climate_df["Final_Climate_Change_Level_Label"] == "0"] = "Na"
tag_climate_df["Target"] = tag_climate_df["Final_Climate_Change_Level_Label"].apply(lambda x: "Yes" if x in ["High", "Medium"] else "No")

In [None]:
overview_labels_hms = tag_climate_df.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

In [None]:
overview_labels_hms

In [None]:
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()

In [None]:
overview_labels

In [None]:
#overview_labels_hms.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels_hms", index = False)
#overview_labels.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels", index = False)

# 2. Taggers

## 2.1. Lexicon

### Global Change Lexicon

Uitleg

In [None]:
#Load the lexicon
Global_Change_Lexicon = pd.read_csv("Global_Change_Lexicon")
Global_Change_Lexicon = Global_Change_Lexicon.drop_duplicates().reset_index(drop = True)
Global_Change_Lexicon["Lexicon"] = Global_Change_Lexicon["Lexicon"].str.lower()

Global_Change_Lexicon

### IPCC Lexicon

Uitleg

In [None]:
#Load the lexicon
IPCC_Lexicon = pd.read_csv("IPCC_Lexicon")
IPCC_Lexicon = IPCC_Lexicon.drop_duplicates().reset_index(drop = True)
IPCC_Lexicon["Lexicon"] = IPCC_Lexicon["Lexicon"].str.lower()

IPCC_Lexicon

### Wikipedia Lexicon

Uitleg

In [None]:
#Load the lexicon
Wikipedia_Lexicon = pd.read_csv("Wikipedia_Lexicon")
Wikipedia_Lexicon = Wikipedia_Lexicon.drop_duplicates().reset_index(drop = True)
Wikipedia_Lexicon["Lexicon"] = Wikipedia_Lexicon["Lexicon"].str.lower()

Wikipedia_Lexicon

### EPA Lexicon

Uitleg

In [None]:
#Load the lexicon
EPA_Lexicon = pd.read_csv("EPA_Lexicon")
EPA_Lexicon = EPA_Lexicon.drop_duplicates().reset_index(drop = True)
EPA_Lexicon["Lexicon"] = EPA_Lexicon["Lexicon"].str.lower()

EPA_Lexicon

list(EPA_Lexicon["Lexicon"])

### BBC Lexicon

Uitleg

In [None]:
#Load the lexicon
BBC_Lexicon = pd.read_csv("BBC_Lexicon")
BBC_Lexicon = BBC_Lexicon.drop_duplicates().reset_index(drop = True)
BBC_Lexicon["Lexicon"] = BBC_Lexicon["Lexicon"].str.lower()

BBC_Lexicon

### UNDP Lexicon

Uitleg

In [None]:
#Load the lexicon
UNDP_Lexicon = pd.read_csv("UNDP_Lexicon")
UNDP_Lexicon = UNDP_Lexicon.drop_duplicates().reset_index(drop = True)
UNDP_Lexicon["Lexicon"] = UNDP_Lexicon["Lexicon"].str.lower()

UNDP_Lexicon

### Compare the lexicons to each other

In [None]:
#Create an empty dataframe and write a function to fill with the values

common_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"], 
                               "Global Change": [0, 0, 0, 0, 0, 0], "IPCC" : [0, 0, 0, 0, 0, 0], 
                               "Wikipedia" : [0, 0, 0, 0, 0, 0], "EPA" : [0, 0, 0, 0, 0, 0], 
                               "BBC" : [0, 0, 0, 0, 0, 0], "UNDP" : [0, 0, 0, 0, 0, 0]})

dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]

for r in range(0, len(dfs)):
    for c in range(0, len(dfs)):
        # Get the common values between the two columns
        common_words = set(dfs[r]['Lexicon']).intersection(set(dfs[c]['Lexicon']))
        common_words_df.loc[r, common_words_df.columns[c +1]] = len(common_words)

common_words_df

## 2.2. Hugging Face

In [None]:
def summary(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_sum = AutoTokenizer.from_pretrained(model_name)
    model_sum = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline('summarization', model=model_sum, tokenizer = tokenizer_sum) 

    if max_lenght_input>=0:
        df_with_text['summary'] = summarizer(data_in_list, max_length=max_lenght_input)

    else:
        df_with_text['summary'] = summarizer(data_in_list)

def classification(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_clas = AutoTokenizer.from_pretrained(model_name)
    model_clas = AutoModelForSequenceClassification.from_pretrained(model_name)
    classification = pipeline('text-classification', model=model_clas, tokenizer = tokenizer_clas) 

    if max_lenght_input>=0:
        df_with_text['classification'] = classification(data_in_list, max_length=max_lenght_input, truncation=True)

    else:
        df_with_text['classification'] = classification(data_in_list)

In [None]:
#classification(Lexicon_df, 'Text',"climatebert/environmental-claims",512)

# 3. Testen Taggers

## 3.1. Lexicon

In [307]:
#create a separate df with the specific cleaning for the lexicons
Lexicon_df = tag_climate_df.copy()
Lexicon_df["Text"] = Lexicon_df["Text"].apply(preprocess_text)

### 3.1.1. One Lexicon

In [None]:
#Global Change Lexicon
Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_Lexicon, 0, 20, "Global Change")
Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_Lexicon, 0, 2, 0.1, "Global Change")
Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_Lexicon, 0, 20, "Global Change")
Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_Lexicon, 0, 2, 0.1, "Global Change")

In [None]:
Global_Change_df = pd.concat([Global_Change_df_1, Global_Change_df_2, Global_Change_df_3, Global_Change_df_4])

In [None]:
#IPCC Lexicon
IPCC_df_1 = get_metrics_df_1(Lexicon_df, IPCC_Lexicon, 0, 20, "IPCC")
IPCC_df_2 = get_metrics_df_2(Lexicon_df, IPCC_Lexicon, 0, 2, 0.1, "IPCC")
IPCC_df_3 = get_metrics_df_3(Lexicon_df, IPCC_Lexicon, 0, 20, "IPCC")
IPCC_df_4 = get_metrics_df_4(Lexicon_df, IPCC_Lexicon, 0, 2, 0.1, "IPCC")

In [None]:
IPCC_df = pd.concat([IPCC_df_1, IPCC_df_2, IPCC_df_3, IPCC_df_4])

In [None]:
#Wikipedia Lexicon
Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Lexicon, 0, 20, "Wikipedia")
Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Lexicon, 0, 2, 0.1, "Wikipedia")
Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Lexicon, 0, 20, "Wikipedia")
Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Lexicon, 0, 2, 0.1, "Wikipedia")

In [None]:
Wikipedia_df = pd.concat([Wikipedia_df_1, Wikipedia_df_2, Wikipedia_df_3, Wikipedia_df_4])

In [None]:
#EPA Lexicon
EPA_df_1 = get_metrics_df_1(Lexicon_df, EPA_Lexicon, 0, 20, "EPA")
EPA_df_2 = get_metrics_df_2(Lexicon_df, EPA_Lexicon, 0, 2, 0.1, "EPA")
EPA_df_3 = get_metrics_df_3(Lexicon_df, EPA_Lexicon, 0, 20, "EPA")
EPA_df_4 = get_metrics_df_4(Lexicon_df, EPA_Lexicon, 0, 2, 0.1, "EPA")

In [None]:
EPA_df = pd.concat([EPA_df_1, EPA_df_2, EPA_df_3, EPA_df_4])

In [None]:
#BBC Lexicon
BBC_df_1 = get_metrics_df_1(Lexicon_df, BBC_Lexicon, 0, 20, "BBC")
BBC_df_2 = get_metrics_df_2(Lexicon_df, BBC_Lexicon, 0, 2, 0.1, "BBC")
BBC_df_3 = get_metrics_df_3(Lexicon_df, BBC_Lexicon, 0, 20, "BBC")
BBC_df_4 = get_metrics_df_4(Lexicon_df, BBC_Lexicon, 0, 2, 0.1, "BBC")

In [None]:
BBC_df = pd.concat([BBC_df_1, BBC_df_2, BBC_df_3, BBC_df_4])

In [None]:
#UNDP Lexicon
UNDP_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Lexicon, 0, 20, "UNDP")
UNDP_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Lexicon, 0, 2, 0.1, "UNDP")
UNDP_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Lexicon, 0, 20, "UNDP")
UNDP_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Lexicon, 0, 2, 0.1, "UNDP")

In [None]:
UNDP_df = pd.concat([UNDP_df_1, UNDP_df_2, UNDP_df_3, UNDP_df_4])

In [None]:
#Get all lexicons together
Lexicon_df_1 = pd.concat([Global_Change_df, IPCC_df, Wikipedia_df, EPA_df, BBC_df, UNDP_df])

In [None]:
Lexicon_df_1.sort_values("F1 Score", ascending = False).reset_index(drop = True).head(20)

In [None]:
threshold_metrics_2(Lexicon_df, EPA_Lexicon, 0.6, 0.6, 0)

### 3.1.2. Two Lexicons Combined

In [None]:
#UNDP and EPA
EPA_UNDP_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Lexicon, 0, 20, "EPA_UDNP")
EPA_UNDP_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Lexicon, 0, 2, 0.1, "EPA_UDNP")
EPA_UNDP_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Lexicon, 0, 20, "EPA_UDNP")
EPA_UNDP_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Lexicon, 0, 2, 0.1, "EPA_UDNP")

In [None]:
EPA_UNDP_df = pd.concat([EPA_UNDP_df_1, EPA_UNDP_df_2, EPA_UNDP_df_3, EPA_UNDP_df_4])

In [None]:
#UNDP and BBC
BBC_UNDP_Lexicon = pd.concat([BBC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_UNDP_df_1 = get_metrics_df_1(Lexicon_df, BBC_UNDP_Lexicon, 0, 20, "BBC_UNDP")
BBC_UNDP_df_2 = get_metrics_df_2(Lexicon_df, BBC_UNDP_Lexicon, 0, 2, 0.1, "BBC_UNDP")
BBC_UNDP_df_3 = get_metrics_df_3(Lexicon_df, BBC_UNDP_Lexicon, 0, 20, "BBC_UNDP")
BBC_UNDP_df_4 = get_metrics_df_4(Lexicon_df, BBC_UNDP_Lexicon, 0, 2, 0.1, "BBC_UNDP")

In [None]:
BBC_UNDP_df = pd.concat([BBC_UNDP_df_1, BBC_UNDP_df_2, BBC_UNDP_df_3, BBC_UNDP_df_4])

In [None]:
#UNDP and Global Change 
UNDP_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_Global_Change")
UNDP_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_Global_Change")

In [None]:
Global_Change_UNDP_df = pd.concat([UNDP_Global_Change_df_1, UNDP_Global_Change_df_2, UNDP_Global_Change_df_3, UNDP_Global_Change_df_4])

In [None]:
Global_Change_UNDP_df.sort_values("Accuracy", ascending = False)

In [None]:
#UNDP and IPCC
UNDP_IPCC_Lexicon = pd.concat([IPCC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_IPCC_Lexicon, 0, 20, "UNDP_IPCC")
UNDP_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_IPCC_Lexicon, 0, 2, 0.1, "UNDP_IPCC")
UNDP_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_IPCC_Lexicon, 0, 20, "UNDP_IPCC")
UNDP_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_IPCC_Lexicon, 0, 2, 0.1, "UNDP_IPCC")

In [None]:
IPCC_UNDP_df = pd.concat([UNDP_IPCC_df_1, UNDP_IPCC_df_2, UNDP_IPCC_df_3, UNDP_IPCC_df_4])

In [None]:
#UNDP and Wikipedia
UNDP_Wikipedia_Lexicon = pd.concat([Wikipedia_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 2, 0.1, "UNDP_Wikipedia")
UNDP_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 2, 0.1, "UNDP_Wikipedia")

In [None]:
Wikipedia_UNDP_df = pd.concat([UNDP_Wikipedia_df_1, UNDP_Wikipedia_df_2, UNDP_Wikipedia_df_3, UNDP_Wikipedia_df_4])

In [None]:
Wikipedia_UNDP_df.sort_values("Accuracy", ascending = False)

In [None]:
Lexicon_df_2 = pd.concat([EPA_UNDP_df, BBC_UNDP_df, Global_Change_UNDP_df, IPCC_UNDP_df, Wikipedia_UNDP_df]).reset_index(drop = True)

In [None]:
Lexicon_df_2.sort_values("Accuracy", ascending = False)

In [None]:
#EPA and BBC
EPA_BBC_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Lexicon, 0, 20, "BBC_EPA")
EPA_BBC_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Lexicon, 0, 2, 0.1, "BBC_EPA")
EPA_BBC_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Lexicon, 0, 20, "BBC_EPA")
EPA_BBC_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Lexicon, 0, 2, 0.1, "BBC_EPA")

In [None]:
EPA_BBC_df = pd.concat([EPA_BBC_df_1, EPA_BBC_df_2, EPA_BBC_df_3, EPA_BBC_df_4])

In [None]:
EPA_BBC_df.sort_values("Accuracy", ascending = False)

In [None]:
#EPA and Global Change
EPA_Global_Change_Lexicon = pd.concat([EPA_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_Global_Change_Lexicon, 0, 20, "EPA_GLobal_Change")
EPA_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")
EPA_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_Global_Change_Lexicon, 0, 20, "BBC_Global_Change")
EPA_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")

In [None]:
EPA_Global_Change_df = pd.concat([EPA_Global_Change_df_1, EPA_Global_Change_df_2, EPA_Global_Change_df_3, EPA_Global_Change_df_4])

In [None]:
EPA_Global_Change_df.sort_values("Accuracy", ascending = False)

In [None]:
#EPA and IPCC
EPA_IPCC_Lexicon = pd.concat([EPA_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_IPCC_Lexicon, 0, 20, "EPA_IPCC")
EPA_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_IPCC_Lexicon, 0, 2, 0.1, "EPA_IPCC")
EPA_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_IPCC_Lexicon, 0, 20, "EPA_IPCC")
EPA_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_IPCC_Lexicon, 0, 2, 0.1, "EPA_IPCC")

In [None]:
EPA_IPCC_df = pd.concat([EPA_IPCC_df_1, EPA_IPCC_df_2, EPA_IPCC_df_3, EPA_IPCC_df_4])

In [None]:
EPA_IPCC_df.sort_values("Accuracy", ascending = False)

In [None]:
#EPA and Wikipedia
EPA_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_Wikipedia")
EPA_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_Wikipedia")

In [None]:
EPA_Wikipedia_df = pd.concat([EPA_Wikipedia_df_1, EPA_Wikipedia_df_2, EPA_Wikipedia_df_3, EPA_Wikipedia_df_4])

In [None]:
EPA_Wikipedia_df.sort_values("Accuracy", ascending = False)

In [None]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, EPA_BBC_df, EPA_Global_Change_df, EPA_IPCC_df, EPA_Wikipedia_df]).reset_index(drop = True)

In [None]:
Lexicon_df_2.sort_values("Accuracy", ascending = False)

In [None]:
#BBC and Global Change
BBC_Global_Change_Lexicon = pd.concat([BBC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, BBC_Global_Change_Lexicon, 0, 20, "BBC_Global_Change")
BBC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, BBC_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")
BBC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, BBC_Global_Change_Lexicon, 0, 20, "BBC_Global_Change")
BBC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, BBC_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")

In [None]:
BBC_Global_Change_df = pd.concat([BBC_Global_Change_df_1, BBC_Global_Change_df_2, BBC_Global_Change_df_3, BBC_Global_Change_df_4])

In [None]:
#BBC and IPCC
BBC_IPCC_Lexicon = pd.concat([BBC_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_IPCC_df_1 = get_metrics_df_1(Lexicon_df, BBC_IPCC_Lexicon, 0, 20, "BBC_IPCC")
BBC_IPCC_df_2 = get_metrics_df_2(Lexicon_df, BBC_IPCC_Lexicon, 0, 2, 0.1, "BBC_IPCC")
BBC_IPCC_df_3 = get_metrics_df_3(Lexicon_df, BBC_IPCC_Lexicon, 0, 20, "BBC_IPCC")
BBC_IPCC_df_4 = get_metrics_df_4(Lexicon_df, BBC_IPCC_Lexicon, 0, 2, 0.1, "BBC_IPCC")

In [None]:
BBC_IPCC_df = pd.concat([BBC_IPCC_df_1, BBC_IPCC_df_2, BBC_IPCC_df_3, BBC_IPCC_df_4])

In [None]:
#BBC and Wikipedia
BBC_Wikipedia_Lexicon = pd.concat([BBC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 2, 0.1, "BBC_Wikipedia")
BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 2, 0.1, "BBC_Wikipedia")

In [None]:
BBC_Wikpedia_df = pd.concat([BBC_Wikipedia_df_1, BBC_Wikipedia_df_2, BBC_Wikipedia_df_3, BBC_Wikipedia_df_4])

In [None]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, BBC_Global_Change_df, BBC_IPCC_df, BBC_Wikpedia_df]).reset_index(drop = True)

In [None]:
#Wikipedia and Global Change
Wikipedia_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "Wikipedia_Global_Change")

In [None]:
Wikpedia_Global_Change_df = pd.concat([Wikipedia_Global_Change_df_1, Wikipedia_Global_Change_df_2, Wikipedia_Global_Change_df_3, Wikipedia_Global_Change_df_4])

In [None]:
#Wikipedia and IPCC
Wikipedia_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "Wikipedia_IPCC")
Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "Wikipedia_IPCC")

In [None]:
Wikpedia_IPCC_df = pd.concat([Wikipedia_IPCC_df_1, Wikipedia_IPCC_df_2, Wikipedia_IPCC_df_3, Wikipedia_IPCC_df_4])

In [None]:
#IPCC and Global Change
Global_Change_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 2, 0.1, "Global_Change_IPCC")
Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 2, 0.1, "Global_Change_IPCC")

In [None]:
Global_Change_IPCC_df = pd.concat([Global_Change_IPCC_df_1, Global_Change_IPCC_df_2, Global_Change_IPCC_df_3, Global_Change_IPCC_df_4])

In [None]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, Wikpedia_Global_Change_df, Wikpedia_IPCC_df, Global_Change_IPCC_df]).reset_index(drop = True)

In [None]:
Lexicon_df_2.sort_values("Accuracy", ascending = False)

In [310]:
Lexicon_Metrics = pd.concat([Lexicon_df_1, Lexicon_df_2]).reset_index(drop = True)

In [313]:
Lexicon_Metrics.sort_values(by="Precision", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
597,BBC_UNDP,Absolute Frequency,9.0,0.823333,48.0,0.923077,1.811321
936,BBC_EPA,Absolute Frequency,12.0,0.823333,48.0,0.923077,1.811321
516,EPA_UDNP,Absolute Frequency,12.0,0.820000,47.0,0.886792,1.740741
264,EPA,Absolute Frequency,12.0,0.816667,46.0,0.851852,1.672727
518,EPA_UDNP,Absolute Frequency,14.0,0.813333,45.0,0.818182,1.607143
...,...,...,...,...,...,...,...
438,UNDP,Absolute Frequency,18.0,0.720000,0.0,0.190476,0.000000
437,UNDP,Absolute Frequency,17.0,0.726667,0.0,0.219512,0.000000
436,UNDP,Absolute Frequency,16.0,0.733333,0.0,0.250000,0.000000
435,UNDP,Absolute Frequency,15.0,0.743333,0.0,0.298701,0.000000


In [316]:
threshold_metrics_1(Lexicon_df, BBC_UNDP_Lexicon, 12, 12)

Threshhold: 12
Accuracy: 0.7866666666666666
Precision: 0
Recall: 0.5625
F1 score: 0.0
Target Lexicon   No  Yes  All
Target                       
No              200    0  200
Yes              64   36  100
All             264   36  300


