# 0. Packages and Functions

## 0.1. Packages

In [1]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
import pyarrow.parquet as pq
import pyarrow as pyarrow

## 0.2. Functions

In [2]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    # remove blank spaces
    text = ' '.join(text.split())
    # remove newline characters
    text = text.replace('\n', '')
    return text

### Absolute Count with Frequency

In [3]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_1(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_1(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_1(text_df, lexicon, treshold):
    df = count_lexicon_words_1(text_df, lexicon)
    
    return(lexicon_target_classifier_1(df, treshold))

def threshold_metrics_1(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_1(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    


### Relative Count with Frequency

In [4]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_2(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []
    
    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        word_list = text.split() 
        word_count = len(word_list)
        count.append((lexicon_counts/word_count)*100)

    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_2(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_2(text_df, lexicon, treshold):
    df = count_lexicon_words_2(text_df, lexicon)
    
    return(lexicon_target_classifier_2(df, treshold))

def threshold_metrics_2(df_text, lexicon, min_treshhold, max_treshhold, jump):
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")

def get_metrics_df_2(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Frequency")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))


### Absolute Term Presences

In [5]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_3(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_3(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_3(text_df, lexicon, treshold):
    df = count_lexicon_words_3(text_df, lexicon)
    
    return(lexicon_target_classifier_3(df, treshold))

def threshold_metrics_3(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_3(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_3(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Absolute Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    


### Relative Term Presences

In [6]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_4(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            if text.lower().count(word.lower()) > 0:
                lexicon_counts += 1
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return text_df

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_4(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_4(text_df, lexicon, treshold):
    df = count_lexicon_words_4(text_df, lexicon)
    
    return(lexicon_target_classifier_4(df, treshold))

def threshold_metrics_4(df_text, lexicon, min_treshhold, max_treshhold, jump):
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_4(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")

def get_metrics_df_4(df_text, lexicon, min_treshhold, max_treshhold, jump, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    Technique = []
    
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_4(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        Technique.append("Relative Presences")
        
    return(pd.DataFrame({"Lexicon" : Name, "Technique": Technique, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))


Accuracy: This metric measures the overall performance of a model. It is defined as the number of correct predictions divided by the total number of predictions. Accuracy is a good metric to use when the classes are roughly balanced, meaning there are about the same number of positive and negative examples in the dataset.

Precision: This metric measures how many of the positive predictions made by a model are actually correct. It is defined as the number of true positives divided by the total number of positive predictions. Precision is a good metric to use when we care more about avoiding false positives than false negatives.

Recall: This metric measures how many of the positive examples in the dataset are correctly predicted by the model. It is defined as the number of true positives divided by the total number of actual positive examples. Recall is a good metric to use when we care more about avoiding false negatives than false positives.

F1 score: This metric is a weighted average of precision and recall, where the weight is determined by the beta parameter. The most common value for beta is 1, which gives equal weight to precision and recall. The F1 score is a good metric to use when we want to balance precision and recall, and when the classes are imbalanced.

# 1. Import Label Dataset

In [7]:
tag_climate_df = pd.read_parquet("Climate_Labels_Dataset.parquet")
tag_climate_df.head(5)

Unnamed: 0,Text,Link,Sentiment_Label,Sentiment_Label_R,Level_Climate_Change_Topic,Level_Climate_Change_Topic_R,Final_Climate_Change_Level_Label
0,More than a dozen state attorneys general gath...,https://www.washingtonpost.com/news/energy-env...,1,-1,Medium,Medium,Medium
1,Sen. Jeff Merkley of Oregon endorsed Bernie S...,http://www.wsj.com/articles/campaign-wire-1460...,0,0,Small,Small,Small
2,When Carmen Luna moved to a neighborhood on t...,https://www.wsj.com/articles/mexico-city-strug...,-1,-1,Medium,Medium,Medium
3,As ocean warming continues to trigger widespre...,https://www.washingtonpost.com/national/health...,1,-1,High,High,High
4,PG&E Corp. told California regulators that it...,https://www.wsj.com/articles/pg-e-equipment-mi...,-1,-1,Medium,Medium,Medium


In [8]:
#Only keep the required columns
tag_climate_df = tag_climate_df[["Text", "Final_Climate_Change_Level_Label"]]

In [9]:
#Clean the tabel
tag_climate_df['Final_Climate_Change_Level_Label'] = tag_climate_df['Final_Climate_Change_Level_Label'].str.strip()
tag_climate_df[tag_climate_df["Final_Climate_Change_Level_Label"] == "NA"] = "Na"
tag_climate_df[tag_climate_df["Final_Climate_Change_Level_Label"] == "0"] = "Na"
tag_climate_df["Target"] = tag_climate_df["Final_Climate_Change_Level_Label"].apply(lambda x: "Yes" if x in ["High", "Medium"] else "No")

In [10]:
overview_labels_hms = tag_climate_df.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

In [11]:
overview_labels_hms

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,65
1,Medium,35
2,Na,109
3,Small,91


In [12]:
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()

In [13]:
overview_labels

Unnamed: 0,Target,Text
0,No,200
1,Yes,100


In [14]:
#overview_labels_hms.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels_hms", index = False)
#overview_labels.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels", index = False)

# 2. Taggers

## 2.1. Lexicon

### Global Change Lexicon

Uitleg

In [19]:
#Load the lexicon
Global_Change_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "Global Change", header = None)
Global_Change_Lexicon.columns = ["Lexicon"]
Global_Change_Lexicon = Global_Change_Lexicon.drop_duplicates().reset_index(drop = True)
Global_Change_Lexicon["Lexicon"] = Global_Change_Lexicon["Lexicon"].str.lower()

Global_Change_Lexicon

Unnamed: 0,Lexicon
0,100-year flood
1,emissions scenario
2,adaptation
3,adaptation science
4,adaptive capacity
...,...
101,vector
102,vulnerability
103,vulnerability assessment
104,water security


### IPCC Lexicon

Uitleg

In [20]:
#Load the lexicon
IPCC_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "IPCC", header = None)
IPCC_Lexicon.columns = ["Lexicon"]
IPCC_Lexicon = IPCC_Lexicon.drop_duplicates().reset_index(drop = True)
IPCC_Lexicon["Lexicon"] = IPCC_Lexicon["Lexicon"].str.lower()

IPCC_Lexicon

Unnamed: 0,Lexicon
0,acceptability of policy or system change
1,adaptability
2,adaptation
3,adaptation behaviour
4,adaptation limits
...,...
402,sd
403,sdgs
404,tcre
405,tod


### Wikipedia Lexicon

Uitleg

In [21]:
#Load the lexicon
Wikipedia_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)
Wikipedia_Lexicon = pd.DataFrame(Wikipedia_Lexicon[0])
Wikipedia_Lexicon.columns = ["Lexicon"]
Wikipedia_Lexicon = Wikipedia_Lexicon.drop_duplicates().reset_index(drop = True)
Wikipedia_Lexicon["Lexicon"] = Wikipedia_Lexicon["Lexicon"].str.lower()

Wikipedia_Lexicon

Unnamed: 0,Lexicon
0,"100,000-year problem"
1,adaptation
2,additionality
3,albedo
4,anoxic event
...,...
159,volcanism
160,water vapor
161,weather
162,world climate report


### EPA Lexicon

Uitleg

In [22]:
#Load the lexicon
EPA_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "EPA", header = None)
EPA_Lexicon = pd.DataFrame(EPA_Lexicon[0])
EPA_Lexicon.columns = ["Lexicon"]
EPA_Lexicon = EPA_Lexicon.drop_duplicates().reset_index(drop = True)
EPA_Lexicon["Lexicon"] = EPA_Lexicon["Lexicon"].str.lower()

EPA_Lexicon

Unnamed: 0,Lexicon
0,abrupt climate change
1,adaptation
2,adaptive capacity
3,aerosols
4,afforestation
...,...
171,pfcs
172,sf6
173,o3
174,uv


### BBC Lexicon

Uitleg

In [23]:
#Load the lexicon
BBC_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "EPA", header = None)
BBC_Lexicon = pd.DataFrame(BBC_Lexicon[0])
BBC_Lexicon.columns = ["Lexicon"]
BBC_Lexicon = BBC_Lexicon.drop_duplicates().reset_index(drop = True)
BBC_Lexicon["Lexicon"] = BBC_Lexicon["Lexicon"].str.lower()

BBC_Lexicon

Unnamed: 0,Lexicon
0,abrupt climate change
1,adaptation
2,adaptive capacity
3,aerosols
4,afforestation
...,...
171,pfcs
172,sf6
173,o3
174,uv


### UNDP Lexicon

Uitleg

In [24]:
#Load the lexicon
UNDP_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "UNDP", header = None)
UNDP_Lexicon = pd.DataFrame(UNDP_Lexicon[0])
UNDP_Lexicon.columns = ["Lexicon"]
UNDP_Lexicon = UNDP_Lexicon.drop_duplicates().reset_index(drop = True)
UNDP_Lexicon["Lexicon"] = UNDP_Lexicon["Lexicon"].str.lower()

UNDP_Lexicon

Unnamed: 0,Lexicon
0,weather
1,climate
2,greenhouse gases
3,greenhouse gas emmisions
4,global warming
5,climate change
6,climate crisis
7,feedback loop
8,tipping point
9,climate overshoot


## 2.2. Hugging Face

In [25]:
def summary(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_sum = AutoTokenizer.from_pretrained(model_name)
    model_sum = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline('summarization', model=model_sum, tokenizer = tokenizer_sum) 

    if max_lenght_input>=0:
        df_with_text['summary'] = summarizer(data_in_list, max_length=max_lenght_input)

    else:
        df_with_text['summary'] = summarizer(data_in_list)

def classification(df_with_text, name, model_name, max_lenght_input=-1):
    data_in_list = df_with_text[name].tolist()
    tokenizer_clas = AutoTokenizer.from_pretrained(model_name)
    model_clas = AutoModelForSequenceClassification.from_pretrained(model_name)
    classification = pipeline('text-classification', model=model_clas, tokenizer = tokenizer_clas) 

    if max_lenght_input>=0:
        df_with_text['classification'] = classification(data_in_list, max_length=max_lenght_input, truncation=True)

    else:
        df_with_text['classification'] = classification(data_in_list)

In [48]:
#classification(Lexicon_df, 'Text',"climatebert/environmental-claims",512)

# 3. Testen Taggers

## 3.1. Lexicon

In [26]:
#create a separate df with the specific cleaning for the lexicons
Lexicon_df = tag_climate_df.copy()
Lexicon_df["Text"] = Lexicon_df["Text"].apply(preprocess_text)

### 3.1.1. One Lexicon

In [27]:
#Global Change Lexicon
Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_Lexicon, 0, 20, "Global Change")
Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_Lexicon, 0, 2, 0.1, "Global Change")
Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_Lexicon, 0, 20, "Global Change")
Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_Lexicon, 0, 2, 0.1, "Global Change")

In [28]:
Global_Change_df = pd.concat([Global_Change_df_1, Global_Change_df_2, Global_Change_df_3, Global_Change_df_4])

In [29]:
#IPCC Lexicon
IPCC_df_1 = get_metrics_df_1(Lexicon_df, IPCC_Lexicon, 0, 20, "IPCC")
IPCC_df_2 = get_metrics_df_2(Lexicon_df, IPCC_Lexicon, 0, 2, 0.1, "IPCC")
IPCC_df_3 = get_metrics_df_3(Lexicon_df, IPCC_Lexicon, 0, 20, "IPCC")
IPCC_df_4 = get_metrics_df_4(Lexicon_df, IPCC_Lexicon, 0, 2, 0.1, "IPCC")

In [30]:
IPCC_df = pd.concat([IPCC_df_1, IPCC_df_2, IPCC_df_3, IPCC_df_4])

In [31]:
#Wikipedia Lexicon
Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Lexicon, 0, 20, "Wikipedia")
Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Lexicon, 0, 2, 0.1, "Wikipedia")
Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Lexicon, 0, 20, "Wikipedia")
Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Lexicon, 0, 2, 0.1, "Wikipedia")

In [32]:
Wikipedia_df = pd.concat([Wikipedia_df_1, Wikipedia_df_2, Wikipedia_df_3, Wikipedia_df_4])

In [33]:
#EPA Lexicon
EPA_df_1 = get_metrics_df_1(Lexicon_df, EPA_Lexicon, 0, 20, "EPA")
EPA_df_2 = get_metrics_df_2(Lexicon_df, EPA_Lexicon, 0, 2, 0.1, "EPA")
EPA_df_3 = get_metrics_df_3(Lexicon_df, EPA_Lexicon, 0, 20, "EPA")
EPA_df_4 = get_metrics_df_4(Lexicon_df, EPA_Lexicon, 0, 2, 0.1, "EPA")

In [34]:
EPA_df = pd.concat([EPA_df_1, EPA_df_2, EPA_df_3, EPA_df_4])

In [None]:
#BBC Lexicon
BBC_df_1 = get_metrics_df_1(Lexicon_df, BBC_Lexicon, 0, 20, "BBC")
BBC_df_2 = get_metrics_df_2(Lexicon_df, BBC_Lexicon, 0, 2, 0.1, "BBC")
BBC_df_3 = get_metrics_df_3(Lexicon_df, BBC_Lexicon, 0, 20, "BBC")
BBC_df_4 = get_metrics_df_4(Lexicon_df, BBC_Lexicon, 0, 2, 0.1, "BBC")

In [None]:
BBC_df = pd.concat([BBC_df_1, BBC_df_2, BBC_df_3, BBC_df_4])

In [None]:
#UNDP Lexicon
UNDP_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Lexicon, 0, 20, "UNDP")
UNDP_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Lexicon, 0, 2, 0.1, "UNDP")
UNDP_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Lexicon, 0, 20, "UNDP")
UNDP_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Lexicon, 0, 2, 0.1, "UNDP")

In [None]:
UNDP_df = pd.concat([UNDP_df_1, UNDP_df_2, UNDP_df_3, UNDP_df_4])

In [None]:
#Get all lexicons together
Lexicon_df_1 = pd.concat([Global_Change_df, IPCC_df, Wikipedia_df, EPA_df, BBC_df, UNDP_df])

In [None]:
Lexicon_df_1.sort_values("Accuracy", ascending = False).reset_index(drop = True).head(20)

### 3.1.2. Two Lexicons Combined

In [57]:
#UNDP and EPA
EPA_UNDP_Lexicon = pd.concat([EPA_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_UNDP_df_1 = get_metrics_df_1(Lexicon_df, EPA_UNDP_Lexicon, 0, 20, "UNDP_EPA")
EPA_UNDP_df_2 = get_metrics_df_2(Lexicon_df, EPA_UNDP_Lexicon, 0, 2, 0.1, "UNDP_EPA")
EPA_UNDP_df_3 = get_metrics_df_3(Lexicon_df, EPA_UNDP_Lexicon, 0, 20, "UNDP_EPA")
EPA_UNDP_df_4 = get_metrics_df_4(Lexicon_df, EPA_UNDP_Lexicon, 0, 2, 0.1, "UNDP_EPA")

In [58]:
EPA_UNDP_df = pd.concat([EPA_UNDP_df_1, EPA_UNDP_df_2, EPA_UNDP_df_3, EPA_UNDP_df_4])

In [59]:
#UNDP and BBC
BBC_UNDP_Lexicon = pd.concat([BBC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_UNDP_df_1 = get_metrics_df_1(Lexicon_df, BBC_UNDP_Lexicon, 0, 20, "BBC_UNDP")
BBC_UNDP_df_2 = get_metrics_df_2(Lexicon_df, BBC_UNDP_Lexicon, 0, 2, 0.1, "BBC_UNDP")
BBC_UNDP_df_3 = get_metrics_df_3(Lexicon_df, BBC_UNDP_Lexicon, 0, 20, "BBC_UNDP")
BBC_UNDP_df_4 = get_metrics_df_4(Lexicon_df, BBC_UNDP_Lexicon, 0, 2, 0.1, "BBC_UNDP")

In [60]:
BBC_UNDP_df = pd.concat([BBC_UNDP_df_1, BBC_UNDP_df_2, BBC_UNDP_df_3, BBC_UNDP_df_4])

In [61]:
#UNDP and Global Change 
UNDP_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_Global_Change")
UNDP_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 20, "UNDP_Global_Change")
UNDP_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Global_Change_Lexicon, 0, 2, 0.1, "UNDP_Global_Change")

In [62]:
Global_Change_UNDP_df = pd.concat([UNDP_Global_Change_df_1, UNDP_Global_Change_df_2, UNDP_Global_Change_df_3, UNDP_Global_Change_df_4])

In [63]:
Global_Change_UNDP_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
9,UNDP_Global_Change,Relative Frequency,0.9,0.860000,4.052632,3.347826,3.666667
8,UNDP_Global_Change,Relative Frequency,0.8,0.856667,3.478261,4.000000,3.720930
10,UNDP_Global_Change,Relative Frequency,1.0,0.853333,4.500000,2.571429,3.272727
12,UNDP_Global_Change,Relative Frequency,1.2,0.846667,5.153846,2.030303,2.913043
11,UNDP_Global_Change,Relative Frequency,1.1,0.846667,4.600000,2.225806,3.000000
...,...,...,...,...,...,...,...
17,UNDP_Global_Change,Absolute Presences,17.0,0.333333,0.500000,1.000000,0.666667
16,UNDP_Global_Change,Absolute Presences,16.0,0.333333,0.500000,1.000000,0.666667
15,UNDP_Global_Change,Absolute Presences,15.0,0.333333,0.500000,1.000000,0.666667
0,UNDP_Global_Change,Relative Frequency,0.0,0.333333,0.500000,1.000000,0.666667


In [64]:
#UNDP and IPCC
UNDP_IPCC_Lexicon = pd.concat([IPCC_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_IPCC_df_1 = get_metrics_df_1(Lexicon_df, UNDP_IPCC_Lexicon, 0, 20, "UNDP_IPCC")
UNDP_IPCC_df_2 = get_metrics_df_2(Lexicon_df, UNDP_IPCC_Lexicon, 0, 2, 0.1, "UNDP_IPCC")
UNDP_IPCC_df_3 = get_metrics_df_3(Lexicon_df, UNDP_IPCC_Lexicon, 0, 20, "UNDP_IPCC")
UNDP_IPCC_df_4 = get_metrics_df_4(Lexicon_df, UNDP_IPCC_Lexicon, 0, 2, 0.1, "UNDP_IPCC")

In [65]:
IPCC_UNDP_df = pd.concat([UNDP_IPCC_df_1, UNDP_IPCC_df_2, UNDP_IPCC_df_3, UNDP_IPCC_df_4])

In [66]:
#UNDP and Wikipedia
UNDP_Wikipedia_Lexicon = pd.concat([Wikipedia_Lexicon, UNDP_Lexicon]).drop_duplicates().reset_index(drop = True)
UNDP_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 2, 0.1, "UNDP_Wikipedia")
UNDP_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 20, "UNDP_Wikipedia")
UNDP_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, UNDP_Wikipedia_Lexicon, 0, 2, 0.1, "UNDP_Wikipedia")

In [67]:
Wikipedia_UNDP_df = pd.concat([UNDP_Wikipedia_df_1, UNDP_Wikipedia_df_2, UNDP_Wikipedia_df_3, UNDP_Wikipedia_df_4])

In [68]:
Wikipedia_UNDP_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
11,UNDP_Wikipedia,Relative Frequency,1.1,0.886667,5.125000,4.555556,4.823529
12,UNDP_Wikipedia,Relative Frequency,1.2,0.886667,5.714286,4.000000,4.705882
13,UNDP_Wikipedia,Relative Frequency,1.3,0.880000,5.571429,3.545455,4.333333
10,UNDP_Wikipedia,Relative Frequency,1.0,0.880000,3.909091,6.142857,4.777778
15,UNDP_Wikipedia,Relative Frequency,1.5,0.876667,6.250000,3.000000,4.054054
...,...,...,...,...,...,...,...
19,UNDP_Wikipedia,Absolute Presences,19.0,0.333333,0.500000,1.000000,0.666667
18,UNDP_Wikipedia,Absolute Presences,18.0,0.333333,0.500000,1.000000,0.666667
17,UNDP_Wikipedia,Absolute Presences,17.0,0.333333,0.500000,1.000000,0.666667
0,UNDP_Wikipedia,Relative Frequency,0.0,0.333333,0.500000,1.000000,0.666667


In [69]:
Lexicon_df_2 = pd.concat([EPA_UNDP_df, BBC_UNDP_df, Global_Change_UNDP_df, IPCC_UNDP_df, Wikipedia_UNDP_df]).reset_index(drop = True)

In [70]:
Lexicon_df_2.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
30,UNDP_EPA,Relative Frequency,0.9,0.886667,3.869565,8.090909,5.235294
114,BBC_UNDP,Relative Frequency,0.9,0.886667,3.869565,8.090909,5.235294
368,UNDP_Wikipedia,Relative Frequency,1.1,0.886667,5.125000,4.555556,4.823529
369,UNDP_Wikipedia,Relative Frequency,1.2,0.886667,5.714286,4.000000,4.705882
29,UNDP_EPA,Relative Frequency,0.8,0.883333,3.407407,11.500000,5.257143
...,...,...,...,...,...,...,...
262,UNDP_IPCC,Absolute Frequency,10.0,0.326667,0.471503,10.111111,0.900990
260,UNDP_IPCC,Absolute Frequency,8.0,0.326667,0.474227,11.500000,0.910891
258,UNDP_IPCC,Absolute Frequency,6.0,0.326667,0.484848,24.000000,0.950495
263,UNDP_IPCC,Absolute Frequency,11.0,0.326667,0.471503,10.111111,0.900990


In [71]:
#EPA and BBC
EPA_BBC_Lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_BBC_df_1 = get_metrics_df_1(Lexicon_df, EPA_BBC_Lexicon, 0, 20, "BBC_EPA")
EPA_BBC_df_2 = get_metrics_df_2(Lexicon_df, EPA_BBC_Lexicon, 0, 2, 0.1, "BBC_EPA")
EPA_BBC_df_3 = get_metrics_df_3(Lexicon_df, EPA_BBC_Lexicon, 0, 20, "BBC_EPA")
EPA_BBC_df_4 = get_metrics_df_4(Lexicon_df, EPA_BBC_Lexicon, 0, 2, 0.1, "BBC_EPA")

In [72]:
EPA_BBC_df = pd.concat([EPA_BBC_df_1, EPA_BBC_df_2, EPA_BBC_df_3, EPA_BBC_df_4])

In [73]:
EPA_BBC_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
6,BBC_EPA,Relative Frequency,0.6,0.886667,3.538462,11.500000,5.411765
10,BBC_EPA,Relative Frequency,1.0,0.883333,5.062500,4.263158,4.628571
9,BBC_EPA,Relative Frequency,0.9,0.883333,4.611111,4.882353,4.742857
8,BBC_EPA,Relative Frequency,0.8,0.883333,4.250000,5.666667,4.857143
7,BBC_EPA,Relative Frequency,0.7,0.880000,3.782609,6.692308,4.833333
...,...,...,...,...,...,...,...
19,BBC_EPA,Absolute Presences,19.0,0.333333,0.500000,1.000000,0.666667
18,BBC_EPA,Absolute Presences,18.0,0.333333,0.500000,1.000000,0.666667
17,BBC_EPA,Absolute Presences,17.0,0.333333,0.500000,1.000000,0.666667
0,BBC_EPA,Relative Frequency,0.0,0.333333,0.500000,1.000000,0.666667


In [74]:
#EPA and Global Change
EPA_Global_Change_Lexicon = pd.concat([EPA_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, EPA_Global_Change_Lexicon, 0, 20, "EPA_GLobal_Change")
EPA_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, EPA_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")
EPA_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, EPA_Global_Change_Lexicon, 0, 20, "BBC_Global_Change")
EPA_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, EPA_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")

In [75]:
EPA_Global_Change_df = pd.concat([EPA_Global_Change_df_1, EPA_Global_Change_df_2, EPA_Global_Change_df_3, EPA_Global_Change_df_4])

In [76]:
EPA_Global_Change_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
12,BBC_Global_Change,Relative Frequency,1.2,0.876667,4.000000,5.250000,4.540541
10,BBC_Global_Change,Relative Frequency,1.0,0.873333,3.214286,9.000000,4.736842
11,BBC_Global_Change,Relative Frequency,1.1,0.873333,3.480000,6.692308,4.578947
13,BBC_Global_Change,Relative Frequency,1.3,0.870000,3.904762,4.555556,4.205128
18,BBC_Global_Change,Relative Frequency,1.8,0.866667,5.615385,2.703704,3.650000
...,...,...,...,...,...,...,...
7,BBC_Global_Change,Relative Presences,0.7,0.453333,0.609756,0.000000,0.000000
0,EPA_GLobal_Change,Absolute Frequency,0.0,0.333333,0.500000,1.000000,0.666667
0,BBC_Global_Change,Relative Presences,0.0,0.333333,0.500000,1.000000,0.666667
0,BBC_Global_Change,Relative Frequency,0.0,0.333333,0.500000,1.000000,0.666667


In [77]:
#EPA and IPCC
EPA_IPCC_Lexicon = pd.concat([EPA_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_IPCC_df_1 = get_metrics_df_1(Lexicon_df, EPA_IPCC_Lexicon, 0, 20, "EPA_IPCC")
EPA_IPCC_df_2 = get_metrics_df_2(Lexicon_df, EPA_IPCC_Lexicon, 0, 2, 0.1, "EPA_IPCC")
EPA_IPCC_df_3 = get_metrics_df_3(Lexicon_df, EPA_IPCC_Lexicon, 0, 20, "EPA_IPCC")
EPA_IPCC_df_4 = get_metrics_df_4(Lexicon_df, EPA_IPCC_Lexicon, 0, 2, 0.1, "EPA_IPCC")

In [78]:
EPA_IPCC_df = pd.concat([EPA_IPCC_df_1, EPA_IPCC_df_2, EPA_IPCC_df_3, EPA_IPCC_df_4])

In [79]:
EPA_IPCC_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
20,EPA_IPCC,Absolute Presences,20.0,0.730000,1.863636,0.694915,1.012346
19,EPA_IPCC,Absolute Presences,19.0,0.730000,1.730769,0.818182,1.111111
18,EPA_IPCC,Absolute Presences,18.0,0.723333,1.500000,1.040816,1.228916
17,EPA_IPCC,Absolute Presences,17.0,0.716667,1.384615,1.173913,1.270588
16,EPA_IPCC,Absolute Presences,16.0,0.713333,1.285714,1.702703,1.465116
...,...,...,...,...,...,...,...
2,EPA_IPCC,Absolute Presences,2.0,0.333333,0.500000,1.000000,0.666667
20,EPA_IPCC,Relative Presences,2.0,0.333333,0.500000,1.000000,0.666667
10,EPA_IPCC,Absolute Frequency,10.0,0.330000,0.476684,11.500000,0.915423
11,EPA_IPCC,Absolute Frequency,11.0,0.326667,0.471503,10.111111,0.900990


In [80]:
#EPA and Wikipedia
EPA_Wikipedia_Lexicon = pd.concat([EPA_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
EPA_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_Wikipedia")
EPA_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 20, "EPA_Wikipedia")
EPA_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, EPA_Wikipedia_Lexicon, 0, 2, 0.1, "EPA_Wikipedia")

In [81]:
EPA_Wikipedia_df = pd.concat([EPA_Wikipedia_df_1, EPA_Wikipedia_df_2, EPA_Wikipedia_df_3, EPA_Wikipedia_df_4])

In [82]:
EPA_Wikipedia_df.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
10,EPA_Wikipedia,Relative Frequency,1.0,0.896667,3.760000,15.666667,6.064516
15,EPA_Wikipedia,Relative Frequency,1.5,0.893333,5.533333,4.882353,5.187500
11,EPA_Wikipedia,Relative Frequency,1.1,0.893333,3.956522,10.111111,5.687500
17,EPA_Wikipedia,Relative Frequency,1.7,0.890000,6.153846,4.000000,4.848485
16,EPA_Wikipedia,Relative Frequency,1.6,0.890000,5.466667,4.555556,4.969697
...,...,...,...,...,...,...,...
7,EPA_Wikipedia,Relative Presences,0.7,0.526667,0.704225,0.000000,0.000000
0,EPA_Wikipedia,Absolute Frequency,0.0,0.333333,0.500000,1.000000,0.666667
0,EPA_Wikipedia,Relative Presences,0.0,0.333333,0.500000,1.000000,0.666667
0,EPA_Wikipedia,Relative Frequency,0.0,0.333333,0.500000,1.000000,0.666667


In [83]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, EPA_BBC_df, EPA_Global_Change_df, EPA_IPCC_df, EPA_Wikipedia_df]).reset_index(drop = True)

In [84]:
Lexicon_df_2.sort_values("Accuracy", ascending = False)

Unnamed: 0,Lexicon,Technique,Treshhold,Accuracy,Precision,Recall,F1 Score
703,EPA_Wikipedia,Relative Frequency,1.0,0.896667,3.760000,15.666667,6.064516
704,EPA_Wikipedia,Relative Frequency,1.1,0.893333,3.956522,10.111111,5.687500
708,EPA_Wikipedia,Relative Frequency,1.5,0.893333,5.533333,4.882353,5.187500
710,EPA_Wikipedia,Relative Frequency,1.7,0.890000,6.153846,4.000000,4.848485
709,EPA_Wikipedia,Relative Frequency,1.6,0.890000,5.466667,4.555556,4.969697
...,...,...,...,...,...,...,...
599,EPA_IPCC,Absolute Frequency,11.0,0.326667,0.471503,10.111111,0.900990
260,UNDP_IPCC,Absolute Frequency,8.0,0.326667,0.474227,11.500000,0.910891
262,UNDP_IPCC,Absolute Frequency,10.0,0.326667,0.471503,10.111111,0.900990
263,UNDP_IPCC,Absolute Frequency,11.0,0.326667,0.471503,10.111111,0.900990


In [87]:
#BBC and Global Change
#EPA and Wikipedia
BBC_Global_Change_Lexicon = pd.concat([BBC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, BBC_Global_Change_Lexicon, 0, 20, "BBC_Global_Change")
BBC_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, BBC_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")
BBC_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, BBC_Global_Change_Lexicon, 0, 20, "BBC_Global_Change")
BBC_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, BBC_Global_Change_Lexicon, 0, 2, 0.1, "BBC_Global_Change")

In [88]:
BBC_Global_Change_df = pd.concat([BBC_Global_Change_df_1, BBC_Global_Change_df_2, BBC_Global_Change_df_3, BBC_Global_Change_df_4])

In [89]:
#BBC and IPCC
BBC_IPCC_Lexicon = pd.concat([BBC_Lexicon, IPCC_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_IPCC_df_1 = get_metrics_df_1(Lexicon_df, BBC_IPCC_Lexicon, 0, 20, "BBC_IPCC")
BBC_IPCC_df_2 = get_metrics_df_2(Lexicon_df, BBC_IPCC_Lexicon, 0, 2, 0.1, "BBC_IPCC")
BBC_IPCC_df_3 = get_metrics_df_3(Lexicon_df, BBC_IPCC_Lexicon, 0, 20, "BBC_IPCC")
BBC_IPCC_df_4 = get_metrics_df_4(Lexicon_df, BBC_IPCC_Lexicon, 0, 2, 0.1, "BBC_IPCC")

In [90]:
BBC_IPCC_df = pd.concat([BBC_IPCC_df_1, BBC_IPCC_df_2, BBC_IPCC_df_3, BBC_IPCC_df_4])

In [91]:
#BBC and Wikipedia
BBC_Wikipedia_Lexicon = pd.concat([BBC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
BBC_Wikipedia_df_1 = get_metrics_df_1(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_2 = get_metrics_df_2(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 2, 0.1, "BBC_Wikipedia")
BBC_Wikipedia_df_3 = get_metrics_df_3(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 20, "BBC_Wikipedia")
BBC_Wikipedia_df_4 = get_metrics_df_4(Lexicon_df, BBC_Wikipedia_Lexicon, 0, 2, 0.1, "BBC_Wikipedia")

In [92]:
BBC_Wikpedia_df = pd.concat([BBC_Wikipedia_df_1, BBC_Wikipedia_df_2, BBC_Wikipedia_df_3, BBC_Wikipedia_df_4])

In [93]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, BBC_Global_Change_df, BBC_IPCC_df, BBC_Wikpedia_df]).reset_index(drop = True)

In [None]:
#Wikipedia and Global Change
Wikipedia_Global_Change_Lexicon = pd.concat([Global_Change_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_Global_Change_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 20, "Wikipedia_Global_Change")
Wikipedia_Global_Change_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_Global_Change_Lexicon, 0, 2, 0.1, "Wikipedia_Global_Change")

In [None]:
Wikpedia_Global_Change_df = pd.concat([Wikipedia_Global_Change_df_1, Wikipedia_Global_Change_df_2, Wikipedia_Global_Change_df_3, Wikipedia_Global_Change_df_4])

In [None]:
#Wikipedia and IPCC
Wikipedia_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Wikipedia_Lexicon]).drop_duplicates().reset_index(drop = True)
Wikipedia_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "Wikipedia_IPCC")
Wikipedia_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 20, "Wikipedia_IPCC")
Wikipedia_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Wikipedia_IPCC_Lexicon, 0, 2, 0.1, "Wikipedia_IPCC")

In [None]:
Wikpedia_IPCC_df = pd.concat([Wikipedia_IPCC_df_1, Wikipedia_IPCC_df_2, Wikipedia_IPCC_df_3, Wikipedia_IPCC_df_4])

In [None]:
#IPCC and Global Change
Global_Change_IPCC_Lexicon = pd.concat([IPCC_Lexicon, Global_Change_Lexicon]).drop_duplicates().reset_index(drop = True)
Global_Change_IPCC_df_1 = get_metrics_df_1(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_2 = get_metrics_df_2(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 2, 0.1, "Global_Change_IPCC")
Global_Change_IPCC_df_3 = get_metrics_df_3(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 20, "Global_Change_IPCC")
Global_Change_IPCC_df_4 = get_metrics_df_4(Lexicon_df, Global_Change_IPCC_Lexicon, 0, 2, 0.1, "Global_Change_IPCC")

In [None]:
Global_Change_IPCC_df = pd.concat([Global_Change_IPCC_df_1, Global_Change_IPCC_df_2, Global_Change_IPCC_df_3, Global_Change_IPCC_df_4])

In [None]:
Lexicon_df_2 = pd.concat([Lexicon_df_2, Wikpedia_Global_Change_df, Wikpedia_IPCC_df, Global_Change_IPCC_df]).reset_index(drop = True)

In [None]:
Lexicon_df_2.sort_values("Accuracy", ascending = False)