# 0. Packages & Functions

In [1]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
import pyarrow.parquet as pq
import pyarrow as pyarrow
import numpy as np
from itertools import combinations
import time
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm
import re

In [2]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    """Normalise a raw text for the lexicon-based approaches.

    Every character listed in ``string.punctuation`` is deleted, the text is
    lower-cased, and each run of whitespace (spaces, tabs, newlines) is
    collapsed to a single space with leading/trailing whitespace removed.

    NOTE(review): hyphens, commas and slashes are deleted too, so text such as
    "net-zero" becomes "netzero" -- confirm this is intended for lexicon terms
    that themselves contain punctuation.
    """
    delete_punctuation = dict.fromkeys(map(ord, string.punctuation))
    lowered = text.translate(delete_punctuation).lower()
    # split() with no argument already consumes newlines, so joining the
    # tokens leaves no newline characters behind.
    return ' '.join(lowered.split())

def text_cleaning_final(text):
    """Collapse whitespace and pad every punctuation mark with spaces.

    Runs of whitespace (including newlines) become a single space, then each
    ASCII punctuation character is replaced by itself surrounded by one space
    on each side. Case is left untouched.
    """
    single_spaced = ' '.join(text.split())
    # One capture group holding a single punctuation character.
    punctuation_re = re.compile(r'([!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~])')
    return punctuation_re.sub(r' \1 ', single_spaced)

def text_cleaning_append(text):
    """Return ``text`` with every run of whitespace reduced to one space.

    Leading and trailing whitespace is removed; newlines count as whitespace,
    so none remain in the result.
    """
    tokens = text.split()
    return ' '.join(tokens)

In [3]:
def count_lexicon_words(text_df, lexicon):
    """Score every text by how often lexicon terms occur in it.

    Parameters
    ----------
    text_df : DataFrame with a "Text" column of strings.
    lexicon : DataFrame with a "Lexicon" column of terms.

    Returns
    -------
    The same ``text_df`` object, modified in place, with four columns added or
    overwritten:

    * "Absolute Frequency" -- total number of term occurrences in the text
    * "Absolute Present"   -- number of distinct terms occurring at least once
    * "Relative Frequency" -- Absolute Frequency per 100 whitespace-separated words
    * "Relative Present"   -- Absolute Present per 100 whitespace-separated words

    Matching is case-insensitive substring counting (``str.count``), so a term
    also matches inside a longer word. Texts without any word get 0.0 for both
    relative scores instead of raising ZeroDivisionError.
    """
    # Lower-case the terms once instead of once per text.
    terms = [word.lower() for word in lexicon["Lexicon"]]

    frequency = []
    present = []
    rfrequency = []
    rpresent = []

    for text in tqdm(text_df["Text"]):
        lowered_text = text.lower()
        # One count per term; both absolute scores are derived from it.
        occurrences = [lowered_text.count(term) for term in terms]
        lexicon_counts = sum(occurrences)
        present_count = sum(1 for hits in occurrences if hits > 0)

        word_count = len(text.split())

        frequency.append(lexicon_counts)
        present.append(present_count)
        if word_count:
            rfrequency.append((lexicon_counts / word_count) * 100)
            rpresent.append((present_count / word_count) * 100)
        else:
            # Empty or whitespace-only text: relative scores are defined as 0.
            rfrequency.append(0.0)
            rpresent.append(0.0)

    text_df["Absolute Frequency"] = frequency
    text_df["Absolute Present"] = present
    text_df["Relative Frequency"] = rfrequency
    text_df["Relative Present"] = rpresent

    return text_df

def get_metrics(df, colname, threshold):
    """Threshold a score column and evaluate it against the binary "Target".

    A row is estimated "Yes" when ``df[colname] >= threshold`` and "No"
    otherwise; the estimate is written to ``df["Estimate"]`` (in place).
    "Yes" in ``df["Target"]`` is the positive class.

    Returns
    -------
    list ``[accuracy, precision, recall, f1_score]``. A metric whose
    denominator is zero is reported as 0.

    The confusion counts are taken from boolean masks rather than from
    crosstab positions, so the result stays correct when all rows receive the
    same estimate (a crosstab then has only one estimate column and positional
    indexing reads the wrong cells).
    """
    predicted_yes = df[colname] >= threshold
    actual_yes = df["Target"] == "Yes"

    df["Estimate"] = ["Yes" if flag else "No" for flag in predicted_yes]

    true_pos = int((predicted_yes & actual_yes).sum())
    false_pos = int((predicted_yes & ~actual_yes).sum())
    false_neg = int((~predicted_yes & actual_yes).sum())
    true_neg = int((~predicted_yes & ~actual_yes).sum())
    total = true_pos + false_pos + false_neg + true_neg

    accuracy = (true_pos + true_neg) / total if total else 0
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    return [accuracy, precision, recall, f1_score]

def get_gross_table_data(df, colname, threshold, binary):
    """Cross-tabulate the thresholded estimate against the ground truth.

    Rows with ``df[colname] >= threshold`` are estimated "Yes", all others
    "No"; the estimate is stored in ``df["Estimate"]`` (in place).

    When ``binary`` is truthy the rows of the returned table are the values of
    "Target", otherwise those of "Final_Climate_Change_Level_Label". Row and
    column totals ("All") are included.
    """
    df["Estimate"] = ["Yes" if value >= threshold else "No" for value in df[colname]]

    truth_column = 'Target' if binary else 'Final_Climate_Change_Level_Label'
    return pd.crosstab(df[truth_column], df['Estimate'], margins=True)
    
def _climb_threshold(df, colname, step):
    """Raise the threshold on ``colname`` in ``step`` increments while accuracy strictly improves.

    Returns ``(threshold, metrics)`` where ``metrics`` is the list returned by
    ``get_metrics`` at exactly that threshold.
    """
    best_threshold = 0
    best_metrics = get_metrics(df, colname, best_threshold)
    k = 1
    while True:
        # Multiply and round instead of accumulating `+= step`, so float steps
        # do not drift (e.g. 0.7999999999999999) and the reported threshold is
        # the value that was really evaluated.
        candidate = round(k * step, 10)
        candidate_metrics = get_metrics(df, colname, candidate)
        if candidate_metrics[0] > best_metrics[0]:
            best_threshold, best_metrics = candidate, candidate_metrics
            k += 1
        else:
            return best_threshold, best_metrics


def find_optimal_threshold(df, lexicon, lexicon_name):
    """Find, per scoring technique, the threshold at which accuracy stops improving.

    ``df`` is first scored with ``count_lexicon_words(df, lexicon)``. For each
    of the four score columns the threshold starts at 0 and is raised (by 1
    for the absolute scores, by 0.1 for the relative scores) for as long as
    the accuracy from ``get_metrics`` strictly increases; the search stops at
    the first step that does not improve, so it finds the first local optimum.

    Returns
    -------
    DataFrame with one row per technique and the columns "Lexicon",
    "Technique", "Threshold", "Accuracy", "Precision", "Recall", "F1 Score".
    The metrics are those measured at the reported threshold. If accuracy at
    threshold 0 is never exceeded, threshold 0 and its metrics are reported.
    """
    df = count_lexicon_words(df, lexicon)

    # (score column, step size)
    searches = [
        ("Absolute Frequency", 1),
        ("Absolute Present", 1),
        ("Relative Frequency", 0.1),
        ("Relative Present", 0.1),
    ]

    techniques = []
    thresholds = []
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    for colname, step in searches:
        threshold, metrics = _climb_threshold(df, colname, step)
        techniques.append(colname)
        thresholds.append(threshold)
        accuracies.append(metrics[0])
        precisions.append(metrics[1])
        recalls.append(metrics[2])
        f1_scores.append(metrics[3])

    return pd.DataFrame({"Lexicon": [lexicon_name] * len(searches),
                         "Technique": techniques,
                         "Threshold": thresholds,
                         "Accuracy": accuracies,
                         "Precision": precisions,
                         "Recall": recalls,
                         "F1 Score": f1_scores})
    
def test_lexicon(test_df, results_df, lexicon, lexicon_name):
    techniques = ["Absolute Frequency", "Absolute Present", "Relative Frequency", "Relative Present"]
    df = count_lexicon_words(test_df, lexicon)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    th_df = []
    for t in range(len(techniques)):
        th = results_df[results_df["Technique"] == techniques[t]]["Threshold"].iloc[0]
        th_df.append(th)
        accuracy.append(get_metrics(df, techniques[t], th)[0])
        precision.append(get_metrics(df, techniques[t], th)[1])
        recall.append(get_metrics(df, techniques[t], th)[2])
        f1.append(get_metrics(df, techniques[t], th)[3])
    
    return(pd.DataFrame({"Lexicon" : [lexicon_name] * 4, "Technique" : techniques, "Threshold" : th_df ,
                         "Test Accuracy" : accuracy, 
                        "Test Precision" : precision, 
                        "Test Recall" : recall, 
                        "Test F1 Score" : f1}))



def get_cross_table(text_df, lexicon, threshold, colname, binary):
    """Score ``text_df`` with ``lexicon`` and return the estimate-vs-truth cross table.

    Convenience wrapper: the frame returned by ``count_lexicon_words`` is
    passed straight to ``get_gross_table_data`` together with ``colname``,
    ``threshold`` and ``binary``.
    """
    return get_gross_table_data(count_lexicon_words(text_df, lexicon), colname, threshold, binary)

def get_most_words_used(text_df, lexicon):
    """Count, for every lexicon term, the number of texts that contain it.

    Matching is a case-insensitive substring test. The result is a dict keyed
    by the term exactly as it appears in ``lexicon["Lexicon"]``; terms that
    occur in no text are absent. A text counts once per term no matter how
    often the term occurs in it.
    """
    terms = lexicon["Lexicon"]
    output = {}

    for text in tqdm(text_df["Text"]):
        for term in terms:
            if term.lower() in text.lower():
                output[term] = output.get(term, 0) + 1

    return output

In [4]:
def summary(df_with_text, name, model_name, max_lenght_input=-1):
    """Summarise the texts in column ``name`` with a Hugging Face seq2seq model.

    Parameters
    ----------
    df_with_text : DataFrame holding the texts.
    name : name of the text column.
    model_name : identifier passed to ``from_pretrained`` for both the
        tokenizer and the model.
    max_lenght_input : when >= 0 it is forwarded to the pipeline as
        ``max_length``; a negative value (the default) uses the pipeline
        defaults.

    Returns
    -------
    ``df_with_text`` itself, with the pipeline output stored in a new
    "summary" column (the frame is modified in place). Returning the frame
    mirrors ``classification`` so both helpers can be used the same way.
    """
    data_in_list = df_with_text[name].tolist()
    tokenizer_sum = AutoTokenizer.from_pretrained(model_name)
    model_sum = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline('summarization', model=model_sum, tokenizer=tokenizer_sum)

    if max_lenght_input >= 0:
        df_with_text['summary'] = summarizer(data_in_list, max_length=max_lenght_input)
    else:
        df_with_text['summary'] = summarizer(data_in_list)

    return df_with_text

def classification(df_with_text, name, model_name, max_lenght_input=-1):
    """Classify the texts in column ``name`` with a Hugging Face model.

    The tokenizer and sequence-classification model are both loaded from
    ``model_name``. When ``max_lenght_input`` is >= 0 the pipeline is called
    with ``max_length=max_lenght_input`` and ``truncation=True``; otherwise it
    is called with its defaults.

    Returns ``df_with_text`` itself with the pipeline output stored in a new
    "classification" column (the frame is modified in place).
    """
    texts = df_with_text[name].tolist()
    tokenizer_clas = AutoTokenizer.from_pretrained(model_name)
    model_clas = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline('text-classification', model=model_clas, tokenizer=tokenizer_clas)

    call_options = {}
    if max_lenght_input >= 0:
        call_options = {"max_length": max_lenght_input, "truncation": True}

    df_with_text['classification'] = classifier(texts, **call_options)
    return df_with_text

def get_metrics_hugging_face(text_df, text_column, model, tokens):
    """Classify ``text_df`` with a Hugging Face model and print its metrics.

    The texts are classified through ``classification``; the predicted label
    of every row is stored in ``df["Label_Hugging"]``. Accuracy, precision,
    recall and F1 are read from the Target-vs-label cross table and printed,
    followed by that cross table and the cross table against
    "Final_Climate_Change_Level_Label". Nothing is returned.

    NOTE(review): the metrics are read by position from the cross table, so
    they assume the negative class sorts before the positive class for both
    "Target" and the model's labels, and that a table with a single predicted
    label contains only the positive one -- confirm per model.
    """
    df = classification(text_df, text_column, model, tokens)
    df["Label_Hugging"] = [prediction['label'] for prediction in df["classification"]]

    cross_table = pd.crosstab(df['Target'], df['Label_Hugging'], margins=True)
    cell = cross_table.iloc

    # Metrics are derived by hand from the cross table cells.
    if cross_table.shape == (3, 3):
        accuracy = (cell[0, 0] + cell[1, 1]) / cross_table.loc['All', 'All']
        precision = cell[1, 1] / (cell[0, 1] + cell[1, 1])
        recall = cell[1, 1] / (cell[1, 0] + cell[1, 1])
    else:
        accuracy = cell[1, 0] / (cell[1, 0] + cell[0, 0])
        precision = 0
        recall = 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # print the metrics
    for caption, value in (("Accuracy:", accuracy), ("Precision:", precision),
                           ("Recall:", recall), ("F1 score:", f1_score)):
        print(caption, value)
    print(cross_table)
    print("\n")
    print(pd.crosstab(df['Final_Climate_Change_Level_Label'], df['Label_Hugging'], margins=True))
    

def get_metrics_df_hugging_face(text_df, text_column, model, tokens, model_name):
    """Classify ``text_df`` with a Hugging Face model and return its metrics.

    The texts are classified through ``classification``; the predicted label
    of every row is stored in ``df["Label_Hugging"]``. The metrics are read
    from the Target-vs-label cross table.

    Returns a one-row DataFrame with the columns "Model" (``model_name``),
    "Accuracy", "Precision", "Recall" and "F1 Score".

    NOTE(review): the metrics are read by position from the cross table, so
    they assume the negative class sorts before the positive class for both
    "Target" and the model's labels, and that a table with a single predicted
    label contains only the positive one -- confirm per model.
    """
    df = classification(text_df, text_column, model, tokens)
    df["Label_Hugging"] = [prediction['label'] for prediction in df["classification"]]

    cross_table = pd.crosstab(df['Target'], df['Label_Hugging'], margins=True)
    cell = cross_table.iloc

    # Metrics are derived by hand from the cross table cells.
    if cross_table.shape == (3, 3):
        accuracy = (cell[0, 0] + cell[1, 1]) / cross_table.loc['All', 'All']
        precision = cell[1, 1] / (cell[0, 1] + cell[1, 1])
        recall = cell[1, 1] / (cell[1, 0] + cell[1, 1])
    else:
        accuracy = cell[1, 0] / (cell[1, 0] + cell[0, 0])
        precision = 0
        recall = 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    metrics_row = {"Model": [model_name], "Accuracy": [accuracy], "Precision": [precision],
                   "Recall": [recall], "F1 Score": [f1_score]}
    return pd.DataFrame(metrics_row)

# 1. Labels

In [5]:
tag_climate_df = pd.read_parquet("Climate_Labels_Dataset.parquet")

# Clean the table: strip stray whitespace from the level label and fold the
# placeholder values "NA", "0" and "Na" into the single category "No Climate".
level_column = "Final_Climate_Change_Level_Label"
tag_climate_df[level_column] = tag_climate_df[level_column].str.strip()
placeholder_rows = tag_climate_df[level_column].isin(["NA", "0", "Na"])
tag_climate_df.loc[placeholder_rows, level_column] = "No Climate"

# Binary target: "High" and "Medium" count as climate-related, everything else does not.
is_climate_related = tag_climate_df[level_column].isin(["High", "Medium"])
tag_climate_df["Target"] = is_climate_related.map({True: "Yes", False: "No"})
tag_climate_df

Unnamed: 0,Text,Link,Final_Climate_Change_Level_Label,Final_Sentiment_Label,Target
0,"On a Train trip north toward Aberdeen, the Sc...",https://www.wsj.com/articles/surfacing-review-...,Medium,-1,Yes
1,A container area at the Yangshan Deep Water Po...,https://www.washingtonpost.com/news/monkey-cag...,No Climate,0,No
2,"This week, New York City observed an annual r...",https://www.wsj.com/articles/lets-get-the-un-o...,No Climate,0,No
3,"During a visit to Detroit last year, Presiden...",https://www.wsj.com/articles/make-cars-great-a...,High,1,Yes
4,Investors holding more than $5 billion in Exx...,http://www.wsj.com/articles/calpers-pushes-exx...,High,1,Yes
...,...,...,...,...,...
495,No spending cuts to Medicaid? Then no tax cuts...,https://www.washingtonpost.com/news/fact-check...,No Climate,0,No
496,"The U.S. dollar edged higher Tuesday, maintai...",https://www.wsj.com/articles/u-s-dollar-edges-...,No Climate,0,No
497,Lots of today’s board games are just jazzed-u...,http://www.wsj.com/articles/the-many-guises-of...,No Climate,0,No
498,"The new $1,000 iPhone 11 Pro and the $1,100 iP...",https://www.washingtonpost.com/technology/2019...,No Climate,0,No


In [6]:
# Number of labelled texts per climate-change level.
overview_labels_hms = tag_climate_df.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()
overview_labels_hms

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,57
1,Medium,42
2,No Climate,307
3,Small,94


In [7]:
# Number of labelled texts per binary target ("Yes" = High/Medium level, "No" = everything else).
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()
overview_labels

Unnamed: 0,Target,Text
0,No,401
1,Yes,99


## Split into train and test sets

In [8]:
# Hold out 30% of the labelled texts for testing; the fixed seed keeps the
# partition reproducible. Both parts get a fresh 0..n-1 index.
# NOTE(review): the split is not stratified on "Target" -- confirm that is intended.
train_part, test_part = train_test_split(tag_climate_df, test_size=0.3, random_state=23)
df_train, df_test = train_part.reset_index(drop=True), test_part.reset_index(drop=True)

In [9]:
# Training set: number of texts per climate-change level.
df_train.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,38
1,Medium,35
2,No Climate,213
3,Small,64


In [10]:
# Training set: number of texts per binary target.
df_train.groupby("Target")["Text"].count().reset_index()

Unnamed: 0,Target,Text
0,No,277
1,Yes,73


In [11]:
# Test set: number of texts per climate-change level.
df_test.groupby("Final_Climate_Change_Level_Label")["Text"].count().reset_index()

Unnamed: 0,Final_Climate_Change_Level_Label,Text
0,High,19
1,Medium,7
2,No Climate,94
3,Small,30


In [12]:
# Test set: number of texts per binary target.
df_test.groupby("Target")["Text"].count().reset_index()

Unnamed: 0,Target,Text
0,No,124
1,Yes,26


# 2. Lexicons

In [13]:
# Load the UNDP lexicon as a single lower-cased "Lexicon" column.
# Duplicates are dropped after lower-casing and on the term column only, so
# entries that differ just by case (or by other CSV columns) collapse to one.
UNDP_Lexicon = (
    pd.read_csv("Lexicons/UNDP_Lexicon")[["Lexicon"]]
    .assign(Lexicon=lambda frame: frame["Lexicon"].str.lower())
    .drop_duplicates()
    .reset_index(drop=True)
)
list(UNDP_Lexicon["Lexicon"])

['weather',
 'climate',
 'greenhouse gases',
 'greenhouse gas emmisions',
 'global warming',
 'climate change',
 'climate crisis',
 'feedback loop',
 'tipping point',
 'climate overshoot',
 'mitigation',
 'adaptation',
 'resilience',
 'carbon footprint',
 'climate justice',
 'nature-based solutions',
 'indigenous knowledge',
 'loss and damage',
 'climate security',
 'climate finance',
 'net zero',
 'decarbonization',
 'renewable energy',
 'carbon sink',
 'carbon removal',
 'carbon capture',
 'carbon markets',
 'regenerative agriculture',
 'reforestation',
 'afforestation',
 'rewilding',
 'circular economy',
 'blue economy',
 'green jobs',
 'greenwashing',
 'just transition',
 ' unfccc ',
 'conference of the parties',
 ' cop ',
 'paris agreement',
 'nationally determined contributions',
 'transparent reporting',
 'transparency',
 'national adaptation plans',
 'long-term strategies',
 'intergovernmental panel on climate change']

In [14]:
# Load the IPCC lexicon as a single lower-cased "Lexicon" column.
# Duplicates are dropped after lower-casing and on the term column only, so
# entries that differ just by case (or by other CSV columns) collapse to one.
IPCC_Lexicon = (
    pd.read_csv("Lexicons/IPCC_Lexicon")[["Lexicon"]]
    .assign(Lexicon=lambda frame: frame["Lexicon"].str.lower())
    .drop_duplicates()
    .reset_index(drop=True)
)
list(IPCC_Lexicon["Lexicon"])

['acceptability of policy or system change',
 'adaptability',
 'adaptation',
 'adaptation behaviour',
 'adaptation limits',
 'adaptation options',
 'adaptation pathways',
 'adaptive capacity',
 'adaptive governance',
 'aerosol',
 'afforestation',
 'agreement',
 'air pollution',
 'albedo',
 'ambient persuasive technology',
 'anomaly',
 'anthropocene',
 'anthropogenic',
 'anthropogenic emissions',
 'anthropogenic removals',
 'artificial intelligence',
 'atmosphere',
 'atmosphere–ocean general circulation model',
 'attribution',
 'baseline scenario',
 'battery electric vehicle',
 'biochar',
 'biodiversity',
 'bioenergy',
 'bioenergy with carbon dioxide capture and storage',
 'biofuel',
 'biomass',
 'biophilic urbanism',
 'black carbon',
 'blue carbon',
 'burden sharing',
 'effort sharing',
 'business as usual',
 'carbon budget',
 'carbon cycle',
 'carbon dioxide',
 ' co2 ',
 'carbon dioxide capture and storage',
 'carbon dioxide capture and utilisation',
 'carbon dioxide capture, utilisat

In [15]:
# Load the EPA lexicon as a single lower-cased "Lexicon" column.
# Duplicates are dropped after lower-casing and on the term column only, so
# entries that differ just by case (or by other CSV columns) collapse to one.
EPA_Lexicon = (
    pd.read_csv("Lexicons/EPA_Lexicon")[["Lexicon"]]
    .assign(Lexicon=lambda frame: frame["Lexicon"].str.lower())
    .drop_duplicates()
    .reset_index(drop=True)
)

list(EPA_Lexicon["Lexicon"])

['abrupt climate change',
 'adaptation',
 'adaptive capacity',
 'aerosols',
 'afforestation',
 'albedo',
 'alternative energy',
 'annex i countries/parties',
 'anthropogenic',
 'atmosphere',
 'atmospheric lifetime',
 'biofuels',
 'biogeochemical cycle',
 'biomass',
 'biosphere',
 'black carbon aerosol',
 'borehole',
 'carbon cycle',
 'carbon dioxide',
 'carbon dioxide equivalent',
 'carbon dioxide fertilization',
 'carbon footprint',
 'carbon sequestration',
 'carbon capture and sequestration',
 'chlorofluorocarbons',
 'climate',
 'climate change',
 'climate feedback',
 'climate lag',
 'climate model',
 'climate sensitivity',
 'climate system',
 'coal mine methane',
 'coalbed methane',
 'co-benefit',
 'concentration',
 'conference of the parties',
 'coral bleaching',
 'cryosphere',
 'deforestation',
 'desertification',
 'dryland farming',
 'eccentricity',
 'ecosystem',
 'el niño - southern oscillation',
 'emissions',
 'emissions factor',
 'energy efficiency',
 'energy star',
 'enhanced

In [16]:
# Load the Wikipedia lexicon as a single lower-cased "Lexicon" column.
# Only the term column is kept so that de-duplication here, and after
# concatenation with other lexicons, compares terms rather than whole CSV rows.
# Duplicates are dropped after lower-casing so case variants collapse to one.
Wikipedia_Lexicon = (
    pd.read_csv("Lexicons/Wikipedia_Lexicon")[["Lexicon"]]
    .assign(Lexicon=lambda frame: frame["Lexicon"].str.lower())
    .drop_duplicates()
    .reset_index(drop=True)
)
list(Wikipedia_Lexicon["Lexicon"])

['100,000-year problem',
 'adaptation',
 'additionality',
 'albedo',
 'anoxic event',
 'antarctic bottom water',
 'antarctic oscillation',
 'antarctica cooling controversy',
 'anthropogenic',
 'anthropogenic climate change',
 'anthropogenic global warming',
 'anti-greenhouse effect',
 'arctic amplification',
 'arctic dipole anomaly',
 'arctic oscillation',
 'arctic shrinkage',
 'argo',
 'atlantic multidecadal oscillation',
 'atmospheric sciences',
 'atmospheric window',
 'attribution of recent climate change',
 'biofuels',
 'biomass',
 'blytt–sernander system',
 'bond event',
 'bunker fuels',
 'callendar effect',
 'cap and trade',
 'capacity building',
 'carbon cycle',
 'carbon diet',
 'carbon dioxide',
 'carbon footprint',
 'carbon offset',
 'carbon sequestration',
 'carbon sink',
 'carbon tax',
 'cartagena dialogue',
 'clathrate gun hypothesis',
 'climate',
 'climate change',
 'climate change denial',
 'climate change feedback',
 'climate change mitigation',
 'climate commitment',
 '

In [17]:
# Load the Global Change lexicon as a single lower-cased "Lexicon" column.
# Only the term column is kept so that de-duplication here, and after
# concatenation with other lexicons, compares terms rather than whole CSV rows.
# Duplicates are dropped after lower-casing so case variants collapse to one.
Global_Change_Lexicon = (
    pd.read_csv("Lexicons/Global_Change_Lexicon")[["Lexicon"]]
    .assign(Lexicon=lambda frame: frame["Lexicon"].str.lower())
    .drop_duplicates()
    .reset_index(drop=True)
)
list(Global_Change_Lexicon["Lexicon"])

['100-year flood',
 'emissions scenario',
 'adaptation',
 'adaptation science',
 'adaptive capacity',
 'adaptive management',
 'aerosol',
 'algal bloom',
 'assisted migration',
 'biodiversity',
 'bioenergy',
 'biofuel',
 'biogeochemical cycles',
 'biomass',
 'black carbon',
 'carbon capture and storage',
 'carbon cycle',
 'carbon sequestration',
 'climate change',
 'climate change refugia',
 'climate variability',
 'cold wave',
 'dispersal',
 'drought',
 'dry spell',
 'ecological corridor',
 'ecosystem',
 'ecosystem services',
 'el niño-southern oscillation',
 'emergent properties',
 'emissions scenarios',
 'energy systems',
 'environmental justice',
 'evapotranspiration',
 'evolution',
 'extreme events',
 'extreme precipitation',
 'feedback',
 'fitness',
 'food security',
 'forcing',
 'frontline communities',
 'frost-free season',
 'genetic rescue',
 'geoengineering',
 'global change',
 'global climate models',
 'global warming',
 'greenhouse gases',
 'heat stress',
 'heat wave',
 'he

In [18]:
# Load the BBC lexicon as a single lower-cased "Lexicon" column.
# Duplicates are dropped after lower-casing and on the term column only, so
# entries that differ just by case (or by other CSV columns) collapse to one.
BBC_Lexicon = (
    pd.read_csv("Lexicons/BBC_Lexicon")[["Lexicon"]]
    .assign(Lexicon=lambda frame: frame["Lexicon"].str.lower())
    .drop_duplicates()
    .reset_index(drop=True)
)
list(BBC_Lexicon["Lexicon"])

['adaptation',
 'adaptation fund',
 'annex i countries',
 'annex ii',
 'anthropogenic climate change',
 'aosis',
 'atmospheric aerosols',
 'bali action plan',
 'bali roadmap',
 'baseline for cuts',
 'biofuel',
 'black carbon',
 'business as usual',
 'cap and trade',
 'carbon capture and storage',
 'carbon dioxide',
 'carbon dioxide  equivalent',
 'carbon footprint',
 'carbon intensity',
 'carbon leakage',
 'carbon neutral',
 'carbon offsetting',
 'carbon sequestration',
 'carbon sink',
 'certified emission reduction',
 'clean coal technology',
 'clean development mechanism',
 'climate change',
 'country in transition',
 'dangerous climate change',
 'deforestation',
 'emission trading scheme',
 'eu burden-sharing agreement',
 'feedback loop',
 'flexible mechanism',
 'fossil fuels',
 'geological sequestration',
 'global average temperature',
 'global energy budget',
 'global dimming',
 'global warming',
 'global warming potential',
 'greenhouse gases',
 'greenhouse effect',
 'hockey stic

In [19]:
# Persist the de-duplicated union of the EPA and BBC lexicons.
epa_bbc_lexicon = pd.concat([EPA_Lexicon, BBC_Lexicon]).drop_duplicates().reset_index(drop = True)
epa_bbc_lexicon.to_csv("Lexicons/EPA_BBC_Lexicon")

In [20]:
# Pairwise overlap between lexicons as a share: the cell in row r / column c is
# |terms(r) & terms(c)| / len(lexicon c), i.e. the fraction of the COLUMN
# lexicon that also appears in the ROW lexicon.
# The columns are built directly as floats; writing floats cell-by-cell into
# integer-initialised columns relies on a dtype upcast that pandas deprecates.
lexicon_names = ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"]
dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]
term_sets = [set(frame['Lexicon']) for frame in dfs]

common_words_df = pd.DataFrame({"Lexicon" : lexicon_names})
for c, column_name in enumerate(lexicon_names):
    common_words_df[column_name] = [
        len(row_terms & term_sets[c]) / len(dfs[c]['Lexicon']) for row_terms in term_sets
    ]

common_words_df

Unnamed: 0,Lexicon,Global Change,IPCC,Wikipedia,EPA,BBC,UNDP
0,Global Change,1.0,0.110787,0.073171,0.130719,0.177419,0.152174
1,IPCC,0.361905,1.0,0.170732,0.261438,0.306452,0.391304
2,Wikipedia,0.114286,0.081633,1.0,0.215686,0.241935,0.217391
3,EPA,0.190476,0.116618,0.20122,1.0,0.225806,0.282609
4,BBC,0.104762,0.055394,0.091463,0.091503,1.0,0.217391
5,UNDP,0.066667,0.052478,0.060976,0.084967,0.16129,1.0


In [21]:
# Pairwise overlap between lexicons as a count: the cell in row r / column c is
# the number of terms that lexicon r and lexicon c have in common.
lexicon_names = ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"]
dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]
term_sets = [set(frame['Lexicon']) for frame in dfs]

common_words_df = pd.DataFrame({"Lexicon" : lexicon_names})
for c, column_name in enumerate(lexicon_names):
    common_words_df[column_name] = [len(row_terms & term_sets[c]) for row_terms in term_sets]

common_words_df

Unnamed: 0,Lexicon,Global Change,IPCC,Wikipedia,EPA,BBC,UNDP
0,Global Change,105,38,12,20,11,7
1,IPCC,38,343,28,40,19,18
2,Wikipedia,12,28,164,33,15,10
3,EPA,20,40,33,153,14,13
4,BBC,11,19,15,14,62,10
5,UNDP,7,18,10,13,10,46


In [22]:
# Per lexicon: how many of its terms also occur in at least one other lexicon
# ("Non unique words"), how many occur nowhere else ("unique words"), and the
# share of unique terms ("Richness").
dfs = [Global_Change_Lexicon, IPCC_Lexicon, Wikipedia_Lexicon, EPA_Lexicon, BBC_Lexicon, UNDP_Lexicon]
term_sets = [set(frame['Lexicon']) for frame in dfs]

non_unique_words = []
unique_words = []
total_words = []
for r, frame in enumerate(dfs):
    # Union of every other lexicon's terms, then keep what lexicon r shares with it.
    terms_elsewhere = set().union(*(terms for c, terms in enumerate(term_sets) if c != r))
    shared_terms = term_sets[r] & terms_elsewhere
    lexicon_size = len(frame["Lexicon"])
    total_words.append(lexicon_size)
    unique_words.append(lexicon_size - len(shared_terms))
    non_unique_words.append(len(shared_terms))

unique_words_df = pd.DataFrame({"Lexicon" : ["Global Change", "IPCC", "Wikipedia", "EPA", "BBC", "UNDP"],
                                "Non unique words" : non_unique_words, "unique words" : unique_words,
                                "total_words" : total_words})

unique_words_df["Richness"] = unique_words_df["unique words"] / unique_words_df["total_words"]

unique_words_df

Unnamed: 0,Lexicon,Non unique words,unique words,total_words,Richness
0,Global Change,45,60,105,0.571429
1,IPCC,80,263,343,0.766764
2,Wikipedia,47,117,164,0.713415
3,EPA,59,94,153,0.614379
4,BBC,30,32,62,0.516129
5,UNDP,25,21,46,0.456522


# 3. Classification Models

## 3.1. Lexicons

### 3.1.1. Train Lexicons

In [26]:
# Working copy of the training set with the text normalised for lexicon matching.
Text_df = df_train.assign(Text=df_train["Text"].apply(preprocess_text))

In [None]:
start = time.time()

names = ["IPCC", "Global_Change", "UNDP", "EPA", "Wikipedia", "BBC"]
Lexicons = [IPCC_Lexicon, Global_Change_Lexicon, UNDP_Lexicon, EPA_Lexicon, Wikipedia_Lexicon, BBC_Lexicon]

# Enumerate every non-empty subset of the six lexicons. Names and frames are
# paired before combining, so a combination's label cannot drift away from its
# contents.
All_names = []
All_Lexicons = []
named_lexicons = list(zip(names, Lexicons))
for r in range(1, len(named_lexicons) + 1):
    for combination in combinations(named_lexicons, r):
        All_names.append("_".join(name for name, _ in combination))
        # De-duplicate on the term itself so a term shared by several lexicons
        # is counted once, even if a source frame carries extra columns.
        combined_df = (
            pd.concat([frame for _, frame in combination], axis=0)
            .drop_duplicates(subset="Lexicon")
            .reset_index()
        )
        All_Lexicons.append(combined_df)

start_results = time.time()

print("--- %s seconds ---" % (start_results - start))

# Fit the thresholds for every combination; collect the per-combination frames
# and concatenate once instead of growing a DataFrame inside the loop.
results_df = pd.concat(
    [find_optimal_threshold(Text_df, lexicon, name) for name, lexicon in zip(All_names, All_Lexicons)]
).reset_index(drop = True)

print("--- %s seconds ---" % (time.time() - start_results))

In [None]:
# Training results, best accuracy first.
results_df.sort_values(by = "Accuracy", ascending = False)

In [None]:
# Forward slash: portable across operating systems and free of the invalid "\L" escape.
results_df.to_parquet("Classification_Results/Lexicon_Tagging_Train_Results.parquet")

### 3.1.2. Test Lexicons

In [27]:
# Working copy of the test set with the text normalised for lexicon matching.
Text_df_test = df_test.assign(Text=df_test["Text"].apply(preprocess_text))

In [None]:
start = time.time()

names = ["IPCC", "Global_Change", "UNDP", "EPA", "Wikipedia", "BBC"]
Lexicons = [IPCC_Lexicon, Global_Change_Lexicon, UNDP_Lexicon, EPA_Lexicon, Wikipedia_Lexicon, BBC_Lexicon]

# Enumerate every non-empty subset of the six lexicons. Names and frames are
# paired before combining, so a combination's label cannot drift away from its
# contents.
All_names = []
All_Lexicons = []
named_lexicons = list(zip(names, Lexicons))
for r in range(1, len(named_lexicons) + 1):
    for combination in combinations(named_lexicons, r):
        All_names.append("_".join(name for name, _ in combination))
        # De-duplicate on the term itself so a term shared by several lexicons
        # is counted once, even if a source frame carries extra columns.
        combined_df = (
            pd.concat([frame for _, frame in combination], axis=0)
            .drop_duplicates(subset="Lexicon")
            .reset_index()
        )
        All_Lexicons.append(combined_df)

start_results = time.time()

print("--- %s seconds ---" % (start_results - start))

# Evaluate every combination on the test set with the thresholds fitted on the
# training set (the rows of results_df carrying the same combination name).
test_result_df = pd.concat([
    test_lexicon(Text_df_test, results_df[results_df["Lexicon"] == name], lexicon, name)
    for name, lexicon in zip(All_names, All_Lexicons)
])

print("--- %s seconds ---" % (time.time() - start_results))

In [None]:
# Test results, best accuracy first.
test_result_df.sort_values(by = "Test Accuracy", ascending = False)

In [None]:
# Forward slash: portable across operating systems and free of the invalid "\L" escape.
test_result_df.to_parquet("Classification_Results/Lexicon_Tagging_Test_Results.parquet")

## 3.2. BERT

In [None]:
# Separate copy of the raw (un-preprocessed) test set; the classification helpers add columns in place.
bert_test_df = df_test.copy()

In [None]:
# Evaluate both ClimateBERT checkpoints on the same test frame (inputs truncated to 512 tokens);
# each checkpoint name doubles as the label in the "Model" column.
bert_model_names = [
    "climatebert/environmental-claims",
    "climatebert/distilroberta-base-climate-detector",
]
bert1, bert2 = [
    get_metrics_df_hugging_face(bert_test_df, "Text", bert_model_name, 512, bert_model_name)
    for bert_model_name in bert_model_names
]
bert_df = pd.concat([bert1, bert2]).reset_index(drop = True)

In [None]:
# Metrics of both transformer models on the test set.
bert_df

In [None]:
# Forward slash: portable across operating systems and free of the invalid "\B" escape.
bert_df.to_parquet("Classification_Results/Bert_Tagging_Test_Results.parquet")

# 4. Comparing Results

In [23]:
# Reload the saved results. Forward slashes keep the paths portable and avoid
# the invalid "\L" / "\B" escape sequences of backslash-separated literals.
Lexicon_train_result_df = pd.read_parquet("Classification_Results/Lexicon_Tagging_Train_Results.parquet")
Lexicon_test_result_df = pd.read_parquet("Classification_Results/Lexicon_Tagging_Test_Results.parquet")
Bert_test_result_df = pd.read_parquet("Classification_Results/Bert_Tagging_Test_Results.parquet")

In [72]:
# Ten best lexicon/technique combinations on the training set, ranked by accuracy.
(
    Lexicon_train_result_df
    .sort_values(by = "Accuracy", ascending = False)
    .head(10)
    .reset_index(drop = True)
)

Unnamed: 0,Lexicon,Technique,Threshold,Accuracy,Precision,Recall,F1 Score
0,EPA_BBC,Absolute Frequency,7.0,0.928571,0.9,0.739726,0.81203
1,EPA_Wikipedia_BBC,Absolute Frequency,7.0,0.928571,0.9,0.739726,0.81203
2,EPA_BBC,Relative Frequency,0.5,0.925714,0.790123,0.876712,0.831169
3,UNDP_EPA_Wikipedia,Relative Frequency,0.6,0.925714,0.813333,0.835616,0.824324
4,EPA,Absolute Frequency,6.0,0.925714,0.861538,0.767123,0.811594
5,UNDP_EPA,Absolute Frequency,6.0,0.925714,0.861538,0.767123,0.811594
6,UNDP_EPA_BBC,Relative Frequency,0.5,0.925714,0.783133,0.890411,0.833333
7,UNDP_EPA,Relative Frequency,0.6,0.925714,0.821918,0.821918,0.821918
8,UNDP_EPA_BBC,Absolute Frequency,6.0,0.925714,0.850746,0.780822,0.814286
9,EPA_Wikipedia,Absolute Frequency,7.0,0.925714,0.898305,0.726027,0.80303


In [73]:
# Ten best combinations on the test set, ranked by precision (ties broken by accuracy).
shown_columns = ["Lexicon", "Technique", "Threshold", "Test Precision", "Test Accuracy", "Test Recall"]
(
    Lexicon_test_result_df
    .sort_values(by = ["Test Precision", "Test Accuracy"], ascending = False)
    .head(10)
    .reset_index(drop = True)
    [shown_columns]
)

Unnamed: 0,Lexicon,Technique,Threshold,Test Precision,Test Accuracy,Test Recall
0,Wikipedia,Absolute Frequency,7.0,0.944444,0.933333,0.653846
1,EPA_BBC,Absolute Frequency,7.0,0.92,0.966667,0.884615
2,UNDP_Wikipedia,Absolute Frequency,7.0,0.909091,0.946667,0.769231
3,Wikipedia_BBC,Absolute Frequency,7.0,0.909091,0.946667,0.769231
4,UNDP_Wikipedia_BBC,Absolute Frequency,7.0,0.909091,0.946667,0.769231
5,UNDP_Wikipedia_BBC,Relative Present,0.4,0.9,0.933333,0.692308
6,BBC,Relative Present,0.2,0.894737,0.926667,0.653846
7,Wikipedia_BBC,Relative Present,0.4,0.894737,0.926667,0.653846
8,Global_Change_UNDP_Wikipedia,Relative Frequency,1.1,0.894737,0.926667,0.653846
9,EPA_Wikipedia_BBC,Absolute Frequency,7.0,0.884615,0.96,0.884615


In [74]:
# Ten test-set configurations ranked by accuracy, ties broken by precision.
(
    Lexicon_test_result_df
    .sort_values(by = ["Test Accuracy", "Test Precision"], ascending = False)
    .head(10)
    .reset_index(drop = True)
    [["Lexicon", "Technique", "Threshold", "Test Accuracy", "Test Precision", "Test Recall"]]
)

Unnamed: 0,Lexicon,Technique,Threshold,Test Accuracy,Test Precision,Test Recall
0,EPA_BBC,Absolute Frequency,7.0,0.966667,0.92,0.884615
1,EPA_Wikipedia_BBC,Absolute Frequency,7.0,0.96,0.884615,0.884615
2,UNDP_EPA_Wikipedia_BBC,Absolute Frequency,7.0,0.96,0.884615,0.884615
3,EPA,Absolute Frequency,6.0,0.96,0.857143,0.923077
4,UNDP_EPA,Absolute Frequency,6.0,0.96,0.857143,0.923077
5,UNDP_EPA_Wikipedia,Absolute Frequency,6.0,0.96,0.857143,0.923077
6,UNDP_EPA_BBC,Absolute Frequency,6.0,0.96,0.833333,0.961538
7,EPA_Wikipedia,Absolute Frequency,7.0,0.953333,0.88,0.846154
8,BBC,Relative Frequency,0.2,0.953333,0.827586,0.923077
9,UNDP_Wikipedia,Absolute Frequency,7.0,0.946667,0.909091,0.769231


In [75]:
# Ten test-set configurations ranked by F1, then accuracy, then precision.
(
    Lexicon_test_result_df
    .sort_values(by = ["Test F1 Score", "Test Accuracy", "Test Precision"], ascending = False)
    .head(10)
    .reset_index(drop = True)
    [["Lexicon", "Technique", "Threshold", "Test Accuracy", "Test Precision", "Test Recall", "Test F1 Score"]]
)

Unnamed: 0,Lexicon,Technique,Threshold,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,EPA_BBC,Absolute Frequency,7.0,0.966667,0.92,0.884615,0.901961
1,UNDP_EPA_BBC,Absolute Frequency,6.0,0.96,0.833333,0.961538,0.892857
2,EPA,Absolute Frequency,6.0,0.96,0.857143,0.923077,0.888889
3,UNDP_EPA,Absolute Frequency,6.0,0.96,0.857143,0.923077,0.888889
4,UNDP_EPA_Wikipedia,Absolute Frequency,6.0,0.96,0.857143,0.923077,0.888889
5,EPA_Wikipedia_BBC,Absolute Frequency,7.0,0.96,0.884615,0.884615,0.884615
6,UNDP_EPA_Wikipedia_BBC,Absolute Frequency,7.0,0.96,0.884615,0.884615,0.884615
7,BBC,Relative Frequency,0.2,0.953333,0.827586,0.923077,0.872727
8,EPA_Wikipedia,Absolute Frequency,7.0,0.953333,0.88,0.846154,0.862745
9,EPA,Relative Frequency,0.5,0.946667,0.78125,0.961538,0.862069


In [None]:
# Names of the six single-source lexicons (as opposed to the "A_B_C" combinations).
Lexicons_names = [
    "BBC",
    "EPA",
    "Wikipedia",
    "UNDP",
    "IPCC",
    "Global_Change",
]
# Keep only the test results whose lexicon is one of those single sources.
Solo_Lexicons_df = Lexicon_test_result_df.loc[
    Lexicon_test_result_df["Lexicon"].isin(Lexicons_names)
]

In [63]:
# Best test accuracy reached by each single lexicon, highest first.
df = (
    Solo_Lexicons_df.groupby("Lexicon")["Test Accuracy"]
    .max()
    .to_frame()
    .sort_values(by = "Test Accuracy", ascending = False)
    .reset_index("Lexicon")
)

In [65]:
# Boolean mask: True for every row whose accuracy equals the best accuracy of
# its lexicon (ties are all kept).
# The aggregation is named by the string "max" instead of passing the builtin
# `max`: pandas 2.1+ emits a FutureWarning for builtin callables and will stop
# mapping them to the optimised groupby implementation.
idx = Solo_Lexicons_df.groupby(["Lexicon"])['Test Accuracy'].transform("max") == Solo_Lexicons_df['Test Accuracy']

In [70]:
# Show the best-accuracy row(s) of every single lexicon, without the F1 column.
(
    Solo_Lexicons_df.loc[idx]
    .sort_values(by = "Test Accuracy", ascending = False)
    .drop(columns = "Test F1 Score")
)

Unnamed: 0,Lexicon,Technique,Threshold,Test Accuracy,Test Precision,Test Recall
12,EPA,Absolute Frequency,6.0,0.96,0.857143,0.923077
22,BBC,Relative Frequency,0.2,0.953333,0.827586,0.923077
8,UNDP,Absolute Frequency,4.0,0.94,0.774194,0.923077
18,Wikipedia,Relative Frequency,0.5,0.94,0.774194,0.923077
2,IPCC,Relative Frequency,1.3,0.92,0.75,0.807692
4,Global_Change,Absolute Frequency,5.0,0.853333,0.583333,0.538462
7,Global_Change,Relative Present,0.5,0.853333,0.7,0.269231


In [57]:
# Renumber the rows from 0, discarding the labels inherited from Lexicon_test_result_df.
# NOTE(review): `idx` was built against the old row labels, so it no longer
# aligns with this frame once the cell has run — recompute `idx` before reusing it.
Solo_Lexicons_df = Solo_Lexicons_df.reset_index(drop = True)

In [None]:
# Highest test accuracy achieved by each scoring technique.
(
    Lexicon_test_result_df
    .groupby("Technique")["Test Accuracy"]
    .max()
)

In [76]:
# Count how often each lexicon term occurs across the whole test set.
# All six lexicons are processed identically, so a single loop replaces six
# copy-pasted statement pairs. The call order is unchanged.
_words_used_all = {}
for _name, _lexicon in [
    ("EPA", EPA_Lexicon),
    ("BBC", BBC_Lexicon),
    ("UNDP", UNDP_Lexicon),
    ("Wikipedia", Wikipedia_Lexicon),
    ("IPCC", IPCC_Lexicon),
    ("Global_Change", Global_Change_Lexicon),
]:
    # The dict-like result becomes a two-column frame: Word, Count.
    data = get_most_words_used(Text_df_test, _lexicon)
    _words_used_all[_name] = (
        pd.DataFrame.from_dict(data, orient='index', columns=['Count'])
        .reset_index()
        .rename(columns={'index': 'Word'})
    )

# Keep the per-lexicon names that the following cells rely on.
EPA_words_used = _words_used_all["EPA"]
BBC_words_used = _words_used_all["BBC"]
UNDP_words_used = _words_used_all["UNDP"]
Wikipedia_words_used = _words_used_all["Wikipedia"]
IPCC_words_used = _words_used_all["IPCC"]
Global_Change_words_used = _words_used_all["Global_Change"]

100%|███████████████████████████████████████████████████████████████████████████████| 150/150 [00:01<00:00, 139.78it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 371.36it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 495.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 150/150 [00:01<00:00, 138.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:02<00:00, 71.40it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 227.32it/s]


In [77]:
# Ten most frequent terms per lexicon over the whole test set.
# One loop replaces six copy-pasted statement pairs; the first column of each
# frame is renamed to the lexicon label so the side-by-side table is readable.
_top_all = {}
for _label, _counts in [
    ("EPA", EPA_words_used),
    ("BBC", BBC_words_used),
    ("Wikipedia", Wikipedia_words_used),
    ("UNDP", UNDP_words_used),
    ("IPCC", IPCC_words_used),
    ("Global Change", Global_Change_words_used),
]:
    _top = _counts.sort_values("Count", ascending = False).head(10).reset_index(drop = True)
    _top.columns = [_label, "Count"]
    _top_all[_label] = _top

# Keep the original per-lexicon names.
EPA_top = _top_all["EPA"]
BBC_top = _top_all["BBC"]
Wikipedia_top = _top_all["Wikipedia"]
UNDP_top = _top_all["UNDP"]
IPCC_top = _top_all["IPCC"]
Global_Change_top = _top_all["Global Change"]

# Place the six (term, Count) pairs next to each other, aligned on rank 0-9.
top_df = pd.concat([EPA_top, BBC_top, Wikipedia_top, UNDP_top, IPCC_top, Global_Change_top], axis = 1)

In [78]:
# Display the per-lexicon top-10 terms for the whole test set.
top_df

Unnamed: 0,EPA,Count,BBC,Count.1,Wikipedia,Count.2,UNDP,Count.3,IPCC,Count.4,Global Change,Count.5
0,climate,62,climate change,59,climate,62,climate,62,climate,62,climate change,59
1,climate change,59,fossil fuels,7,climate change,59,climate change,59,climate change,59,risk,38
2,emissions,13,global warming,7,weather,10,weather,10,risk,38,value,26
3,weather,10,renewable energy,5,fossil fuel,8,global warming,7,agreement,30,evolution,13
4,fossil fuel,8,methane,3,global warming,7,renewable energy,5,social,28,uncertainty,12
5,global warming,7,greenhouse gases,2,greenhouse gas,5,transparency,3,region,24,global warming,7
6,atmosphere,6,mitigation,2,ozone,4,adaptation,2,model,24,feedback,5
7,renewable energy,5,adaptation,2,argo,4,mitigation,2,policies,23,scenario,5
8,greenhouse gas,5,carbon dioxide,2,methane,3,climate crisis,2,institution,17,sink,4
9,ozone,4,hockey stick,1,climate crisis,2,decarbonization,2,evidence,16,ozone,4


In [79]:
# Count lexicon term occurrences in the test articles labelled Target == "Yes".
# The class filter is evaluated once instead of six times, and a single loop
# replaces six copy-pasted statement pairs. The call order is unchanged.
_test_yes = Text_df_test[Text_df_test["Target"] == "Yes"]

_words_used_yes = {}
for _name, _lexicon in [
    ("EPA", EPA_Lexicon),
    ("BBC", BBC_Lexicon),
    ("UNDP", UNDP_Lexicon),
    ("Wikipedia", Wikipedia_Lexicon),
    ("IPCC", IPCC_Lexicon),
    ("Global_Change", Global_Change_Lexicon),
]:
    # The dict-like result becomes a two-column frame: Word, Count.
    data = get_most_words_used(_test_yes, _lexicon)
    _words_used_yes[_name] = (
        pd.DataFrame.from_dict(data, orient='index', columns=['Count'])
        .reset_index()
        .rename(columns={'index': 'Word'})
    )

# Keep the per-lexicon names that the following cells rely on.
EPA_words_used_Yes = _words_used_yes["EPA"]
BBC_words_used_Yes = _words_used_yes["BBC"]
UNDP_words_used_Yes = _words_used_yes["UNDP"]
Wikipedia_words_used_Yes = _words_used_yes["Wikipedia"]
IPCC_words_used_Yes = _words_used_yes["IPCC"]
Global_Change_words_used_Yes = _words_used_yes["Global_Change"]

100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 156.07it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 397.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 579.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 152.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 79.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 248.38it/s]


In [80]:
# Ten most frequent terms per lexicon within the Target == "Yes" articles.
# One loop replaces six copy-pasted statement pairs; the first column of each
# frame is renamed to the lexicon label.
_top_yes = {}
for _label, _counts in [
    ("EPA", EPA_words_used_Yes),
    ("BBC", BBC_words_used_Yes),
    ("Wikipedia", Wikipedia_words_used_Yes),
    ("UNDP", UNDP_words_used_Yes),
    ("IPCC", IPCC_words_used_Yes),
    ("Global Change", Global_Change_words_used_Yes),
]:
    _top = _counts.sort_values("Count", ascending = False).head(10).reset_index(drop = True)
    _top.columns = [_label, "Count"]
    _top_yes[_label] = _top

# Keep the original per-lexicon names.
EPA_top_Yes = _top_yes["EPA"]
BBC_top_Yes = _top_yes["BBC"]
Wikipedia_top_Yes = _top_yes["Wikipedia"]
UNDP_top_Yes = _top_yes["UNDP"]
IPCC_top_Yes = _top_yes["IPCC"]
Global_Change_top_Yes = _top_yes["Global Change"]

# Place the six (term, Count) pairs next to each other, aligned on rank 0-9.
top_df_Yes = pd.concat([EPA_top_Yes, BBC_top_Yes, Wikipedia_top_Yes, UNDP_top_Yes, IPCC_top_Yes, Global_Change_top_Yes], axis = 1)

In [81]:
# Display the per-lexicon top-10 terms for the Target == "Yes" articles.
top_df_Yes

Unnamed: 0,EPA,Count,BBC,Count.1,Wikipedia,Count.2,UNDP,Count.3,IPCC,Count.4,Global Change,Count.5
0,climate,25,climate change,25,climate,25,climate,25,climate change,25,climate change,25
1,climate change,25,fossil fuels,6,climate change,25,climate change,25,climate,25,risk,10
2,emissions,13,global warming,6,fossil fuel,7,global warming,6,agreement,10,value,8
3,fossil fuel,7,renewable energy,4,global warming,6,weather,6,risk,10,global warming,6
4,global warming,6,methane,3,weather,6,renewable energy,4,region,8,scenario,4
5,weather,6,greenhouse gases,2,greenhouse gas,5,greenhouse gases,2,policies,7,uncertainty,3
6,greenhouse gas,5,mitigation,2,methane,3,decarbonization,2,fossil fuels,6,evolution,3
7,renewable energy,4,carbon dioxide,2,argo,2,mitigation,2,model,6,greenhouse gases,2
8,natural gas,4,hockey stick,1,ozone,2,paris agreement,2,global warming,6,mitigation,2
9,atmosphere,4,carbon neutral,1,carbon dioxide,2,tipping point,1,greenhouse gas,5,ozone,2


In [82]:
# Count lexicon term occurrences in the test articles labelled Target == "No".
# The class filter is evaluated once instead of six times, and a single loop
# replaces six copy-pasted statement pairs. The call order is unchanged.
_test_no = Text_df_test[Text_df_test["Target"] == "No"]

_words_used_no = {}
for _name, _lexicon in [
    ("EPA", EPA_Lexicon),
    ("BBC", BBC_Lexicon),
    ("UNDP", UNDP_Lexicon),
    ("Wikipedia", Wikipedia_Lexicon),
    ("IPCC", IPCC_Lexicon),
    ("Global_Change", Global_Change_Lexicon),
]:
    # The dict-like result becomes a two-column frame: Word, Count.
    data = get_most_words_used(_test_no, _lexicon)
    _words_used_no[_name] = (
        pd.DataFrame.from_dict(data, orient='index', columns=['Count'])
        .reset_index()
        .rename(columns={'index': 'Word'})
    )

# Keep the per-lexicon names that the following cells rely on.
EPA_words_used_No = _words_used_no["EPA"]
BBC_words_used_No = _words_used_no["BBC"]
UNDP_words_used_No = _words_used_no["UNDP"]
Wikipedia_words_used_No = _words_used_no["Wikipedia"]
IPCC_words_used_No = _words_used_no["IPCC"]
Global_Change_words_used_No = _words_used_no["Global_Change"]

100%|███████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 150.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 371.74it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 500.52it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 143.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:01<00:00, 70.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 227.80it/s]


In [83]:
# Ten most frequent terms per lexicon within the Target == "No" articles.
# One loop replaces six copy-pasted statement pairs; the first column of each
# frame is renamed to the lexicon label. A lexicon with fewer than ten matched
# terms yields a shorter frame, which concat pads with NaN.
_top_no = {}
for _label, _counts in [
    ("EPA", EPA_words_used_No),
    ("BBC", BBC_words_used_No),
    ("Wikipedia", Wikipedia_words_used_No),
    ("UNDP", UNDP_words_used_No),
    ("IPCC", IPCC_words_used_No),
    ("Global Change", Global_Change_words_used_No),
]:
    _top = _counts.sort_values("Count", ascending = False).head(10).reset_index(drop = True)
    _top.columns = [_label, "Count"]
    _top_no[_label] = _top

# Keep the original per-lexicon names.
EPA_top_No = _top_no["EPA"]
BBC_top_No = _top_no["BBC"]
Wikipedia_top_No = _top_no["Wikipedia"]
UNDP_top_No = _top_no["UNDP"]
IPCC_top_No = _top_no["IPCC"]
Global_Change_top_No = _top_no["Global Change"]

# Place the six (term, Count) pairs next to each other, aligned on rank 0-9.
top_df_No = pd.concat([EPA_top_No, BBC_top_No, Wikipedia_top_No, UNDP_top_No, IPCC_top_No, Global_Change_top_No], axis = 1)

In [84]:
# Display the per-lexicon top-10 terms for the Target == "No" articles.
top_df_No

Unnamed: 0,EPA,Count,BBC,Count.1,Wikipedia,Count.2,UNDP,Count.3,IPCC,Count.4,Global Change,Count.5
0,climate,37,climate change,34.0,climate,37,climate,37,climate,37,climate change,34
1,climate change,34,adaptation,2.0,climate change,34,climate change,34,climate change,34,risk,28
2,weather,4,renewable energy,1.0,weather,4,weather,4,risk,28,value,18
3,concentration,3,feedback loop,1.0,ozone,2,transparency,3,social,24,evolution,10
4,sink,3,global warming,1.0,argo,2,adaptation,2,agreement,20,uncertainty,9
5,ozone,2,fossil fuels,1.0,adaptation,2,resilience,1,model,18,feedback,5
6,vulnerability,2,,,climate crisis,1,climate crisis,1,policies,16,sink,3
7,metric ton,2,,,proxy,1,reforestation,1,region,16,fitness,3
8,adaptation,2,,,carbon tax,1,renewable energy,1,justice,13,ozone,2
9,energy efficiency,2,,,global warming,1,feedback loop,1,evidence,13,vulnerability,2


In [28]:
#IPCC_UNDP_BBC
Lexicon_cross = pd.concat([EPA_Lexicon, BBC_Lexicon]).drop_duplicates()
get_cross_table(Text_df_test, Lexicon_cross, 7, "Absolute Frequency", True)

100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:02<00:00, 51.94it/s]


Estimate,No,Yes,All
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,122,2,124
Yes,3,23,26
All,125,25,150


In [24]:
# Cross table for the EPA lexicon alone (presumably a 0.6 threshold on "Relative Frequency").
# NOTE(review): depends on Text_df_test from an earlier cell. The saved output is
# a NameError, i.e. this was last run in a kernel where that cell had not been
# executed — re-run the notebook top to bottom before trusting this cell.
get_cross_table(Text_df_test, EPA_Lexicon, 0.6, "Relative Frequency", True)

NameError: name 'Text_df_test' is not defined

In [None]:
# Display the BERT test-set metrics for comparison with the lexicon results.
Bert_test_result_df

# 5. Classify Articles

In [None]:
# Load the WP article set.
# NOTE(review): hard-coded, machine-specific absolute path.
WP = pd.read_parquet(
    "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Final/WP_Final_Articles.parquet"
)
# Work on a copy so the raw frame keeps its original text, then normalise the
# Text column for lexicon matching (punctuation stripped, lower-cased,
# whitespace collapsed).
WP_clean = WP.copy()
WP_clean["Text"] = WP["Text"].map(preprocess_text)

In [None]:
# Load the WSJ article set.
# NOTE(review): hard-coded, machine-specific absolute path. Unlike the other
# parquet paths in this notebook, this one has no ".parquet" suffix — confirm it
# points at the intended file or dataset directory.
WSJ = pd.read_parquet(
    "C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Final/WSJ_Final"
)
# Work on a copy so the raw frame keeps its original text, then normalise the
# Text column for lexicon matching (punctuation stripped, lower-cased,
# whitespace collapsed).
WSJ_clean = WSJ.copy()
WSJ_clean["Text"] = WSJ["Text"].map(preprocess_text)

In [None]:
# Combined EPA + BBC lexicon; rows that appear in both sources are kept once.
Lexicon = (
    pd.concat([EPA_Lexicon, BBC_Lexicon])
    .drop_duplicates()
)
# Score every pre-processed WSJ article against the combined lexicon.
test = count_lexicon_words(
    WSJ_clean,
    Lexicon,
)

In [None]:
# Label an article "Yes" when it contains at least 7 lexicon hits, otherwise "No".
# 7 is the Absolute Frequency threshold of the EPA_BBC row in the results tables.
test["Climate"] = (test["Absolute Frequency"] >= 7).map({True: "Yes", False: "No"})

In [None]:
# Inspect the WSJ frame before its Text column is re-formatted.
WSJ

In [None]:
# Collapse whitespace and pad every punctuation mark with a space on each side.
# NOTE: this overwrites WSJ["Text"] in place and is not idempotent — running
# the cell a second time pads the already-padded punctuation again.
WSJ["Text"] = WSJ["Text"].map(text_cleaning_final)

In [None]:
# Inspect the WSJ frame after the Text column has been re-formatted.
WSJ

In [None]:
# Export the 2020 WSJ articles, renumbered from 0, to their own parquet file.
# NOTE(review): hard-coded, machine-specific absolute path.
(
    WSJ.loc[WSJ["Year"] == 2020]
    .reset_index(drop = True)
    .to_parquet("C:/Users/Boedt/OneDrive/Bureaublad/Scraped_Articles/Final/WSJ_Final_2020.parquet")
)

In [9]:
# Article length as the number of pieces obtained by splitting on single spaces.
tag_climate_df["Length"] = (
    tag_climate_df["Text"]
    .str.split(" ")
    .str.len()
)

In [78]:
# Articles labelled "High" on the climate-change level, longest first, with the
# Length column recomputed from the Text of the selected rows.
data = (
    tag_climate_df.loc[tag_climate_df["Final_Climate_Change_Level_Label"] == "High"]
    .sort_values("Length", ascending = False)
    .reset_index(drop = True)
    .assign(Length = lambda frame: frame["Text"].str.split(" ").str.len())
)

In [79]:
# Collapse runs of whitespace (including newlines) in every article to single spaces.
# NOTE(review): Length was computed before this clean-up, so it may count the
# empty pieces between consecutive spaces — recompute it if exact counts matter.
data["Text"] = data["Text"].map(text_cleaning_append)
data

Unnamed: 0,Text,Link,Final_Climate_Change_Level_Label,Final_Sentiment_Label,Target,Length
0,A Democratic presidential-campaign proposal to...,https://www.wsj.com/articles/what-would-happen...,High,-1,Yes,3270
1,"This year, in case you hadn’t heard, marks the...",https://www.washingtonpost.com/news/energy-env...,High,-1,Yes,2198
2,Charles Koch had been talking for more than an...,https://www.washingtonpost.com/news/energy-env...,High,-1,Yes,1821
3,Biden would reverse 100 Trump administration p...,https://www.washingtonpost.com/politics/biden-...,High,1,Yes,1777
4,It’s no surprise that populations living in th...,https://www.washingtonpost.com/news/energy-env...,High,-1,Yes,1525
5,Dozens of surveys and studies have attempted t...,https://www.washingtonpost.com/news/energy-env...,High,0,Yes,1446
6,So much about the planet’s future will depend ...,https://www.washingtonpost.com/news/energy-env...,High,-1,Yes,1439
7,The drought-stricken Colorado River Basin has ...,https://www.washingtonpost.com/news/energy-env...,High,-1,Yes,1360
8,This story has been updated. New York City has...,https://www.washingtonpost.com/news/energy-env...,High,-1,Yes,1352
9,2016 is the 200th anniversary of the so-called...,https://www.washingtonpost.com/news/capital-we...,High,-1,Yes,1295


In [89]:
# Read the full text of the article at row label 25.
data.at[25, "Text"]

'Galileo Galilei was tried in 1633 for spreading the heretical view that the Earth orbits the sun, convicted by the Roman Catholic Inquisition, and remained under house arrest until his death. Today’s inquisitors seek their quarry’s imprisonment and financial ruin. As the scientific case for a climate-change catastrophe wanes, proponents of big-ticket climate policies are increasingly focused on punishing dissent from an asserted “consensus” view that the only way to address global warming is to restructure society—how it harnesses and uses energy. That we might muddle through a couple degrees’ of global warming over decades or even centuries, without any major disruption, is the new heresy and must be suppressed. The Climate Inquisition began with Michael Mann’s 2012 lawsuit against critics of his “hockey stick” research—a holy text to climate alarmists. The suggestion that Prof. Mann’s famous diagram showing rapid recent warming was an artifact of his statistical methods, rather than

In [83]:
# Keep only the rows whose link contains "wsj".
data.loc[data["Link"].str.contains("wsj")]

Unnamed: 0,Text,Link,Final_Climate_Change_Level_Label,Final_Sentiment_Label,Target,Length
0,A Democratic presidential-campaign proposal to...,https://www.wsj.com/articles/what-would-happen...,High,-1,Yes,3270
10,"MENDAWAI, Indonesia—Eleven years ago Dharsono ...",https://www.wsj.com/articles/one-mans-money-dr...,High,0,Yes,1266
13,President-elect Joe Biden said Thursday he pla...,https://www.wsj.com/articles/biden-picks-north...,High,1,Yes,1218
19,Coal is clinging to the top spot in power gene...,https://www.wsj.com/articles/why-coals-power-p...,High,-1,Yes,1057
20,"For climate scientists, the pandemic has made ...",https://www.wsj.com/articles/covid-19s-environ...,High,-1,Yes,990
21,"YOKOSUKA, Japan— Shinjiro Koizumi has led a ch...",https://www.wsj.com/articles/prime-minister-ab...,High,-1,Yes,950
22,WASHINGTON—President-elect Joe Biden is nearin...,https://www.wsj.com/articles/biden-closes-in-o...,High,0,Yes,947
25,Galileo Galilei was tried in 1633 for spreadin...,http://www.wsj.com/articles/punishing-climate-...,High,-1,Yes,921
26,"Rapid, far-reaching changes to almost every fa...",https://www.wsj.com/articles/u-n-panel-warns-d...,High,-1,Yes,912
27,WASHINGTON—President Trump would press forward...,https://www.wsj.com/articles/trump-will-roll-b...,High,1,Yes,907
