# 0. Packages and Functions

## 0.1. Packages

In [None]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix

## 0.2. Functions

In [None]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    # remove blank spaces
    text = ' '.join(text.split())
    # remove newline characters
    text = text.replace('\n', '')
    return text

### Absolute Count with Frequency

In [212]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_1(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []

    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        count.append(lexicon_counts)
        
    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_1(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_1(text_df, lexicon, treshold):
    df = count_lexicon_words_1(text_df, lexicon)
    
    return(lexicon_target_classifier_1(df, treshold))

def threshold_metrics_1(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")
        
def get_metrics_df_1(df_text, lexicon, min_treshhold, max_treshhold, Lexicon_name):    
    Accuracy = []
    Precision = []
    Recall = []
    F1_score = []
    Name = []
    Treshhold = []
    
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier_1(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        Accuracy.append(accuracy)
        Precision.append(precision)
        Recall.append(recall)
        F1_score.append(f1_score)
        Name.append(Lexicon_name)
        Treshhold.append(i)
        
    return(pd.DataFrame({"Lexicon" : Name, "Treshhold" : Treshhold, "Accuracy" : Accuracy, "Precision" : Precision, "Recall" : Recall, "F1 Score" : F1_score}))
    


### Relative Count with Frequency

In [None]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words_2(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []
    
    for text in text_df["Text"]:
        lexicon_counts = 0

        for word in lexicon:
            lexicon_counts += text.lower().count(word.lower())
        
        word_list = text.split() 
        word_count = len(word_list)
        count.append((lexicon_counts/word_count)*100)

    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier_2(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier_2(text_df, lexicon, treshold):
    df = count_lexicon_words_2(text_df, lexicon)
    
    return(lexicon_target_classifier_2(df, treshold))

def threshold_metrics_2(df_text, lexicon, min_treshhold, max_treshhold, jump):
    for num in range(int((max_treshhold - min_treshhold) / jump) + 1):
        i = min_treshhold + num * jump
        df = lexicon_climate_classifier_2(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)

        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0, 0] + cross_table.iloc[1, 1]) / cross_table.loc['All', 'All'] if cross_table.shape == (3,3) else cross_table.iloc[1,0] / (cross_table.iloc[1, 0] + cross_table.iloc[0,0]) 
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print(cross_table)
        print("\n")


Accuracy: This metric measures the overall performance of a model. It is defined as the number of correct predictions divided by the total number of predictions. Accuracy is a good metric to use when the classes are roughly balanced, meaning there are about the same number of positive and negative examples in the dataset.

Precision: This metric measures how many of the positive predictions made by a model are actually correct. It is defined as the number of true positives divided by the total number of positive predictions. Precision is a good metric to use when we care more about avoiding false positives than false negatives.

Recall: This metric measures how many of the positive examples in the dataset are correctly predicted by the model. It is defined as the number of true positives divided by the total number of actual positive examples. Recall is a good metric to use when we care more about avoiding false negatives than false positives.

F1 score: This metric is a weighted average of precision and recall, where the weight is determined by the beta parameter. The most common value for beta is 1, which gives equal weight to precision and recall. The F1 score is a good metric to use when we want to balance precision and recall, and when the classes are imbalanced.

# 1. Import Label Dataset

In [None]:
tag_climate_df = pd.read_parquet("Final_Label_Table.parquet")
tag_climate_df.head(5)

In [None]:
#Only keep the required columns
tag_climate_df = tag_climate_df[["Text", "Climate_Change_Topic", "Level_Climate_Change_Topic"]]

In [None]:
#Clean the tabel
tag_climate_df['Level_Climate_Change_Topic'] = tag_climate_df['Level_Climate_Change_Topic'].str.strip()
tag_climate_df[tag_climate_df["Level_Climate_Change_Topic"] == "NA"] = "Na"
tag_climate_df[tag_climate_df["Level_Climate_Change_Topic"] == "0"] = "Na"
tag_climate_df["Target"] = tag_climate_df["Level_Climate_Change_Topic"].apply(lambda x: "Yes" if x in ["High", "Medium"] else "No")

In [None]:
overview_labels_hms = tag_climate_df.groupby("Level_Climate_Change_Topic")["Text"].count().reset_index()

In [None]:
overview_labels_hms

In [None]:
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()

In [None]:
overview_labels

In [None]:
#overview_labels_hms.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels_hms", index = False)
#overview_labels.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels", index = False)

# 2. Taggers

## 2.1. Tagger 1 - Global Change Lexicon

Uitleg

In [None]:
#Store the dataframe in a different one, for the purpose of this lexicon. This way there is no confusion.
tag1_Global_Change = tag_climate_df

In [None]:
#Load the lexicon
Global_Change_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "Global Change", header = None)
Global_Change_Lexicon.columns = ["Lexicon"]

In [None]:
#Clean the text
tag1_Global_Change["Text"] = tag1_Global_Change["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag1_Global_Change, Global_Change_Lexicon, 0, 8)

In [214]:
get_metrics_df_1(tag1_Global_Change, Global_Change_Lexicon, 0, 20, "Global Change")

Unnamed: 0,Lexicon,Treshhold,Accuracy,Precision,Recall,F1 Score
0,Global Change,0,0.512438,1.05102,1.0,1.024876
1,Global Change,1,0.532338,1.095745,0.0,0.0
2,Global Change,2,0.636816,1.517241,5.866667,2.410959
3,Global Change,3,0.671642,2.193548,1.942857,2.060606
4,Global Change,4,0.661692,2.458333,1.340909,1.735294
5,Global Change,5,0.641791,3.214286,0.775862,1.25
6,Global Change,6,0.631841,4.222222,0.584615,1.027027
7,Global Change,7,0.641791,16.5,0.471429,0.916667
8,Global Change,8,0.621891,0.0,0.355263,0.0
9,Global Change,9,0.606965,0.0,0.303797,0.0


## 2.2. Tagger 2 - IPCC Lexicon

Uitleg

In [None]:
#Store the dataframe in a different one, for the purpose of this lexicon. This way there is no confusion.
tag2_IPCC = tag_climate_df

In [None]:
#Load the lexicon
IPCC_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "IPCC", header = None)
IPCC_Lexicon.columns = ["Lexicon"]

IPCC_Lexicon

In [None]:
#Clean the text
tag2_IPCC["Text"] = tag2_IPCC["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag2_IPCC, IPCC_Lexicon, 8, 12)

## 2.3. Tagger 3 - Wikipedia Lexicon

Uitleg

In [None]:
#Store the dataframe in a different one, for the purpose of this lexicon. This way there is no confusion.
tag3_Wikipedia = tag_climate_df

In [None]:
#Load the lexicon
Wikipedia_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)
Wikipedia_Lexicon = pd.DataFrame(Wikipedia_Lexicon[0])
Wikipedia_Lexicon.columns = ["Lexicon"]

Wikipedia_Lexicon

In [None]:
#Clean the text
tag3_Wikipedia["Text"] = tag3_Wikipedia["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag3_Wikipedia, Wikipedia_Lexicon, 3, 5)

In [None]:
threshold_metrics_2(tag3_Wikipedia, Wikipedia_Lexicon, 0, 1, 0.1)

In [215]:
get_metrics_df_1(tag3_Wikipedia, Wikipedia_Lexicon, 0, 20, "Wikipedia")

Unnamed: 0,Lexicon,Treshhold,Accuracy,Precision,Recall,F1 Score
0,Wikipedia,0,0.512438,1.05102,1.0,1.024876
1,Wikipedia,1,0.527363,1.084211,0.0,0.0
2,Wikipedia,2,0.527363,1.084211,0.0,0.0
3,Wikipedia,3,0.810945,3.166667,11.875,5.0
4,Wikipedia,4,0.830846,5.6,4.421053,4.941176
5,Wikipedia,5,0.800995,8.875,2.21875,3.55
6,Wikipedia,6,0.771144,12.4,1.512195,2.695652
7,Wikipedia,7,0.741294,13.75,1.145833,2.115385
8,Wikipedia,8,0.726368,25.0,0.943396,1.818182
9,Wikipedia,9,0.701493,22.5,0.775862,1.5


## 2.4. Tagger 4 - EPA Lexicon

Uitleg

In [None]:
tag4_EPA = tag_climate_df

In [None]:
#Load the lexicon
EPA_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "EPA", header = None)
EPA_Lexicon = pd.DataFrame(EPA_Lexicon[0])
EPA_Lexicon.columns = ["Lexicon"]

EPA_Lexicon

In [None]:
#Clean the text
tag4_EPA["Text"] = tag4_EPA["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag4_EPA, EPA_Lexicon, 3, 7)

## 2.5. Tagger 5 - BBC Lexicon

Uitleg

In [None]:
tag_BBC = tag_climate_df

In [None]:
#Load the lexicon
BBC_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "EPA", header = None)
BBC_Lexicon = pd.DataFrame(BBC_Lexicon[0])
BBC_Lexicon.columns = ["Lexicon"]

BBC_Lexicon

In [None]:
#Clean the text
tag_BBC["Text"] = tag_BBC["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag_BBC, BBC_Lexicon, 1, 8)

In [None]:
threshold_metrics_2(tag_BBC, BBC_Lexicon, 0, 1, 0.1)

## 2.6. Tagger 6 - UNDP Lexicon

Uitleg

In [None]:
tag_UNDP = tag_climate_df

In [None]:
#Load the lexicon
UNDP_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "UNDP", header = None)
UNDP_Lexicon = pd.DataFrame(UNDP_Lexicon[0])
UNDP_Lexicon.columns = ["Lexicon"]

UNDP_Lexicon

In [None]:
#Clean the text
tag_UNDP["Text"] = tag_UNDP["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag_UNDP, UNDP_Lexicon, 1, 8)

In [None]:
threshold_metrics_2(tag_UNDP, UNDP_Lexicon, 0, 1, 0.1)

## 2.5. Tagger 5 - Global Change Lexicon & IPCC Lexicon

uitleg

In [None]:
tag5_GC_IPCC = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Global Change", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "IPCC", header = None)

GC_IPCC_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
GC_IPCC_Lexicon = pd.DataFrame(GC_IPCC_Lexicon[0])
GC_IPCC_Lexicon.columns = ["Lexicon"]

GC_IPCC_Lexicon

In [None]:
tag5_GC_IPCC["Text"] = tag5_GC_IPCC["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag5_GC_IPCC, GC_IPCC_Lexicon, 1, 8)

## 2.6. Tagger 6 - Global Change Lexicon & Wikipedia Lexicon

In [None]:
tag5_GC_W = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Global Change", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)

GC_W_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
GC_W_Lexicon = pd.DataFrame(GC_W_Lexicon[0])
GC_W_Lexicon.columns = ["Lexicon"]

GC_W_Lexicon

In [None]:
tag5_GC_W["Text"] = tag5_GC_W["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag5_GC_W, GC_W_Lexicon, 1, 8)

## 2.7. Tagger 7 - IPCC Lexicon & Wikipedia Lexicon

Uitleg

In [None]:
tag5_IPCC_W = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "IPCC", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)

IPCC_W_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
IPCC_W_Lexicon = pd.DataFrame(IPCC_W_Lexicon[0])
IPCC_W_Lexicon.columns = ["Lexicon"]

IPCC_W_Lexicon

In [None]:
tag5_IPCC_W["Text"] = tag5_IPCC_W["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag5_IPCC_W, IPCC_W_Lexicon, 1, 8)

## Tagger 8 - Wikipedia Lexicon & EPA Lexicon

Uitleg

In [None]:
tag8_W_EPA = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "EPA", header = None)

W_EPA_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
W_EPA_Lexicon = pd.DataFrame(W_EPA_Lexicon[0])
W_EPA_Lexicon.columns = ["Lexicon"]

W_EPA_Lexicon = W_EPA_Lexicon.drop_duplicates().reset_index(drop = True)

In [None]:
tag8_W_EPA["Text"] = tag8_W_EPA["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag8_W_EPA, W_EPA_Lexicon, 5, 10)

## 2.9. Tagger 9 - UNDP Lexicon & Wikipedia Lexicon

In [None]:
tag_W_UNDP = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "UNDP", header = None)

W_UNDP_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
W_UNDP_Lexicon = pd.DataFrame(W_UNDP_Lexicon[0])
W_UNDP_Lexicon.columns = ["Lexicon"]

W_UNDP_Lexicon = W_UNDP_Lexicon.drop_duplicates().reset_index(drop = True)

In [None]:
tag_W_UNDP["Text"] = tag_W_UNDP["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag_W_UNDP, W_UNDP_Lexicon, 1, 10)

## 2.10. Tagger 10 - UNDP Lexicon & EPA Lexicon

Uitleg

In [None]:
tag_UNDP_EPA = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "EPA", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "UNDP", header = None)

EPA_UNDP_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
EPA_UNDP_Lexicon = pd.DataFrame(EPA_UNDP_Lexicon[0])
EPA_UNDP_Lexicon.columns = ["Lexicon"]

EPA_UNDP_Lexicon = EPA_UNDP_Lexicon.drop_duplicates().reset_index(drop = True)
EPA_UNDP_Lexicon

In [None]:
tag_UNDP_EPA["Text"] = tag_UNDP_EPA["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_1(tag_UNDP_EPA, EPA_UNDP_Lexicon, 10, 20)

In [None]:
threshold_metrics_2(tag_UNDP_EPA, EPA_UNDP_Lexicon, 0, 1, 0.1)

## 2.11. Tagger 11 - UNDP Lexicon & EPA Lexicon & Wikipedia

Uitleg

In [None]:
tag_UNDP_EPA_W = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "EPA", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "UNDP", header = None)
l3 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)

W_EPA_UNDP_Lexicon = pd.concat([l1, l2, l3]).reset_index(drop = True)
W_EPA_UNDP_Lexicon = pd.DataFrame(W_EPA_UNDP_Lexicon[0])
W_EPA_UNDP_Lexicon.columns = ["Lexicon"]

W_EPA_UNDP_Lexicon = W_EPA_UNDP_Lexicon.drop_duplicates().reset_index(drop = True)
W_EPA_UNDP_Lexicon

In [None]:
tag_UNDP_EPA_W["Text"] = tag_UNDP_EPA_W["Text"].apply(preprocess_text)

In [None]:
threshold_metrics_2(tag_UNDP_EPA_W, W_EPA_UNDP_Lexicon, 1, 2, 0.1)

In [None]:
threshold_metrics_1(tag_UNDP_EPA_W, W_EPA_UNDP_Lexicon, 0, 10)

# 3. Tagging Articles

## 3.1. Import Articles