# 0. Packages and Functions

## 0.1. Packages

In [28]:
import pandas as pd
import string
from nltk import ngrams
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix

## 0.2. Functions

In [29]:
#pre-process text for lexicon based approaches
def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    # remove blank spaces
    text = ' '.join(text.split())
    # remove newline characters
    text = text.replace('\n', '')
    return text

In [30]:
#Create function that take the dataframe, lexicon and n-gram value (how many n-grams should be considered) and determine the 
#count of words in dataframe text that match the lexicon
def count_lexicon_words(text_df, lexicon):
    lexicon = lexicon["Lexicon"]
    count = []
    
    for text in text_df["Text"]:
        lexicon_matches = 0
    
        for word in lexicon:
            if word.lower() in text:
                lexicon_matches += 1
        
        count.append(lexicon_matches)
        
    text_df["Lexicon Count"] = count
    
    return(text_df)

#create a function that used the lexicon approach to determine with the target is yes or no
def lexicon_target_classifier(df, treshold):
    target = []
    
    count = df["Lexicon Count"]
    
    for c in count:
        if c < treshold:
            target.append("No")
        else:
            target.append("Yes")
            
    df["Target Lexicon"] = target
    return(df)

#Combine both functions to classify articles based on the lexicon
def lexicon_climate_classifier(text_df, lexicon, treshold):
    df = count_lexicon_words(text_df, lexicon)
    
    return(lexicon_target_classifier(df, treshold))

def threshold_metrics(df_text, lexicon, min_treshhold, max_treshhold):
    for i in range(min_treshhold, max_treshhold + 1):
        df = lexicon_climate_classifier(df_text, lexicon, i)
        cross_table = pd.crosstab(df['Target'], df['Target Lexicon'], margins=True)
        
        # calculate classification metrics using scikit-learn
        accuracy = (cross_table.iloc[0,0] + cross_table.iloc[1,1]) / cross_table.loc['All','All']
        precision = cross_table.iloc[1,1] / cross_table.iloc[0,1] if cross_table.iloc[0,1] != 0 else 0
        recall = cross_table.iloc[1,1] / cross_table.iloc[1,0] if cross_table.iloc[1,0] != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        # print the metrics
        print("Threshhold:", i)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score:", f1_score)
        print("\n")


Accuracy: This metric measures the overall performance of a model. It is defined as the number of correct predictions divided by the total number of predictions. Accuracy is a good metric to use when the classes are roughly balanced, meaning there are about the same number of positive and negative examples in the dataset.

Precision: This metric measures how many of the positive predictions made by a model are actually correct. It is defined as the number of true positives divided by the total number of positive predictions. Precision is a good metric to use when we care more about avoiding false positives than false negatives.

Recall: This metric measures how many of the positive examples in the dataset are correctly predicted by the model. It is defined as the number of true positives divided by the total number of actual positive examples. Recall is a good metric to use when we care more about avoiding false negatives than false positives.

F1 score: This metric is a weighted average of precision and recall, where the weight is determined by the beta parameter. The most common value for beta is 1, which gives equal weight to precision and recall. The F1 score is a good metric to use when we want to balance precision and recall, and when the classes are imbalanced.

# 1. Import Label Dataset

In [31]:
tag_climate_df = pd.read_parquet("Final_Label_Table.parquet")
tag_climate_df.head(5)

Unnamed: 0,Text,Link,Is_climate,Sentiment_Label,Climate_Change_Topic,Level_Climate_Change_Topic,doubt,Sentiment_Label_R,Climate_Change_Topic_R,Level_Climate_Change_Topic_R
0,More than a dozen state attorneys general gath...,https://www.washingtonpost.com/news/energy-env...,True,1,Yes,Medium,False,TBD,TBD,TBD
1,Sen. Jeff Merkley of Oregon endorsed Bernie S...,http://www.wsj.com/articles/campaign-wire-1460...,True,0,Yes,Small,False,TBD,TBD,TBD
2,When Carmen Luna moved to a neighborhood on t...,https://www.wsj.com/articles/mexico-city-strug...,True,-1,Yes,Medium,False,TBD,TBD,TBD
3,As ocean warming continues to trigger widespre...,https://www.washingtonpost.com/national/health...,True,1,Yes,High,False,TBD,TBD,TBD
4,PG&E Corp. told California regulators that it...,https://www.wsj.com/articles/pg-e-equipment-mi...,True,-1,Yes,Medium,False,TBD,TBD,TBD


In [32]:
#Only keep the required columns
tag_climate_df = tag_climate_df[["Text", "Climate_Change_Topic", "Level_Climate_Change_Topic"]]

In [33]:
#Clean the tabel
tag_climate_df['Level_Climate_Change_Topic'] = tag_climate_df['Level_Climate_Change_Topic'].str.strip()
tag_climate_df[tag_climate_df["Level_Climate_Change_Topic"] == "NA"] = "Na"
tag_climate_df[tag_climate_df["Level_Climate_Change_Topic"] == "0"] = "Na"
tag_climate_df["Target"] = tag_climate_df["Level_Climate_Change_Topic"].apply(lambda x: "Yes" if x in ["High", "Medium"] else "No")

In [34]:
overview_labels_hms = tag_climate_df.groupby("Level_Climate_Change_Topic")["Text"].count().reset_index()

In [35]:
overview_labels_hms

Unnamed: 0,Level_Climate_Change_Topic,Text
0,High,71
1,Medium,32
2,Na,43
3,Small,55


In [36]:
overview_labels = tag_climate_df.groupby("Target")["Text"].count().reset_index()

In [37]:
overview_labels

Unnamed: 0,Target,Text
0,No,98
1,Yes,103


In [None]:
#overview_labels_hms.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels_hms", index = False)
#overview_labels.to_csv("C:/Users/Boedt/OneDrive/Bureaublad/R Thesis/overview_tag_labels", index = False)

# 2. Taggers

## 2.1. Tagger 1 - Global Change Lexicon

Uitleg

In [None]:
#Store the dataframe in a different one, for the purpose of this lexicon. This way there is no confusion.
tag1_Global_Change = tag_climate_df

In [None]:
#Load the lexicon
Global_Change_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "Global Change", header = None)
Global_Change_Lexicon.columns = ["Lexicon"]

In [None]:
#Clean the text
tag1_Global_Change["Text"] = tag1_Global_Change["Text"].apply(preprocess_text)

In [None]:
threshold_metrics(tag1_Global_Change, Global_Change_Lexicon, 1, 8)

## 2.2. Tagger 2 - IPCC Lexicon

Uitleg

In [None]:
#Store the dataframe in a different one, for the purpose of this lexicon. This way there is no confusion.
tag2_IPCC = tag_climate_df

In [None]:
#Load the lexicon
IPCC_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "IPCC", header = None)
IPCC_Lexicon.columns = ["Lexicon"]

IPCC_Lexicon

In [None]:
#Clean the text
tag2_IPCC["Text"] = tag2_IPCC["Text"].apply(preprocess_text)

In [None]:
threshold_metrics(tag2_IPCC, IPCC_Lexicon, 1, 8)

## 2.3. Tagger 3 - Wikipedia Lexicon

Uitleg

In [38]:
#Store the dataframe in a different one, for the purpose of this lexicon. This way there is no confusion.
tag3_Wikipedia = tag_climate_df

In [39]:
#Load the lexicon
Wikipedia_Lexicon = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)
Wikipedia_Lexicon = pd.DataFrame(Wikipedia_Lexicon[0])
Wikipedia_Lexicon.columns = ["Lexicon"]

Wikipedia_Lexicon

Unnamed: 0,Lexicon
0,"100,000-year problem"
1,adaptation
2,additionality
3,albedo
4,anoxic event
...,...
159,volcanism
160,water vapor
161,weather
162,World Climate Report


In [40]:
#Clean the text
tag3_Wikipedia["Text"] = tag3_Wikipedia["Text"].apply(preprocess_text)

In [41]:
threshold_metrics(tag3_Wikipedia, Wikipedia_Lexicon, 1, 8)

Threshhold: 1
Accuracy: 0.527363184079602
Precision: 1.0842105263157895
Recall: 0
F1 score: 0.0


Threshhold: 2
Accuracy: 0.527363184079602
Precision: 1.0842105263157895
Recall: 0
F1 score: 0.0


Threshhold: 3
Accuracy: 0.746268656716418
Precision: 4.714285714285714
Recall: 1.7837837837837838
F1 score: 2.588235294117647


Threshhold: 4
Accuracy: 0.6567164179104478
Precision: 18.0
Recall: 0.5373134328358209
F1 score: 1.0434782608695652


Threshhold: 5
Accuracy: 0.5671641791044776
Precision: 17.0
Recall: 0.19767441860465115
F1 score: 0.3908045977011494


Threshhold: 6
Accuracy: 0.5174129353233831
Precision: 0
Recall: 0.061855670103092786
F1 score: 0.0


Threshhold: 7
Accuracy: 0.4975124378109453
Precision: 0
Recall: 0.019801980198019802
F1 score: 0.0


Threshhold: 8
Accuracy: 0.4975124378109453
Precision: 0
Recall: 0.019801980198019802
F1 score: 0.0




## 2.4. Tagger 4 - Twitter Lexicon

Uitleg

## 2.5. Tagger 5 - Global Change Lexicon & IPCC Lexicon

uitleg

In [None]:
tag5_GC_IPCC = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Global Change", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "IPCC", header = None)

GC_IPCC_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
GC_IPCC_Lexicon = pd.DataFrame(GC_IPCC_Lexicon[0])
GC_IPCC_Lexicon.columns = ["Lexicon"]

GC_IPCC_Lexicon

In [None]:
tag5_GC_IPCC["Text"] = tag5_GC_IPCC["Text"].apply(preprocess_text)

In [None]:
threshold_metrics(tag5_GC_IPCC, GC_IPCC_Lexicon, 1, 8)

## 2.6. Tagger 6 - Global Change Lexicon & Wikipedia Lexicon

In [None]:
tag5_GC_W = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Global Change", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)

GC_W_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
GC_W_Lexicon = pd.DataFrame(GC_W_Lexicon[0])
GC_W_Lexicon.columns = ["Lexicon"]

GC_W_Lexicon

In [None]:
tag5_GC_W["Text"] = tag5_GC_W["Text"].apply(preprocess_text)

In [None]:
threshold_metrics(tag5_GC_W, GC_W_Lexicon, 1, 8)

## 2.7. Tagger 7 - IPCC Lexicon & Wikipedia Lexicon

Uitleg

In [None]:
tag5_IPCC_W = tag_climate_df

In [None]:
#Load the lexicon
l1 = pd.read_excel('lexicons-used.xlsx', sheet_name = "IPCC", header = None)
l2 = pd.read_excel('lexicons-used.xlsx', sheet_name = "Wikipedia", header = None)

IPCC_W_Lexicon = pd.concat([l1, l2]).reset_index(drop = True)
IPCC_W_Lexicon = pd.DataFrame(IPCC_W_Lexicon[0])
IPCC_W_Lexicon.columns = ["Lexicon"]

IPCC_W_Lexicon

In [None]:
tag5_IPCC_W["Text"] = tag5_IPCC_W["Text"].apply(preprocess_text)

In [None]:
threshold_metrics(tag5_IPCC_W, IPCC_W_Lexicon, 1, 8)

# 3. Tagging Articles

## 3.1. Import Articles