In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import os

### Combine old and new datasets

In [None]:
# df = old dataset, read from data_old.csv
df = pd.read_csv('data_old.csv')
df.head()

In [None]:
news = df["text_original"].to_list()

In [None]:
# Collapse every run of whitespace (newlines included) into a single space.
# The earlier `text.replace("\n", "")` pass was removed: its result was
# overwritten straight away by this comprehension (which reads `news`, not the
# replaced text), so it never affected the output.
clean_text = [" ".join(text.split()) for text in news]

In [None]:
df["text_clean"] = clean_text

In [None]:
# Keep only the columns that go into the combined dataset
# (presumably the same columns as data_new.csv — TODO confirm).
df_min = df[["filename", "text_original", "text_clean","date", "year", "month", "outlet", "country", "south"]]

In [None]:
# df2 = new dataset, without the unnamed leading column stored in the CSV
df2 = pd.read_csv('data_new.csv').drop(columns=['Unnamed: 0'])
df2.head()

In [None]:
# Stack old and new rows, then renumber them 0..n-1 (the helper 'index'
# column produced by reset_index is discarded again).
df3 = pd.concat([df_min, df2]).reset_index().drop(columns=['index'])

In [None]:
len(df3)

In [None]:
# The row index is written as well; on reload it appears as 'Unnamed: 0'
# and is dropped in the next section.
df3.to_csv("data_full.csv")

### Remove duplicates

In [8]:
df = pd.read_csv("data_full.csv")

In [9]:
# Discard the index column that to_csv stored alongside the data.
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,filename,text_original,text_clean,date,year,month,outlet,country,south
0,'Act now'_ APRA issues climate change advice.rtf,"\nNovember 27, 2021 Saturday\nAustralian3 Edit...","November 27, 2021 Saturday Australian3 Edition...",2021-11-01,2021,11,The Australian,Australia,0
1,'Australia must keep 95pc coal in ground'.rtf,"\nSeptember 9, 2021 Thursday\nAustralian Editi...","September 9, 2021 Thursday Australian Edition ...",2021-09-01,2021,9,The Australian,Australia,0
2,"'Ban new coalmines, gas and oil fields to hit ...","\nMay 19, 2021 Wednesday\nAustralian Edition\n...","May 19, 2021 Wednesday Australian Edition Copy...",2021-05-01,2021,5,The Australian,Australia,0
3,"'Big picture' Bill trips again on the details,...","\nMay 1, 2019 Wednesday\nAustralian Edition\n\...","May 1, 2019 Wednesday Australian Edition Copyr...",2019-05-01,2019,5,The Australian,Australia,0
4,'Business is trying to regain the trust of the...,"\nJuly 2, 2019 Tuesday\nAustralian Edition\n\n...","July 2, 2019 Tuesday Australian Edition Copyri...",2019-07-01,2019,7,The Australian,Australia,0


In [10]:
len(df)

124535

In [5]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Derived from: https://saturncloud.io/blog/algorithm-to-detect-similar-documents-in-python-script/

In [6]:
# One cleaned text per row of df, in row order.
documents = df["text_clean"].to_list()

# A pair whose cosine similarity exceeds this value is treated as a duplicate.
threshold = 0.9
vectorizer = TfidfVectorizer()
# Positions in `documents` flagged for removal.
similar_index = []

In [7]:
# Compare each document with every later one on a fixed character window
# ([200:3200]) and flag the earlier document of a near-duplicate pair.
# The TF-IDF model is refitted on each pair, so scores are relative to the
# two texts only.
for i in range(len(documents)):
    text1 = documents[i][200:3200]
    for j in range(i+1, len(documents)):
        text2 = documents[j][200:3200]
        vectors = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(vectors)
        if similarity[1][0] > threshold:
            print (i, j, similarity[1][0])
            similar_index.append(i)
            # Document i is now marked for removal; scanning further would only
            # append i again and waste work (cossim stops at the first hit too).
            break

KeyboardInterrupt: 

In [None]:
# reset_index() inserts a 'level_0' column only when an 'index' column already
# exists, i.e. when this cell is re-run. Ignore the column when it is absent so
# the cell also works on a fresh kernel instead of raising KeyError.
df = df.drop(columns=['level_0'], errors='ignore')
# Remove the flagged documents; the previous index is kept as a column.
df = df.drop(similar_index).reset_index()
len(df)

**Enable multiple cores**

In [None]:
# Rebuild the inputs from the current df for the multi-process run.
documents = df["text_clean"].to_list()

# A pair whose cosine similarity exceeds this value is treated as a duplicate.
threshold = 0.9
vectorizer = TfidfVectorizer()
# Replaced by a manager-backed list in the driver cell below.
similar_index = []

In [None]:
def cossim(i, text1):
    """Find the first later document that is a near-duplicate of document ``i``.

    ``text1`` is compared with the [200:3200] character window of every
    document after position ``i``. Each pair gets its own TF-IDF fit before
    the cosine similarity is computed.

    Returns ``(i, j, score)`` for the first ``j`` whose score exceeds
    ``threshold``, or ``None`` when no later document does.
    """
    j = i + 1
    total = len(documents)
    while j < total:
        pair = [text1, documents[j][200:3200]]
        score = cosine_similarity(vectorizer.fit_transform(pair))[1][0]
        if score > threshold:
            return i, j, score
        j += 1
    return None

In [None]:
def multiprocessing_func(i):
    """Worker task: flag document ``i`` if a later document nearly duplicates it.

    Prints the ``(i, j, score)`` tuple returned by ``cossim`` and appends ``i``
    to the shared ``similar_index``; does nothing when there is no match.
    """
    match = cossim(i, documents[i][200:3200])
    if match is None:
        return
    print(match)
    similar_index.append(i)

How to use multiprocessing, and the difference between Process and Pool: https://urban-institute.medium.com/using-multiprocessing-to-make-python-code-faster-23ea5ef996ba

Use ```multiprocess``` instead of ```multiprocessing``` under ipykernel, or save the functions in a .py file and import them into the notebook.

To append to the same list from multiple processes: https://stackoverflow.com/questions/42490368/appending-to-the-same-list-from-different-processes-using-multiprocessing

In [None]:
if __name__ == '__main__':
    # `mp` was never imported in the notebook. Prefer `multiprocess`, which the
    # note above recommends for ipykernel, and fall back to the standard
    # library when it is not installed.
    try:
        import multiprocess as mp
    except ImportError:
        import multiprocessing as mp

    with mp.Manager() as manager:
        # Rebind the module-level name to a manager-backed list so that the
        # appends made in multiprocessing_func land in one shared list.
        # NOTE(review): presumably relies on workers inheriting this global
        # (fork start method) — confirm on the target platform.
        similar_index = manager.list()

        pool = mp.Pool(7)
        pool.map(multiprocessing_func,range(len(documents)))
        pool.close()
        # Wait for the workers to exit before the manager is shut down.
        pool.join()

        # Copy into a plain list while the manager is still alive.
        similar_index = list(similar_index)

In [None]:
# Number of distinct documents flagged as duplicates.
len(set(similar_index))

In [None]:
# NOTE(review): left disabled. 'level_0' exists only if reset_index() has
# already run on a frame that had an 'index' column.
#df.drop(labels = ['level_0'], axis = 1, inplace = True)
# Remove the flagged documents; reset_index() keeps the old index as a column.
df = df.drop(similar_index).reset_index()
len(df)

In [None]:
df.to_csv("data_dedup.csv")

### Create dataset for manual coding

In [5]:
# Keywords used to pre-filter articles for the coding samples. The sampling
# cells join them with '|' into one case-insensitive pattern, so each entry
# matches as a substring of the article text.
terms = ["tuval","climate change","global warming","globalwarming","greenhouse","pollution","air pollution",
         "water pollution","noise pollution","animal protection","ipcc","copenhagen","kyoto","forest",
         "two degrees","carbon","climate warming","climatic change","warming climate","climatic disruption",
         "climate catastrophe","climate chaos","climate crisis","climate disaster","climate emergency",
         "global heating","climate breakdown","climate threat"]

In [123]:
# Manual benchmark: 27 keyword-matching articles per outlet.
# na=False makes rows with a missing text count as non-matching instead of
# breaking the boolean mask.
keyword_mask = df['text_original'].str.contains('|'.join(terms), case=False, na=False)
# A fixed seed makes the draw reproducible across kernel restarts.
sampled_df = df[keyword_mask].groupby('outlet', group_keys=False).sample(27, random_state=42)
# Remaining pool for the later samples, so the sets cannot overlap.
df2 = df.drop(sampled_df.index)
sampled_df.reset_index(inplace=True)
# Create the folder the files are actually written to (relative to the
# working directory), and tolerate re-runs where it already exists.
os.makedirs("manual_benchmark", exist_ok=True)
sampled_df.to_csv("manual_benchmark/metadata_manual_benchmark.csv")

In [124]:
# Write one text file per sampled article, numbered from 1 in sample order.
# UTF-8 is set explicitly so non-ASCII characters in the articles do not
# depend on the platform's default encoding.
for i, text in enumerate(sampled_df["text_original"], start=1):
    with open(f'manual_benchmark/Text({i}).txt', 'w', encoding='utf-8') as f:
        f.write(text)

In [119]:
# Coder training set: 1 keyword-matching article per outlet from the
# remaining pool.
# na=False makes rows with a missing text count as non-matching instead of
# breaking the boolean mask.
keyword_mask = df2['text_original'].str.contains('|'.join(terms), case=False, na=False)
# A fixed seed makes the draw reproducible across kernel restarts.
df3 = df2[keyword_mask].groupby('outlet', group_keys=False).sample(1, random_state=42)
# Take the drawn rows out of the pool so the test sample cannot reuse them.
df2 = df2.drop(df3.index)
df3.reset_index(inplace = True)
# Create the folder the files are actually written to (relative to the
# working directory), and tolerate re-runs where it already exists.
os.makedirs("coder_training", exist_ok=True)
df3.to_csv("coder_training/metadata_coder_training.csv")

In [120]:
# Write one text file per training article, numbered from 1 in sample order.
# UTF-8 is set explicitly so non-ASCII characters in the articles do not
# depend on the platform's default encoding.
for i, text in enumerate(df3["text_original"], start=1):
    with open(f'coder_training/Text({i}).txt', 'w', encoding='utf-8') as f:
        f.write(text)

In [121]:
# Coder test set: 3 keyword-matching articles per outlet from the remaining
# pool.
# na=False makes rows with a missing text count as non-matching instead of
# breaking the boolean mask.
keyword_mask = df2['text_original'].str.contains('|'.join(terms), case=False, na=False)
# A fixed seed makes the draw reproducible across kernel restarts.
df4 = df2[keyword_mask].groupby('outlet', group_keys=False).sample(3, random_state=42)
df4.reset_index(inplace = True)
# Create the folder the files are actually written to (relative to the
# working directory), and tolerate re-runs where it already exists.
os.makedirs("coder_test", exist_ok=True)
df4.to_csv("coder_test/metadata_coder_test.csv")

In [122]:
# Write one text file per test article, numbered from 1 in sample order.
# UTF-8 is set explicitly so non-ASCII characters in the articles do not
# depend on the platform's default encoding.
for i, text in enumerate(df4["text_original"], start=1):
    with open(f'coder_test/Text({i}).txt', 'w', encoding='utf-8') as f:
        f.write(text)

### Intercoder reliability

In [None]:
import pandas as pd
import krippendorff

In [None]:
# Load the two coders' spreadsheets.
# NOTE(review): absolute, user-specific paths — these only resolve on the
# original author's machine.
cod1 = pd.read_excel("/Users/xiyan/Downloads/Coding_CoderTest3_Damiano.xlsx")
cod2 = pd.read_excel("/Users/xiyan/Downloads/Coding_CoderTest3_Pascal.xlsx")

In [None]:
# Give each coder's 'RA' column its own name so both can sit side by side
# after the merge on 'ID'.
cod1 = cod1.rename(columns={'RA': 'RA1'})
cod2 = cod2.rename(columns={'RA': 'RA2'})

In [None]:
# Pair the two coders' decisions on the shared 'ID' (inner join: only items
# present in both sheets are kept).
coders = cod1.merge(cod2, on='ID')
# Fixed: the preview referenced the undefined name `coder` (NameError).
coders.head()

In [None]:
len(coders)

In [None]:
coders.groupby(['RA1','RA2'])['ID'].count()

In [None]:
# Krippendorff's alpha for the two coders: each list is one coder's codes over
# the merged items, treated as nominal categories.
krippendorff.alpha((coders['RA1'].values.tolist(), 
                    coders['RA2'].values.tolist()), level_of_measurement="nominal")

In [None]:
coders.groupby(['RA1','RA2']).get_group((1,0))

In [None]:
coders.groupby(['RA1','RA2']).get_group((0,1))

### Inductive label identification

In [17]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import bigrams
import re
from collections import Counter

In [3]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
# Load the final dataset without the unnamed leading column stored in the CSV.
df = pd.read_csv('data_final.csv').drop(columns=['Unnamed: 0'])

In [4]:
df.head()

Unnamed: 0,index,filename,text_clean,date,year,month,outlet,country,south,text_original,climate change,global warming,greenhouse effect,climate warming,climatic change,greenhouse warming,warming climate,climatic disruption,climate catastrophe,climate chaos,climate crisis,climate disaster,climate emergency,global heating,climate breakdown,climate threat,label_sum,neutral,urgent,climate change_binary,global warming_binary,greenhouse effect_binary,climate warming_binary,climatic change_binary,greenhouse warming_binary,warming climate_binary,climatic disruption_binary,climate catastrophe_binary,climate chaos_binary,climate crisis_binary,climate disaster_binary,climate emergency_binary,global heating_binary,climate breakdown_binary,climate threat_binary,neutral_binary,urgent_binary,date_new
0,0,'Act now'_ APRA issues climate change advice.rtf,"November 27, 2021 Saturday Australian3 Edition...",2021-11-01,2021,11,The Australian,Australia,0,"\nNovember 27, 2021 Saturday\nAustralian3 Edit...",4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2021-11-01
1,1,'Australia must keep 95pc coal in ground'.rtf,"September 9, 2021 Thursday Australian Edition ...",2021-09-01,2021,9,The Australian,Australia,0,"\nSeptember 9, 2021 Thursday\nAustralian Editi...",3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,5,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2021-09-01
2,2,"'Ban new coalmines, gas and oil fields to hit ...","May 19, 2021 Wednesday Australian Edition Copy...",2021-05-01,2021,5,The Australian,Australia,0,"\nMay 19, 2021 Wednesday\nAustralian Edition\n...",3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2021-05-01
3,7,'Climate change_ It's really not for us'.rtf,"July 11, 2019 Thursday Australian Edition Copy...",2019-07-01,2019,7,The Australian,Australia,0,"\nJuly 11, 2019 Thursday\nAustralian Edition\n...",6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2019-07-01
4,8,'Climate changes investment' - GLOBAL FOOD FOR...,"March 21, 2019 Thursday Australian Edition Cop...",2019-03-01,2019,3,The Australian,Australia,0,"\nMarch 21, 2019 Thursday\nAustralian Edition\...",4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2019-03-01


In [7]:
# Lower-cased article texts used for all inductive searches below.
texts = df["text_clean"].to_list()
texts =[x.lower() for x in texts]
# Target words: climat/climate, heat/heating, warm/warming, greenhouse.
# The previous pattern used character classes (`[e]*`, `[ing]*`), which match
# any run of those letters (e.g. "warmin", "heatgg") rather than an optional
# suffix. Optional groups plus word boundaries match exactly the intended
# words, which is what the bigram filter compares tokens against.
pattern = re.compile(r"\b(?:climate?|heat(?:ing)?|warm(?:ing)?|greenhouse)\b", re.IGNORECASE)

In [22]:
# Collect every bigram in which at least one token equals a word matched by
# `pattern` in the same article.
all_bigrams = []

for article in texts:
    # A set gives constant-time membership tests; the list returned by findall
    # was scanned once per bigram.
    matches = set(pattern.findall(article))
    if not matches:
        # No target word in this article, so no bigram can qualify; skip the
        # costly tokenisation.
        continue
    words = word_tokenize(article)
    all_bigrams.extend(
        (word1, word2)
        for word1, word2 in bigrams(words)
        if word1 in matches or word2 in matches
    )

In [25]:
# Tally the bigrams and keep those seen at least 100 times, in first-seen order.
element_counts = Counter(all_bigrams)
filtered_list = []
for element, count in element_counts.items():
    if count >= 100:
        filtered_list.append(element)

In [27]:
len(filtered_list)

712

In [26]:
filtered_list

[('on', 'climate'),
 ('climate', 'change'),
 ('to', 'climate'),
 ('of', 'climate'),
 ('global', 'warming'),
 ('warming', 'to'),
 ('cop26', 'climate'),
 ('climate', 'summit'),
 ("'s", 'climate'),
 ('and', 'climate'),
 ('climate', 'scientist'),
 ('safe', 'climate'),
 ('for', 'climate'),
 ('climate', ','),
 ('climate', 'policy'),
 ('warming', 'at'),
 ('their', 'climate'),
 ('un', 'climate'),
 ('climate', 'conference'),
 ('that', 'climate'),
 ('while', 'climate'),
 ('heating', 'of'),
 ('said', 'climate'),
 ('global', 'climate'),
 ('body', 'climate'),
 ('global', 'greenhouse'),
 ('greenhouse', 'gas'),
 ('tackle', 'climate'),
 ('curb', 'climate'),
 ('changing', 'climate'),
 ('reduce', 'greenhouse'),
 ('not', 'climate'),
 ('former', 'climate'),
 ('climate', 'commissioner'),
 ('in', 'climate'),
 ('climate', '.'),
 ('.', 'climate'),
 ('the', 'climate'),
 ('paris', 'climate'),
 ('with', 'climate'),
 ('our', 'climate'),
 ('warming', 'with'),
 ('from', 'climate'),
 ('climate', 'as'),
 ('climate', 

In [36]:
def extract_sentence(text, query):
    """Collect the '.'-delimited sentences of ``text`` that mention ``query``.

    The check runs on a copy of each sentence in which every character other
    than word characters, whitespace and '.' is replaced by a space; the
    sentence itself is returned unmodified. Matching is case-sensitive.

    Returns one string in which each matching sentence is prefixed with '.',
    or '' when nothing matches.
    """
    hits = []
    for candidate in text.split('.'):
        normalized = re.sub(r'[^\w\s.]', ' ', candidate)
        if query not in normalized:
            continue
        hits.append('.' + candidate)
    return ''.join(hits)

In [32]:
# Candidate labels whose sentences are pulled out in the next cell. They are
# written in lower case because `texts` is lower-cased and extract_sentence
# matches case-sensitively.
names = ["changing climate", 'climate risk', 'climate problem', 'severe climate', 'climate extremes', 
         'extreme climate', 'dangerous climate', 'climate challenge', 'climate risks', 'climate wars',
        'climate issue', 'climate concerns', 'climate changes', 'catastrophic climate', 'climate hazards',
        'warmer climate', 'climate disasters', 'abrupt climate', 'climate damage', 'catastrophic warming',
        'planetary warming', 'atmospheric warming', 'climate warms', 'greenhouse warming']

In [53]:
# Map each candidate label to the non-empty extracts found across all articles
# (one entry per article that mentions the label).
sentences_dict = {}

for name in names:
    extracts = (extract_sentence(article, name) for article in texts)
    sentences_dict[name] = [extract for extract in extracts if extract]

In [56]:
import pickle

# Persist the label -> list-of-extracts mapping so the extraction loop does
# not have to be re-run.
with open('inductive_identification_sentences.pkl', 'wb') as fp:
    pickle.dump(sentences_dict, fp)

In [81]:
sentences_dict["climate problem"]

['. methane was also a focus of discussion, with a new satellite due to launch in 2023 that will help monitor concentrations of the harmful emission, the second-largest contributor to greenhouse warming after carbon dioxide',
 '. once begun, sulfate aerosol injection would have to continue indefinitely in order to avoid a large, ecologically devastating spike in global temperatures from continued greenhouse warming — and ever-increasing amounts of sulfate would be needed over time because the aerosols will tend to accrete into larger droplets, becoming less reflective and more prone to falling back to earth',
 ". even if greenhouse warming is a looming crisis, assigning canada emission reduction targets that are identical to other countries turns canada into a sacrificial lamb to global environmentalism. canada's 30 million people could stop living tomorrow, and the trend of greenhouse warming would not change",
 '."" the article says few people realize how many ifs, ands and buts over

In [89]:
len(sentences_dict["greenhouse warming"])

130