## Check with some actual inputs

In [22]:
from typing import List
from typing import Tuple
import pandas as pd
import numpy as np

def find_term_indexes(string, lexicon):
    term_indexes = []
    string_length = len(string)
    
    for term in lexicon:
        term_length = len(term)
        start_index = 0
        
        while start_index < string_length:
            index = string.find(term, start_index)
            
            if index == -1:
                break
            
            term_indexes.append((index, index + term_length))
            start_index = index + 1
    
    return term_indexes


START_TOKEN = " [B-ASP]"
END_TOKEN = "[E-ASP] "

def highlight_focuswords(text: str, indices: List[Tuple[int, int]]) -> str:
    """
    Wraps [B-ASP] before and [E-ASP] after focuswords in the text,
    such that the ABSA model knows which words to do sentiment analysis on.

    Args:
        text (str): text where focuswords will be highlighted
        indices (List): start and end position of each focusword in the text

    Returns:
        str: Text with highlighted focuswords ready for sentiment analysis.
    """
    text_start = 0

    text_buffer = []

    for focusword_start, focusword_end in indices:
        # add text between focuswords
        text_buffer.append(text[text_start:focusword_start])
        # add focusword
        text_buffer.append(START_TOKEN)
        text_buffer.append(text[focusword_start:focusword_end])
        text_buffer.append(END_TOKEN)
        text_start = focusword_end

    text_buffer.append(text[text_start:])

    return ''.join(text_buffer)

In [6]:
# my_string = "This is a sample string with some terms in the lexicon."
# my_lexicon = ["sample string", "terms", "lexicon"]

# indexes = find_term_indexes(my_string, my_lexicon)
# print(indexes)

# highlight_focuswords(my_string, indexes)

In [15]:
df

Unnamed: 0,Text,Link,Final_Climate_Change_Level_Label,Final_Sentiment_Label,indexes,highlighted_text
0,More than a dozen state attorneys general gath...,https://www.washingtonpost.com/news/energy-env...,Medium,-1,"[(2548, 2561), (251, 258), (528, 535), (908, 9...",More than a dozen state attorneys general gath...
1,When Carmen Luna moved to a neighborhood on t...,https://www.wsj.com/articles/mexico-city-strug...,Medium,-1,"[(6518, 6525), (6518, 6532), (6191, 6198)]",When Carmen Luna moved to a neighborhood on t...
2,As ocean warming continues to trigger widespre...,https://www.washingtonpost.com/national/health...,High,-1,"[(2646, 2653), (3455, 3462), (2646, 2660), (34...",As ocean warming continues to trigger widespre...
3,PG&E Corp. told California regulators that it...,https://www.wsj.com/articles/pg-e-equipment-mi...,Medium,-1,"[(2586, 2593), (2586, 2600), (2332, 2339)]",PG&E Corp. told California regulators that it...
4,The world’s top central banks can do more to ...,https://www.wsj.com/articles/central-banks-cou...,High,1,"[(93, 100), (538, 545), (1164, 1171), (1893, 1...",The world’s top central banks can do more to ...
...,...,...,...,...,...,...
95,Environmentalists are not happy with the Trump...,https://www.washingtonpost.com/news/monkey-cag...,High,-1,"[(141, 148), (482, 489), (141, 155)]",Environmentalists are not happy with the Trump...
96,President-elect Joe Biden said Thursday he pl...,https://www.wsj.com/articles/biden-picks-north...,High,1,"[(860, 867), (1035, 1042), (1241, 1248), (1600...",President-elect Joe Biden said Thursday he pl...
97,WASHINGTON—President-elect Joe Biden is neari...,https://www.wsj.com/articles/biden-closes-in-o...,High,0,"[(232, 239), (487, 494), (2111, 2118), (4492, ...",WASHINGTON—President-elect Joe Biden is neari...
98,President Trump’s recent blowup over General ...,https://www.wsj.com/articles/the-electric-kool...,Medium,-1,"[(1042, 1049), (3116, 3123), (3116, 3130), (31...",President Trump’s recent blowup over General ...


In [32]:
lexicon = pd.read_csv('EPA_Lexicon')
lexicon=lexicon['Lexicon'].to_list()
lexicon

df = pd.read_parquet('Climate_Labels_Dataset.parquet')
df= df[(df['Final_Climate_Change_Level_Label']=='High') | (df['Final_Climate_Change_Level_Label']=='Medium')].reset_index(drop=True)
df = df.drop(['Sentiment_Label', 'Sentiment_Label_R', 'Level_Climate_Change_Topic', 'Level_Climate_Change_Topic_R', 'was_I_retarded?'], axis=1)
# df['Text'] = '@S@ ' + df['Text']
df['indexes'] = df['Text'].apply(lambda x: find_term_indexes(x, lexicon))
df['highlighted_text'] = df.apply(lambda x: highlight_focuswords(x['Text'], x['indexes']), axis=1)
text_list = df['highlighted_text'][:10].to_list()
text_list_clean = df['Text'][:10].to_list()

df['highlighted_text'][:10]
# {id}.{dataset name}.{type}.dat.apc

df['Text'][3]

' PG&E Corp. told California regulators that its power equipment might have contributed to igniting a recent wildfire that has killed four people. The utility disclosed in securities filings that it notified the California Public Utilities Commission on Friday it had recorded alarms on certain equipment supporting a power line that served an area east of Redding, Calif., where the Zogg Fire is believed to have originated in Shasta County, near Oregon. The fire has burned more than 56,000 acres and destroyed 204 structures since it started late last month, according to the California Department of Forestry and Fire Protection. The blaze, which forced evacuations in the area, was almost fully contained as of Friday, according to Cal Fire. PG&E said state fire investigators have taken possession of some of its equipment as part of their probe into the cause of the fire. The company said the information is preliminary and that it has no information about the cause of the fire. It said it d

In [27]:
df['highlighted_text'].to_csv('climate.climate.valid.dat.apc', index=False)
inference_sets = 'climate.climate.valid.dat.apc'


In [16]:
classifier = APC.SentimentClassifier('english',
                                     auto_device=True,  # False means load model on CPU
                                     cal_perplexity=True,
                                     )


[2023-05-18 12:11:05] (2.3.1) [32mDownloading checkpoint:english [0m
[2023-05-18 12:11:05] (2.3.1) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m
[2023-05-18 12:11:05] (2.3.1) Checkpoint already downloaded, skip
[2023-05-18 12:11:15] (2.3.1) Load sentiment classifier from checkpoints\APC_ENGLISH_CHECKPOINT\fast_lcf_bert_English_acc_84.65_f1_82.39
[2023-05-18 12:11:15] (2.3.1) config: checkpoints\APC_ENGLISH_CHECKPOINT\fast_lcf_bert_English_acc_84.65_f1_82.39\fast_lcf_bert.config
[2023-05-18 12:11:15] (2.3.1) state_dict: checkpoints\APC_ENGLISH_CHECKPOINT\fast_lcf_bert_English_acc_84.65_f1_82.39\fast_lcf_bert.state_dict
[2023-05-18 12:11:15] (2.3.1) model: None
[2023-05-18 12:11:15] (2.3.1) tokenizer: checkpoints\APC_ENGLISH_CHECKPOINT\fast_lcf_bert_English_acc_84.65_f1_82.39\fast_lcf_bert.tokenizer
[2023-05-18 12:11:27] (2.3.1) Set Model Device: cpu
[2023-05-18 12:11:27] (2.3.1) Device Name: Unknown


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT e

In [None]:
text_list


 " When Carmen Luna moved to a neighborhood on the outskirts of Mexico City in 1975, there was no sewage system. To get water, she carried buckets to and from a faucet in the street. At the end of the 1980s, her house was connected to the grid; her family would get tamarind-colored water three days a week. Last year, Ms. Luna signed up for a new rainwater-harvesting program led by Mexico City Mayor Claudia Sheinbaum, an environmental scientist. The city government had teamed up with local nonprofit Isla Urbana to install 100,000 of its rainwater-harvesting systems in the districts of Xochimilco and Iztapalapa. Today, 20% of the water used in Ms. Luna’s eight-member household comes from a 2,500-liter collection tank on the ground floor. While not potable, the water is available seven days a week and can be used for household chores and bathing, easing strain on the grid. Harvesting rainwater is a centuries-old practice, but Isla Urbana’s system is cheaper, faster and easier to install t

In [31]:
from pyabsa.utils.file_utils import convert_apc_set_to_atepc_set
from pyabsa.functional import ABSADatasetList

convert_apc_set_to_atepc_set(inference_sets)

ImportError: cannot import name 'convert_apc_set_to_atepc_set' from 'pyabsa.utils.file_utils' (c:\ProgramData\Anaconda3\envs\pyabsa\lib\site-packages\pyabsa\utils\file_utils\__init__.py)

In [35]:

# instance inference
apc_result = classifier.predict(text_list,
                   save_result=True,
                   print_result=True,  # print the result
                   ignore_error=True,  # ignore the error when the model cannot predict the input
                   )

len(apc_result)



Want to know how your actions can help make a difference for our planet? Sign up for the Climate Coach newsletter, in your inbox every Tuesday and Thursday.
But the undercurrent of Tuesday’s public announcement, which included former vice president and climate activist Al Gore taking a turn at the podium, was anything but subtle: New York Attorney General Eric Schneiderman and his counterparts from around the country vowed to “collectively, collaboratively and aggressively” investigate whether fossil fuel companies such as ExxonMobil have misled shareholders and the public about what they knew — and when — about the risks of climate change.
“We have heard the scientists; we know what’s happening to the planet,” Schneiderman said. “But there is confusion, sowed by those with an interest in profiting from the confusion and creating misperceptions in the eyes of the American public.”
Advertisement
Schneiderman began investigating ExxonMobil last fall, subpoenaing documents from the compan

  stack = [e for item, _ in tokens_and_encodings for e in item[key]]
preparing apc inference dataloader: 100%|██████████| 136/136 [00:03<00:00, 44.35it/s]
run inference:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# # Example DataFrame
# df = pd.DataFrame({
#     'text': ['ab', 'bc'],
#     'aspect': [['climate change', 'climate change', 'climate change'], ['emissions']],
#     'sentiment': [['Negative', 'Negative', 'Negative', 'Negative', 'Negative'], ['Neutral']],
#     'confidence': [[0.5525599122047424, 0.5525599122047424, 0.5525599122047424, 0.5525599122047424, 0.5525599122047424], [0.6038094162940979]],
#     'probs': [[[0.5525599, 0.13998558, 0.30745444], [0.5525599, 0.13998558, 0.30745444]], [[0.071085736, 0.6038094, 0.32510495]]],
#     'ref_sentiment': [[-100, -100, -100, -100, -100, -100, -100, -100], [-100]],
#     'ref_check': [[], []],
#     'perplexity': [1.159379, 1.159271]
# })

# text_list = ['a ab bcaa', 'bbbbb', 'cccccc']

# # Add a column called 'original index'
# df['original index'] = df['text'].apply(lambda x: next((i for i, text in enumerate(text_list) if x in text), None))

# # Display the updated DataFrame
# text_list_clean



 " When Carmen Luna moved to a neighborhood on the outskirts of Mexico City in 1975, there was no sewage system. To get water, she carried buckets to and from a faucet in the street. At the end of the 1980s, her house was connected to the grid; her family would get tamarind-colored water three days a week. Last year, Ms. Luna signed up for a new rainwater-harvesting program led by Mexico City Mayor Claudia Sheinbaum, an environmental scientist. The city government had teamed up with local nonprofit Isla Urbana to install 100,000 of its rainwater-harvesting systems in the districts of Xochimilco and Iztapalapa. Today, 20% of the water used in Ms. Luna’s eight-member household comes from a 2,500-liter collection tank on the ground floor. While not potable, the water is available seven days a week and can be used for household chores and bathing, easing strain on the grid. Harvesting rainwater is a centuries-old practice, but Isla Urbana’s system is cheaper, faster and easier to install t

In [None]:
# for i in range(0, len(text_list_clean)):
#     print(text_bit in text_list_clean[1])

# text_list_clean[2]
# text_bit
df = pd.DataFrame(apc_result[1:],columns=apc_result[0])

df = pd.DataFrame(apc_result[1:],columns=apc_result[0])

In [None]:
df = pd.DataFrame(apc_result[1:],columns=apc_result[0])
#print(df['text'][0])
print(text_list_clean[0])

More than a dozen state attorneys general gathered in New York earlier this week, ostensibly to announce their support for President Obama’s efforts to combat global warming and to underscore their intention to collaborate on investigations involving climate-related issues.
Want to know how your actions can help make a difference for our planet? Sign up for the Climate Coach newsletter, in your inbox every Tuesday and Thursday.
But the undercurrent of Tuesday’s public announcement, which included former vice president and climate activist Al Gore taking a turn at the podium, was anything but subtle: New York Attorney General Eric Schneiderman and his counterparts from around the country vowed to “collectively, collaboratively and aggressively” investigate whether fossil fuel companies such as ExxonMobil have misled shareholders and the public about what they knew — and when — about the risks of climate change.
“We have heard the scientists; we know what’s happening to the planet,” Schn

In [329]:
search = df['text'][0].replace(" ", "")
colors = list(map(lambda x: x.lower(), text_list_clean))
# search = str.strip(df['text'][3])replace(" ", "")

result = [color for color in colors if search in color]  
#print(str.strip(df['text'][3]))
print(str.strip(colors[3]))
result

str.strip(df['text'][0]).replace(" ", "") in str.strip(colors[0]).replace(" ", "")

# print(str.strip(df['text'][3]).replace(" ", ""))
# print(str.strip(colors[3]).replace(" ", ""))
result

pg&e corp. told california regulators that its power equipment might have contributed to igniting a recent wildfire that has killed four people. the utility disclosed in securities filings that it notified the california public utilities commission on friday it had recorded alarms on certain equipment supporting a power line that served an area east of redding, calif., where the zogg fire is believed to have originated in shasta county, near oregon. the fire has burned more than 56,000 acres and destroyed 204 structures since it started late last month, according to the california department of forestry and fire protection. the blaze, which forced evacuations in the area, was almost fully contained as of friday, according to cal fire. pg&e said state fire investigators have taken possession of some of its equipment as part of their probe into the cause of the fire. the company said the information is preliminary and that it has no information about the cause of the fire. it said it doe

[]

In [293]:
colors = list(map(lambda x: x.lower(), text_list_clean))
# search = str.strip(df['text'][3])replace(" ", "")

result = [color for color in colors if search in color]  
#print(str.strip(df['text'][3]))
print(str.strip(colors[3]))
result

str.strip(df['text'][3]).replace(" ", "") == str.strip(colors[3]).replace(" ", "")

pg&e corp. told california regulators that its power equipment might have contributed to igniting a recent wildfire that has killed four people. the utility disclosed in securities filings that it notified the california public utilities commission on friday it had recorded alarms on certain equipment supporting a power line that served an area east of redding, calif., where the zogg fire is believed to have originated in shasta county, near oregon. the fire has burned more than 56,000 acres and destroyed 204 structures since it started late last month, according to the california department of forestry and fire protection. the blaze, which forced evacuations in the area, was almost fully contained as of friday, according to cal fire. pg&e said state fire investigators have taken possession of some of its equipment as part of their probe into the cause of the fire. the company said the information is preliminary and that it has no information about the cause of the fire. it said it doe

False

In [307]:
# new_list = [elem.strip().lower().replace(" ", "") for elem in text_list_clean]
#
text_list_clean = list(map(lambda x: x.lower(), text_list_clean))
df = pd.DataFrame(apc_result[1:],columns=apc_result[0])
df['text'] = df['text'].str.lower().str.replace(" ", "").str.strip()
df['original index'] = df['text'].apply(lambda x: [i for i, val in enumerate(text_list_clean) if x in val])
df

Unnamed: 0,text,aspect,sentiment,confidence,probs,ref_sentiment,ref_check,perplexity,original index
0,morethanadozenstateattorneysgeneralgatheredinn...,"[ climate , climate , climate , climate , ...","[Negative, Negative, Negative, Negative, Negat...","[0.5562916398048401, 0.5562916398048401, 0.556...","[[0.55629164, 0.13986734, 0.30384097], [0.5562...","[-100, -100, -100, -100, -100, -100, -100, -10...","[, , , , , , , , , ]",1.161239,[]
1,morethanadozenstateattorneysgeneralgatheredinn...,"[ climate change , climate change , climate ...","[Negative, Negative, Negative, Negative, Negat...","[0.5525599122047424, 0.5525599122047424, 0.552...","[[0.5525599, 0.13998558, 0.30745444], [0.55255...","[-100, -100, -100, -100, -100, -100, -100, -100]","[, , , , , , , ]",1.161239,[]
2,emissionsfossilfuelglobalwarmingandtounderscor...,[ emissions ],[Neutral],[0.5405316948890686],"[[0.10012412, 0.5405317, 0.35934418]]",[-100],[],1.161316,[]
3,fossilfuelglobalwarmingandtounderscoretheirint...,[ fossil fuel ],[Neutral],[0.6148998737335205],"[[0.12043682, 0.6148999, 0.2646633]]",[-100],[],1.161125,[]
4,globalwarmingandtounderscoretheirintentiontoco...,[ global warming ],[Neutral],[0.5071446299552917],"[[0.07538332, 0.50714463, 0.41747203]]",[-100],[],1.161097,[]
5,climateclimatechangeweathergetslesspredictable...,[ climate ],[Negative],[0.508012592792511],"[[0.5080126, 0.40992013, 0.0820673]]",[-100],[],1.161786,[]
6,"climatechangeweathergetslesspredictable,househ...",[ climate change ],[Negative],[0.4667898118495941],"[[0.4667898, 0.4180928, 0.11511733]]",[-100],[],1.161848,[]
7,"weathergetslesspredictable,householdsinsomepar...",[ weather ],[Negative],[0.594940185546875],"[[0.5949402, 0.28947398, 0.115585804]]",[-100],[],1.161863,[]
8,asoceanwarmingcontinuestotriggerwidespreaddest...,"[ climate , climate ]","[Negative, Negative]","[0.7205307483673096, 0.7205307483673096]","[[0.72053075, 0.06050208, 0.2189672], [0.72053...","[-100, -100]","[, ]",1.161162,[]
9,"climatechangeisgoingtobetheendofreefs,’andiabs...","[ climate change , climate change ]","[Positive, Positive]","[0.5153318643569946, 0.5153318643569946]","[[0.2517378, 0.23293038, 0.51533186], [0.25173...","[-100, -100]","[, ]",1.161312,[]


In [255]:
# import pandas as pd
# len(apc_result)
# df = pd.DataFrame(apc_result[1:],columns=apc_result[0])
# df

# # Iterate over the rows to concatenate the values

# # Iterate over the DataFrame rows
# for i in range(len(df)):
#     if df.loc[i, 'text'].startswith('@S@'):
#         j = i + 1  # Start index of the next row
#         while j < len(df) and not df.loc[j, 'text'].startswith('@S@'):
#             # Append sentiment and confidence lists to the current row
#             df.loc[i, 'sentiment'].extend(df.loc[j, 'sentiment'])
#             df.loc[i, 'confidence'].extend(df.loc[j, 'confidence'])
#             j += 1

# # Delete rows that do not start with '@S@'
# df = df[df['text'].str.startswith('@S@')].reset_index(drop=True)
# len(df)

# # Change 'text' column to only contain the first 20 words
# df['text'] = df['text'].str.split().str[:15].str.join(' ')

# # Iterate over the DataFrame rows
# for i in range(len(df)):
#     if df.loc[i, 'text'].startswith('@S@'):
#         current_text = df.loc[i, 'text']
#         j = i + 1  # Start index of the next row
#         while j < len(df) and df.loc[j, 'text'].startswith(current_text):
#             # Append sentiment and confidence lists to the current row
#             df.loc[i, 'sentiment'].extend(df.loc[j, 'sentiment'])
#             df.loc[i, 'confidence'].extend(df.loc[j, 'confidence'])
#             j += 1

# # Drop the rows that start with the same value in the 'text' column
# df = df.drop_duplicates(subset='text').reset_index(drop=True)

# # Display the updated DataFrame
# df_result = df
# df_result

Unnamed: 0,text,aspect,sentiment,confidence,probs,ref_sentiment,ref_check,perplexity


In [236]:
from statistics import mean
# prob * sent 
# Map sentiment values to numerical values
sentiment_mapping = {"Positive": 1, "Negative": -1, "Neutral": 0}

# Create a new column 'original_text' in the dataframe
df_result['original_text'] = ""

# Iterate over each value in the 'Text' column
for idx, value in df_result['text'].items():
    for text in text_list:
        if value in text:
            df_result.loc[idx, 'original_text'] = text
            break

# Convert sentiment values to numeric and multiply with confidence values
df_result['final_sentiment'] = df_result.apply(lambda row: mean([sentiment_mapping[s] * c for s, c in zip(row['sentiment'], row['confidence'])]), axis=1)
df_result




ValueError: Cannot set a DataFrame with multiple columns to the single column final_sentiment

In [None]:
df = pd.read_parquet('Climate_Labels_Dataset.parquet')
df= df[(df['Final_Climate_Change_Level_Label']=='High') | (df['Final_Climate_Change_Level_Label']=='Medium')].reset_index(drop=True)

lexicon = pd.read_csv('EPA_Lexicon')
lexicon=lexicon['Lexicon'].to_list()
lexicon
df['indexes'] = df['Text'].apply(lambda x: find_term_indexes(x, lexicon))
df['highlighted_text'] = df.apply(lambda x: highlight_focuswords(x['Text'], x['indexes']), axis=1)
text_list = df['highlighted_text'].to_list()
len(text_list)

100