## Extracting sentences with spacy from the SemEval dataset

### Dev files 2023 dataset - templates

In [None]:
%pip install spacy


In [91]:
import os
import re
import spacy

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

directory_dev = '..\\SemEval_23_en\\en\\dev-labels-subtask-3-spans'
articles_dev = '..\\SemEval_23_en\\en\\dev-articles-subtask-3'

def read_labels_file(labels_file_path):
    with open(labels_file_path, 'r',  encoding='utf-8') as labels_file:
        # Read all lines from the file
        lines = labels_file.readlines()

        # Remove leading and trailing whitespaces from each line
        cleaned_lines = [line.strip() for line in lines]

        # Return the list of cleaned lines
        return cleaned_lines

def read_text_file(text_file_path, start_position, end_position):
    with open(text_file_path, 'r', encoding='utf-8') as text_file:
        text_file.seek(start_position)
        text = text_file.read(end_position - start_position)
        return text

def extract_article_number(file_path):
    # Extract the sequence of numbers from the file path
    match = re.search(r'article(\d+)', file_path)
    if match:
        return match.group(1)
    else:
        return None

def process_text_with_spacy(text, text_data):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract sentences containing the extracted fragments
    relevant_sentences = []
    for sent in doc.sents:
        if any(str(fragment).lower().strip() in sent.text.lower().strip() for fragment in text_data):

            relevant_sentences.append(sent.text)
        else:
             for fragment in text_data:
                fragment=fragment.split(".")
                for part in fragment:
                    if str(part).lower().strip() in sent.text.lower().strip() :

                        if len(relevant_sentences)>0:

                            relevant_sentences[0]=relevant_sentences[0]+sent.text
                        else:
                            relevant_sentences.append(sent.text)

    return relevant_sentences

def main():
    txt_files = []
    label_files = []
    

    for filename in os.listdir(directory_dev):
        f = os.path.join(directory_dev, filename)
        if os.path.isfile(f):
            f = str(f).replace('\\', '/')
            if f.endswith('.txt'):
                label_files.append(str(f))

    for filename in os.listdir(articles_dev):
        f = os.path.join(articles_dev, filename)
        if os.path.isfile(f):
            f = str(f).replace('\\', '/')
            if f.endswith('.txt'):
                txt_files.append(str(f))

    with open("..\\txt\\dev_23_semEval.txt", "w", encoding='utf-8') as output:

     for text_file_path in label_files:
        labels_line = read_labels_file(text_file_path)
        columns = []
        for line in labels_line:
            l = line.split('\t')
            columns.append(l)

        for list in columns:
            text_data = []
            if   "Whataboutism" in list[1]:
                
                start_position, end_position = map(int, list[2:])
                article = extract_article_number(text_file_path)
                text_file_path = text_file_path.replace('-spans', '')
                text_file_path = text_file_path.replace('labels', 'articles')
                text_file_path = text_file_path.replace('-articles-subtask-3.txt', '.txt')
                extracted_text = read_text_file(text_file_path, start_position, end_position)
                extracted_text=extracted_text.replace('“','')
                extracted_text=extracted_text.replace('”','')
                extracted_text=extracted_text.replace('"','')
                extracted_text=extracted_text.replace('[…]','')
                extracted_text=extracted_text.replace('U.S.','')
                
                a=read_text_file(text_file_path, 0,-1)
                # Print the extracted text and article number
                print(f"\nArticle {article} - Technique: {list[1]} - Extracted Text:", extracted_text)
                text_data.append(extracted_text)

                # Process the extracted text with spaCy
                relevant_sentences = process_text_with_spacy(a, text_data)

                # Print or save relevant sentences
                for sentence in relevant_sentences:
                    print(f"\nRelevant Sentence: {sentence}")
                    sentence=sentence.replace('\n','')
                    output.write(str(sentence) + "\n\n")
            
            elif len(list) >= 2 and "Red_Herring" in list[1]:
                start_position, end_position = map(int, list[2:])
                article = extract_article_number(text_file_path)
                text_file_path = text_file_path.replace('-spans', '')
                text_file_path = text_file_path.replace('labels', 'articles')
                text_file_path = text_file_path.replace('-articles-subtask-3.txt', '.txt')
                extracted_text = read_text_file(text_file_path, start_position, end_position)
                extracted_text=extracted_text.replace('“','')
                extracted_text=extracted_text.replace('”','')
                extracted_text=extracted_text.replace('"','')
                extracted_text=extracted_text.replace('[…]','')
                extracted_text=extracted_text.replace('U.S.','')

                a=read_text_file(text_file_path, 0,-1)
                # Print the extracted text and article number
                print(f"\nArticle {article} - Technique: {list[1]} - Extracted Text:", extracted_text)
                text_data.append(extracted_text)

                # Process the extracted text with spaCy
                relevant_sentences = process_text_with_spacy(a, text_data)

                # Print or save relevant sentences
                for sentence in relevant_sentences:
                    print(f"\nRelevant Sentence: {sentence}")
                    sentence=sentence.replace('\n','')
                    output.write(str(sentence) + "\n\n")
                
if __name__ == "__main__":
    main()





Article 813452859 - Technique: Red_Herring - Extracted Text:  people; that's a piece of technology which is manufactured in China, uses American technology and these are two countries we deal with on WTO terms, this isn't a fantasy, stuck in a port somewhere, there isn't a massive tariff, this is the world that really exi

Relevant Sentence: I often use the example of an iPhone to people; that's a piece of technology which is manufactured in China, uses American technology and these are two countries we deal with on WTO terms, this isn't a fantasy, stuck in a port somewhere, there isn't a massive tariff, this is the world that really exists today.


Article 813494037 - Technique: Red_Herring - Extracted Text: ke Dyer (@Miked2372Mike) 31 декабря 2018 г.
Someone 

Relevant Sentence: Another shameless attempt at using party politics on what is supposed to be a happy occasion — droneguy (@shelbyguitars) 1 января 2019 г.
Politicising another innocent event that should be no different to an

### Train files - templates

In [92]:
import os
import re
import spacy

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

directory_dev = '..\\SemEval_23_en\\en\\train-labels-subtask-3-spans'
articles_dev = '..\\SemEval_23_en\\en\\train-articles-subtask-3'

def read_labels_file(labels_file_path):
    with open(labels_file_path, 'r', encoding='utf-8') as labels_file:
        # Read all lines from the file
        lines = labels_file.readlines()

        # Remove leading and trailing whitespaces from each line
        cleaned_lines = [line.strip() for line in lines]

        # Return the list of cleaned lines
        return cleaned_lines

def read_text_file(text_file_path, start_position, end_position):
    with open(text_file_path, 'r', encoding='utf-8') as text_file:
        text_file.seek(start_position)
        text = text_file.read(end_position - start_position)
        return text

def extract_article_number(file_path):
    # Extract the sequence of numbers from the file path
    match = re.search(r'article(\d+)', file_path)
    if match:
        return match.group(1)
    else:
        return None

def process_text_with_spacy(text, text_data):
    # Process the text with spaCy
    text=text.replace('“','')
    text=text.replace('”','')
    text=text.replace('"','')
    text=text.replace('[…]','')
    doc = nlp(text)

    # Extract sentences containing the extracted fragments
    relevant_sentences = []
    for sent in doc.sents:
        if any(str(fragment).lower() in sent.text.lower() for fragment in text_data):
            if  not sent.text.strip().startswith('.'):
                relevant_sentences.append(sent.text)
                
            
        
        else:
             for fragment in text_data:
                fragment= re.split(r'\.[\n\r\s]+', fragment)
                for part in fragment:
                    if len(str(part))>15:
                        if str(part).lower() in sent.text.lower() :

                            relevant_sentences.append(sent.text)

    return relevant_sentences

def main():
    txt_files = []
    label_files = []
    #text_data = []

    for filename in os.listdir(directory_dev):
        f = os.path.join(directory_dev, filename)
        if os.path.isfile(f):
            f = str(f).replace('\\', '/')
            if f.endswith('.txt'):
                label_files.append(str(f))

    for filename in os.listdir(articles_dev):
        f = os.path.join(articles_dev, filename)
        if os.path.isfile(f):
            f = str(f).replace('\\', '/')
            if f.endswith('.txt'):
                txt_files.append(str(f))
    
    with open("..\\txt\\train_23_semEval.txt", "w", encoding='utf-8') as output:

     for text_file_path in label_files:
       
        labels_line = read_labels_file(text_file_path)
        columns = []
        for line in labels_line:
            l = line.split('\t')
            columns.append(l)

        for list in columns:
            text_data=[]
            if   "Whataboutism" in list[1]:
                
                start_position, end_position = map(int, list[2:])
                article = extract_article_number(text_file_path)
                text_file_path = text_file_path.replace('-spans', '')
                text_file_path = text_file_path.replace('labels', 'articles')
                text_file_path = text_file_path.replace('-articles-subtask-3.txt', '.txt')
                extracted_text = read_text_file(text_file_path, start_position, end_position)
                extracted_text=extracted_text.replace('“','')
                extracted_text=extracted_text.replace('”','')
                extracted_text=extracted_text.replace('"','')
                extracted_text=extracted_text.replace('[…]','')
                extracted_text=extracted_text.replace('U.S.','')

                
                a=read_text_file(text_file_path, 0,-1)
                # Print the extracted text and article number
                print(f"\nArticle {article} - Technique: {list[1]} - Extracted Text:", extracted_text)

                
                text_data.append(extracted_text)

                # Process the extracted text with spaCy
                relevant_sentences = process_text_with_spacy(a, text_data)

                # Print or save relevant sentences
                for sentence in relevant_sentences:
                    print(f"\nRelevant Sentence: {sentence}")
                
                
                    sentence=sentence.replace('\n','')
                    output.write(str(sentence) + "\n\n")

            elif len(list) >= 2 and "Red_Herring" in list[1]:
                start_position, end_position = map(int, list[2:])
                article = extract_article_number(text_file_path)
                text_file_path = text_file_path.replace('-spans', '')
                text_file_path = text_file_path.replace('labels', 'articles')
                text_file_path = text_file_path.replace('-articles-subtask-3.txt', '.txt')
                extracted_text = read_text_file(text_file_path, start_position, end_position)
                extracted_text=extracted_text.replace('“','')
                extracted_text=extracted_text.replace('”','')
                extracted_text=extracted_text.replace('"','')
                extracted_text=extracted_text.replace('[…]','')
                extracted_text=extracted_text.replace('U.S.','')
                
                if start_position>100:
                     a=read_text_file(text_file_path, start_position-100, end_position+100)
                else:
                        a=read_text_file(text_file_path, 0, end_position+100)

                #a=read_text_file(text_file_path, 0,-1)
                # Print the extracted text and article number
                print(f"\nArticle {article} - Technique: {list[1]} - Extracted Text:", extracted_text)
                text_data.append(extracted_text)

                # Process the extracted text with spaCy
                relevant_sentences = process_text_with_spacy(a, text_data)

                # Print or save relevant sentences
                
                for sentence in relevant_sentences:
                    print(f"\nRelevant Sentence: {sentence}")
                    sentence=sentence.replace('\n','')
                    output.write(str(sentence) + "\n\n")

    print(len(relevant_sentences))

if __name__ == "__main__":
    main()


Article 111111132 - Technique: Red_Herring - Extracted Text:  or on Facebook posts. Instead, the question will be

Relevant Sentence: But the program’s fate will not be resolved by popular vote or on Facebook posts.

Relevant Sentence: Instead, the question will be tried in the courtroom of Judge David A. Faber of the U.S. District Court for the Southern District of

Article 696246189 - Technique: Red_Herring - Extracted Text: et was talking about has been identified. While officials did not reveal the woman’s name, th

Relevant Sentence: According to ABC News, the ‘mystery woman’ everyone on the internet was talking about has been identified.

Relevant Sentence: While officials did not reveal the woman’s name, they did tell reporters she was a prostitute.

Article 701837665 - Technique: Red_Herring - Extracted Text: or assassination aro

Relevant Sentence: same motive that motivated the CIA and Pentagon to target other political leaders for regime change or assassination around that 

## Selecting texts without whataboutsm nor red herring

In [69]:
%pip install chardet

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable
Collecting chardet

  Downloading chardet-5.2.0-py3-none-any.whl (199 kB)


You should consider upgrading via the 'c:\Program Files\Python310\python.exe -m pip install --upgrade pip' command.


Installing collected packages: chardet
Successfully installed chardet-5.2.0


In [None]:
import os
import re
import spacy
import chardet

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

directory_dev = '..\\SemEval_23_en\\en\\train-labels-subtask-3-spans'
articles_dev = '..\\SemEval_23_en\\en\\train-articles-subtask-3'

def read_labels_file(labels_file_path):
    with open(labels_file_path, 'r', encoding='utf-8') as labels_file:
        # Read all lines from the file
        lines = labels_file.readlines()

        # Remove leading and trailing whitespaces from each line
        cleaned_lines = [line.strip() for line in lines]

        # Return the list of cleaned lines
        return cleaned_lines

def read_text_file(text_file_path, start_position, end_position):
    
    with open(text_file_path, 'rb') as text_file:
        raw_data = text_file.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']
    
    with open(text_file_path, 'r', encoding=encoding, errors='replace') as text_file:
        text_file.seek(start_position)
        text = text_file.read(end_position - start_position)
        return text

def extract_article_number(file_path):
    # Extract the sequence of numbers from the file path
    match = re.search(r'article(\d+)', file_path)
    if match:
        return match.group(1)
    else:
        return None

def process_text_with_spacy(text, text_data):
    # Process the text with spaCy
    text=text.replace('“','')
    text=text.replace('”','')
    text=text.replace('"','')
    text=text.replace('[…]','')
    doc = nlp(text)

    # Extract sentences containing the extracted fragments
    relevant_sentences = []
    for sent in doc.sents:
        if any(str(fragment).lower() in sent.text.lower() for fragment in text_data):
            if not sent.text.strip().startswith('.'):
                relevant_sentences.append(sent.text)
                            
        
        else:
            for fragment in text_data:
                fragment= re.split(r'\.[\n\r\s]+', fragment)
                for part in fragment:
                    if len(str(part)) > 15:
                        if str(part).lower() in sent.text.lower() :
                            relevant_sentences.append(sent.text)

    return relevant_sentences

def main():
    txt_files = []
    label_files = []

    for filename in os.listdir(directory_dev):
        f = os.path.join(directory_dev, filename)
        if os.path.isfile(f):
            f = str(f).replace('\\', '/')
            if f.endswith('.txt'):
                label_files.append(str(f))

    myset = set()
    i = 0

    while i < 500:

        for text_file_path in label_files:
           
            labels_line = read_labels_file(text_file_path)
            columns = []

            for line in labels_line:
                l = line.split('\t')
                columns.append(l)

            for list in columns:
                text_data=[]

                if "Whataboutism" not in list[1] and "Red_Herring" not in list[1]:
                    
                    start_position, end_position = map(int, list[2:])
                    article = extract_article_number(text_file_path)
                    text_file_path = text_file_path.replace('-spans', '')
                    text_file_path = text_file_path.replace('labels', 'articles')
                    text_file_path = text_file_path.replace('-articles-subtask-3.txt', '.txt')
                    extracted_text = read_text_file(text_file_path, start_position, end_position)
                    extracted_text=extracted_text.replace('“','')
                    extracted_text=extracted_text.replace('”','')
                    extracted_text=extracted_text.replace('"','')
                    extracted_text=extracted_text.replace('[…]','')
                    extracted_text=extracted_text.replace('U.S.','')

                    if start_position>50:
                     a=read_text_file(text_file_path, start_position-50, end_position+50)
                    else:
                        a=read_text_file(text_file_path, 0, end_position+50)
                    text_data.append(extracted_text)

                    # Process the extracted text with spaCy
                    relevant_sentences = process_text_with_spacy(a, text_data)

                    # Print or save relevant sentences
                    
                        
                        
                        

                    if i < 500:
                        for sentence in relevant_sentences:
                            myset.add(sentence)
                            i+=1
                            
                    
                    else:
                        #print(i, myset)
                        break       


    #print(len(myset))
    with open("..\\txt\\500_extracted_semeval.txt", "w", encoding='utf-8') as output:
        for s in myset:
            #print(s)
            #try removing newlines from the extracted texts because they will create more csv lines
            s=s.replace('\n','')
            output.write(str(s) + "\n\n")

if __name__ == "__main__":
    main()


# 2021

In [12]:
import os
import re
import spacy

directory = 'C:\\Users\\lored\\OneDrive\\Desktop\\DigitalHumanities\\benaltrismo\\data\\Data\\datasets-v5\\tasks-2-3\\train'
txt_files = []
label_files = []
text_data = []
result_set=set()

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

def read_labels_file(labels_file_path):
    with open(labels_file_path, 'r') as labels_file:
        # Read all lines from the file
        lines = labels_file.readlines()

        # Remove leading and trailing whitespaces from each line
        cleaned_lines = [line.strip() for line in lines]

        # Return the list of cleaned lines
        return cleaned_lines
    
def read_text_file(text_file_path, start_position, end_position):
    with open(text_file_path, 'r', encoding='utf-8') as text_file:
        text_file.seek(start_position)
        text = text_file.read(end_position - start_position)
        return text

def extract_article_number(file_path):
    # Extract the sequence of numbers from the file path
    match = re.search(r'article(\d+)', file_path)
    if match:
        return match.group(1)
    else:
        return None

def process_text_with_spacy(text, text_data):
    # Process the text with spaCy
    text=text.replace('“','')
    text=text.replace('”','')
    text=text.replace('"','')
    text=text.replace('[…]','')
    doc = nlp(text)

    # Extract sentences containing the extracted fragments
    relevant_sentences = []
    for sent in doc.sents:
        if any(str(fragment).lower() in sent.text.lower() for fragment in text_data):
            if not sent.text.strip().startswith('.'):
                relevant_sentences.append(sent.text)
                            
        
        else:
            for fragment in text_data:
                fragment= re.split(r'\.[\n\r\s]+', fragment)
                for part in fragment:
                    if len(str(part)) > 15:
                        if str(part).lower() in sent.text.lower() :
                            relevant_sentences.append(sent.text)

    return relevant_sentences

def main():
   with open("..\\txt\\train_2021.txt", "w", encoding='utf-8') as output:
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            f = str(f).replace('\\', '/')
            if f.endswith('.txt'):
                txt_files.append(str(f))
            elif f.endswith('.task3.labels'):
                label_files.append(str(f))

    for text in label_files:
        # Read the line from the labels file
        labels_line = read_labels_file(text)
        columns = []

        # Extract information from the labels line
        for line in labels_line:
            l = line.split('\t')
            columns.append(l)

        for list in columns:
            if len(list) >= 2 and "Whataboutism" in list[1]:
                start_position, end_position = map(int, list[2:])
                text_file_path = text.replace('.task3.labels', '.txt')
                article = extract_article_number(text)
                extracted_text = read_text_file(text_file_path, start_position, end_position)

                # Process the extracted text with spaCy
                #relevant_sentences = process_text_with_spacy(extracted_text)

                # Print or save relevant sentences
                #for sentence in relevant_sentences:
                    #print(f"Article {article} - Technique: {list[1]} - Extracted Text:", extracted_text)
                    #print(f"Relevant Sentence: {sentence}")
                    #sentence = sentence.replace('\n', '')
                    #text_data.append(sentence)

                a=read_text_file(text_file_path, 0,-1)
                # Print the extracted text and article number
                print(f"\nArticle {article} - Technique: {list[1]} - Extracted Text:", extracted_text)

                
                text_data.append(extracted_text)

                # Process the extracted text with spaCy
                relevant_sentences = process_text_with_spacy(a, text_data)

                # Print or save relevant sentences
                for sentence in relevant_sentences:
                    print(f"\nRelevant Sentence: {sentence}")
                
                
                    sentence=sentence.replace('\n','')
                    #output.write(str(sentence) + "\n\n")
                    result_set.add(str(sentence))


            elif len(list) >= 2 and "Red_Herring" in list[1]:
                start_position, end_position = map(int, list[2:])
                text_file_path = text.replace('.task3.labels', '.txt')
                article = extract_article_number(text)
                extracted_text = read_text_file(text_file_path, start_position, end_position)

                # Process the extracted text with spaCy
                #relevant_sentences = process_text_with_spacy(extracted_text)

                # Print or save relevant sentences
                #for sentence in relevant_sentences:
                   # print(f"Article {article} - Technique: {list[1]} - Extracted Text:", extracted_text)
                    #print(f"Relevant Sentence: {sentence}")
                    #sentence = sentence.replace('\n', '')
                    #text_data.append(sentence)


                a=read_text_file(text_file_path, 0,-1)
                # Print the extracted text and article number
                print(f"\nArticle {article} - Technique: {list[1]} - Extracted Text:", extracted_text)

                
                text_data.append(extracted_text)

                # Process the extracted text with spaCy
                relevant_sentences = process_text_with_spacy(a, text_data)

                # Print or save relevant sentences
                for sentence in relevant_sentences:
                    print(f"\nRelevant Sentence: {sentence}")
                
                
                    sentence=sentence.replace('\n','')
                    #output.write(str(sentence) + "\n\n")
                    result_set.add(str(sentence))

    for sentence in result_set:

        output.write(str(sentence) + "\n\n")
        
if __name__ == "__main__":
    main()

# Save the relevant sentences to a file
#with open("..\\txt\\_2021.txt", "w", encoding='utf-8') as output:
   # for sentence in text_data:
       # output.write(str(sentence) + "\n\n")



Article 111111113 - Technique: Whataboutism - Extracted Text: able that politicians across this country continue to endanger the lives of Americans with sanctuary policies while ignoring the harm inflicted on their cons

Relevant Sentence: It is unconscionable that politicians across this country continue to endanger the lives of Americans with sanctuary policies while ignoring the harm inflicted on their constituents.



Article 111111132 - Technique: Red_Herring - Extracted Text: e or on Facebook posts.
Instead, the question will be 

Relevant Sentence: But the program’s fate will not be resolved by popular vote or on Facebook posts.


Relevant Sentence: Instead, the question will be tried in the courtroom of Judge David A. Faber of the U.S. District Court for the Southern District of West Virginia in Bluefield.


Article 696246189 - Technique: Red_Herring - Extracted Text:  was talking about has been identified.
While officials did not reveal the woman’s name, they 

Relevant Sente