In [6]:
import re

# Read the text file
with open('2024-MRS-Spring-Meeting-Abstract-Program-3-28-2024.txt', 'r',encoding='utf-8') as file:
    data = file.read()

# Split the data into individual records based on the pattern (e.g., time format like '8:15 AM')
records = re.split(r'\d{1,2}:\d{2} AM', data)

# List to store extracted results
extracted_data = []

# Process each record
for record in records:
    # Skip empty records
    if not record.strip():
        continue
    
    # Extract country (last word of the title line)
    country_match = re.search(r";\s*University of .*?,\s*(\w+)", record)
    country = country_match.group(1) if country_match else "Not Found"
    
    # Extract abstract (everything after the title line and before the next blank line)
    abstract_match = re.search(r"\n\s*(.*?)\n\n", record, re.DOTALL)
    abstract = abstract_match.group(1).strip() if abstract_match else "Not Found"
    
    # Save results if both country and abstract are found
    if country != "Not Found" and abstract != "Not Found":
        extracted_data.append({'Country': country, 'Abstract': abstract})

# Output the extracted data
for item in extracted_data:
    print(f"Country: {item['Country']}\nAbstract: {item['Abstract']}\n")


Country: United
Abstract: Superconducting Thin-Films for Quantum Devices with Off-Line Quality Assessment Clara M. Barker, Finn Squires and Susannah C. Speller;

Country: United
Abstract: Teaching Density Functional Theory to Students of Art Conservation Joseph W. Bennett; University of Maryland Baltimore County, United States 
 
As an assistant professor that identifies as LBGTQIA+ and is employed at a minority serving institution, I am well aware of the multitude of issues 
surrounding diversity, equity, inclusion at the university level. My chosen field in STEM is computational materials chemistry and an issue that plagues 
this field is diversity in students, specifically their identities and backgrounds. This is why I’ve spent 3 summers as an instructor for the Baltimore SCIART 
program, teaching computational methods to students interested in art conservation science. While many of the students in the program identify as 
LGBTQIA+, the majority of students that have learned to co

In [10]:
import re
import pdfplumber

def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def process_text(data):
    """Processes text to extract country and abstract."""
    # Split the data into individual records based on the time format (e.g., '8:15 AM')
    records = re.split(r'\d{1,2}:\d{2} AM', data)

    # List to store extracted results
    extracted_data = []

    for record in records:
        # Skip empty records
        if not record.strip():
            continue

        # Extract country (last word of the title line)
        country_match = re.search(r";\s*University of .*?,\s*(\w+)", record)
        country = country_match.group(1) if country_match else "Not Found"

        # Extract abstract (everything after the title line and before the next blank line)
        abstract_match = re.search(r"\n\s*(.*?)\n\n", record, re.DOTALL)
        abstract = abstract_match.group(1).strip() if abstract_match else "Not Found"

        if country != "Not Found" and abstract != "Not Found":
            extracted_data.append({'Country': country, 'Abstract': abstract})

    return extracted_data


In [11]:
# Specify the PDF file path
pdf_file = "2024-MRS-Spring-Meeting-Abstract-Program-3-28-2024.pdf"

# Step 1: Extract text from the PDF
data = extract_text_from_pdf(pdf_file)

# Step 2: Process text to extract country and abstract
results = process_text(data)

# Step 3: Print the extracted data
for item in results:
    print(f"Country: {item['Country']}\nAbstract: {item['Abstract']}\n")

## NEW


In [1]:
import re

def clean_conference_text(text):
    # Split the text into individual lines
    lines = text.split('\n')
    cleaned = []
    current_line = []

    for line in lines:
        stripped = line.strip()
        # Check if the line is empty (after stripping whitespace)
        if not stripped:
            # If there's content in current_line, join and add to cleaned
            if current_line:
                cleaned.append(' '.join(current_line))
                current_line = []
            # Add an empty line to preserve the line break
            cleaned.append('')
        else:
            # Add the stripped line to the current paragraph
            current_line.append(stripped)
    
    # Add any remaining content in current_line
    if current_line:
        cleaned.append(' '.join(current_line))
    
    # Join the cleaned lines, replacing empty strings with actual newlines
    result = '\n'.join(cleaned)
    # Remove excessive empty lines (optional, adjust as needed)
    result = re.sub(r'\n{3,}', '\n\n', result)

    return result

def clean_text(text):
    # Step 1: Replace single line breaks (but keep double line breaks for paragraphs)
    # text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Step 2: Remove unwanted session metadata
    # text = re.sub(r"Final Program\s*–\s*\d{1,2}\.\d{1,2}\.\d{2,4}", "", text)
    # text = re.sub(r"SESSION\s+[A-Z]+\d+\.\d+\s*:\s*.*", "", text)
    text = re.sub(r"Session Chairs?:\s*.*", "", text)
    text = re.sub(r"[A-Za-z]+day\s*(Morning|Afternoon|Evening),?\s*[A-Za-z]+\s+\d{1,2},\s*\d{4}", "", text)

    # Step 3: Remove extra spaces
    # text = re.sub(r"\s+", " ", text).strip()

    return text

def merge_lines(text):
    pattern = re.compile(r'([-,])([^\S\n]*)\n+([^\S\n]*)', flags=re.MULTILINE)
    return pattern.sub(r'\1\2\3', text)

def process_text(text):
    # Remove lines that start with 'Acknowledgement', 'References' or '[1]'
    text = re.sub(r'(?m)^(Acknowledgement|References|\[\d+\]).*$', '', text)

    # Split text into lines and process them
    lines = text.strip().split("\n")
    merged_lines = []
    current_para = ""

    time_pattern = re.compile(r"^\d{1,2}:\d{2} [APM]{2}")  # Match time format

    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip blank lines
        
        if time_pattern.match(line):  # New paragraph starts
            if current_para:
                merged_lines.append(current_para)  # Store the previous paragraph
            current_para = line  # Start a new paragraph
        else:
            current_para += " " + line  # Append to the current paragraph
        
    if current_para:
        merged_lines.append(current_para)  # Add the last paragraph

    return "\n\n".join(merged_lines)



In [8]:
# Example usage
with open("2020-mrs-fall-meeting-abstracts.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

# cleaned_text = clean_conference_text(raw_text)
# cleaned_text = clean_text(raw_text)
cleaned_text = process_text(raw_text)
# cleaned_text = merge_lines(cleaned_text)
# Save the cleaned text
with open("2020-spring-fall-final.txt", "w", encoding="utf-8") as file:
    file.write(cleaned_text)

print("Text cleaning complete. Saved as 'cleaned_conference_text.txt'.")

Text cleaning complete. Saved as 'cleaned_conference_text.txt'.


In [None]:
# import re

# def clean_conference_text(text):
#     # Step 1: Replace single line breaks (but keep double line breaks for paragraphs)
#     # text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

#     # Step 2: Remove unwanted session metadata
#     text = re.sub(r"Final Program\s*–\s*\d{1,2}\.\d{1,2}\.\d{2,4}", "", text)
#     text = re.sub(r"SESSION\s+[A-Z]+\d+\.\d+\s*:\s*.*", "", text)
#     text = re.sub(r"Session Chairs?:\s*.*", "", text)
#     text = re.sub(r"[A-Za-z]+day\s*(Morning|Afternoon|Evening),?\s*[A-Za-z]+\s+\d{1,2},\s*\d{4}", "", text)

#     # Step 3: Remove extra spaces
#     text = re.sub(r"\s+", " ", text).strip()

#     return text

# # Example usage
# with open("23-fall.txt", "r", encoding="utf-8") as file:
#     raw_text = file.read()

# cleaned_text = clean_conference_text(raw_text)

# # Save the cleaned text
# with open("2023-fall.txt", "w", encoding="utf-8") as file:
#     file.write(cleaned_text)

# print("Text cleaning complete. Saved as 'cleaned_conference_text.txt'.")


Text cleaning complete. Saved as 'cleaned_conference_text.txt'.


In [21]:
# import re

# def merge_lines(text):
#     pattern = re.compile(r'([-,])([^\S\n]*)\n+([^\S\n]*)', flags=re.MULTILINE)
#     return pattern.sub(r'\1\2\3', text)

# # Example usage:
# with open("2023-fall.txt", "r", encoding="utf-8") as file:
#     text = file.read()

# cleaned_text = merge_lines(text)

# # Save the cleaned text
# with open("2023-fall_meet.txt", "w", encoding="utf-8") as file:
#     file.write(cleaned_text)

# print("Text cleaning complete. Saved as 'cleaned_conference_text.txt'.")

Text cleaning complete. Saved as 'cleaned_conference_text.txt'.


In [27]:
# import re

# def process_text(text):
#     # Remove lines that start with 'Acknowledgement', 'References' or '[1]'
#     text = re.sub(r'(?m)^(Acknowledgement|References|\[\d+\]).*$', '', text)

#     # Split text into lines and process them
#     lines = text.strip().split("\n")
#     merged_lines = []
#     current_para = ""

#     time_pattern = re.compile(r"^\d{1,2}:\d{2} [APM]{2}")  # Match time format

#     for line in lines:
#         line = line.strip()
#         if not line:
#             continue  # Skip blank lines
        
#         if time_pattern.match(line):  # New paragraph starts
#             if current_para:
#                 merged_lines.append(current_para)  # Store the previous paragraph
#             current_para = line  # Start a new paragraph
#         else:
#             current_para += " " + line  # Append to the current paragraph
        
#     if current_para:
#         merged_lines.append(current_para)  # Add the last paragraph

#     return "\n\n".join(merged_lines)

# # Example text

# with open("2023-fall_meet.txt", "r", encoding="utf-8") as file:
#     text = file.read()

# cleaned_text = process_text(text)

# # Save the cleaned text
# with open("2023-fall_meet_final.txt", "w", encoding="utf-8") as file:
#     file.write(cleaned_text)

# print("Text cleaning complete. Saved as 'cleaned_conference_text.txt'.")

Text cleaning complete. Saved as 'cleaned_conference_text.txt'.


In [7]:
def filter_articles(article_names_file, articles_file, output_file):
    # Read article names from the first file
    with open(article_names_file, "r", encoding="utf-8") as file:
        article_names = set(line.replace("Article: ", "").strip() for line in file if line.startswith("Article: "))

    # Read all articles from the second file
    with open(articles_file, "r", encoding="utf-8") as file:
        articles = file.readlines()

    # Filter articles that contain any of the article names
    filtered_articles = []
    for article in articles:
        if any(name in article for name in article_names):
            filtered_articles.append(article)

    # Write the filtered articles to a new file
    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(filtered_articles)

    print(f"Filtered articles saved to {output_file}")

# Example usage
article_names_file = "../data_scraped/2023-fall.txt"
articles_file = "23-fall-final.txt"
output_file = "2023-fall-final.txt"
filter_articles(article_names_file, articles_file, output_file)


Filtered articles saved to 2023-fall-final.txt


In [1]:
import re

def split_on_pattern(text):
    """Splits lines where a pattern like EL01.12.06 or *EL01.14.08 is detected."""
    pattern = r'(\*?[A-Z]\.[A-Z]{4}\.\d{2})'  # Matches patterns like EL01.12.06 or *EL01.12.06
    paragraphs = text.strip().split("\n\n")  # Preserve one-line gaps
    processed_paragraphs = []
    
    for para in paragraphs:
        parts = re.split(pattern, para)  # Split at every occurrence of the pattern
        processed_paragraphs.extend([part.strip() for part in parts if part.strip()])  # Remove empty parts
    
    return "\n".join(processed_paragraphs)  # Restore one-line gaps


with open("2020-spring-fall-final.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()
# Process the text
split_text = split_on_pattern(raw_text)
# Example usage

with open("2020-spring-fall-final.txt", "w", encoding="utf-8") as file:
    file.write(split_text)

print("Text cleaning complete. Saved as 'cleaned_conference_text.txt'.")


Text cleaning complete. Saved as 'cleaned_conference_text.txt'.


In [35]:
def remove_text_after_keywords(text):
    """
    Removes text after the words 'Reference' or 'Acknowledgement' in each line.
    Keeps the text before these words intact.
    """
    pattern = r'\b(Acknowledgments).*'  # Match "Reference" or "Acknowledgement" and everything after
    lines = text.strip().split("\n")  # Split text into lines
    cleaned_lines = [re.sub(pattern, r'\1', line, flags=re.IGNORECASE) for line in lines]  # Remove text after match
    return "\n".join(cleaned_lines)

with open("2023-spring-final.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()
# Process the text
split_text = remove_text_after_keywords(raw_text)
# Example usage

with open("2023-spring-final.txt", "w", encoding="utf-8") as file:
    file.write(split_text)

print("Text cleaning complete. Saved as 'cleaned_conference_text.txt'.")

Text cleaning complete. Saved as 'cleaned_conference_text.txt'.


In [2]:
def remove_lines(text):
    """"Removes lines with less than 110 characters."""
    lines = text.strip().split("\n")  # Split text into lines
    filtered_lines = [line for line in lines if len(line) >= 250]  # Filter lines
    return "\n".join(filtered_lines)

with open("2020-spring-fall-final.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()
# Process the text
split_text = remove_lines(raw_text)
# Example usage

with open("2020-spring-fall-final.txt", "w", encoding="utf-8") as file:
    file.write(split_text)

print("Text cleaning complete. Saved as 'cleaned_conference_text.txt'.")

Text cleaning complete. Saved as 'cleaned_conference_text.txt'.
