In [134]:
import json
import re

# List of first-level headings to apply the logic to
level_1_keys = [
    "Ideas, Information, and Inquiry (Triple-I) (FY-TRIPLE)", "Global Language (GLBL-LANG)", 
    "Aesthetic and Interpretive Analysis (FC-AESTH)", "Creative Expression, Practice, and Production (FC-CREATE)",
    "Engagement with the Human Past (FC-PAST)", "Ethical and Civic Values (FC-VALUES)", 
    "Global Understanding and Engagement (FC-GLOBAL)", "Natural Scientic Investigation (FC-NATSCI)", 
    "Power, Difference, and Inequality (FC-POWER)", "Quantitative Reasoning (FC-QUANT)", 
    "Ways of Knowing (FC-KNOWING)", "Empirical Investigation Lab", "Research and Discovery (RESEARCH)"
]

def reformat_json(data):
    # General Regex pattern to match the course code, number, and everything up to the next course number
    course_pattern = re.compile(r'(\b\w{4}\b)\s*(\d{3})(.+?)(?=\s*\b\w{4}\b\s*\d{3}|$)', re.DOTALL)

    def split_courses(courses_text):
        # Find all course entries
        course_entries = course_pattern.findall(courses_text)
        # Sort the courses into buckets based on the course number
        course_buckets = {}
        for code, number, details in course_entries:
            course_number = int(number)
            # Determine the course number range
            range_key = f"{(course_number // 100) * 100} to {((course_number // 100) + 1) * 100 - 1}"
            # Add course string to the correct bucket
            course_description = f"{code} {number}{details.strip()}"
            course_buckets.setdefault(range_key, []).append(course_description)
        return course_buckets

    # Iterate over each first-level heading
    for major_heading, content_list in data.items():
        if major_heading in level_1_keys and isinstance(content_list, list):
            # Process each item in the content list only if the heading is in the specified keys
            for i, content in enumerate(content_list):
                if isinstance(content, dict):
                    # Iterate over each dictionary in the list
                    for sub_key, sub_value in content.items():
                        if isinstance(sub_value, str):
                            # Apply the split_courses function to strings with course listings
                            split_courses_data = split_courses(sub_value)
                            if split_courses_data:
                                content[sub_key] = split_courses_data
                        elif isinstance(sub_value, list):
                            # If the value is a list, process each item in the list
                            content[sub_key] = [split_courses(item) if isinstance(item, str) else item for item in sub_value]
                elif isinstance(content, str):
                    # Directly apply the split_courses function to strings at the list level if necessary
                    split_courses_data = split_courses(content)
                    if split_courses_data:
                        content_list[i] = split_courses_data

    return data

# Assuming 'json_data' is your original JSON data
with open('chunk_1032_1240_new.json', 'r') as file:
    json_data = json.load(file)

reformatted_json = reformat_json(json_data)

# Write the modified JSON to a new file
with open('modified_chunk_1032_1240_new.json', 'w') as file:
    json.dump(reformatted_json, file, indent=4)


In [135]:
import json
import re

level_1_keys = [
    "Ideas, Information, and Inquiry (Triple-I) (FY-TRIPLE)", "Global Language (GLBL-LANG)", 
    "Aesthetic and Interpretive Analysis (FC-AESTH)", "Creative Expression, Practice, and Production (FC-CREATE)",
    "Engagement with the Human Past (FC-PAST)", "Ethical and Civic Values (FC-VALUES)", 
    "Global Understanding and Engagement (FC-GLOBAL)", "Natural Scientic Investigation (FC-NATSCI)", 
    "Power, Difference, and Inequality (FC-POWER)", "Quantitative Reasoning (FC-QUANT)", 
    "Ways of Knowing (FC-KNOWING)", "Empirical Investigation Lab", "Research and Discovery (RESEARCH)"
]

def reformat_json(data):
    # Regex pattern to match the course code, number, and everything up to the next course number
    course_pattern = re.compile(r'(\b\w{4}\b)\s*(\d{3})(.+?)(?=\s*\b\w{4}\b\s*\d{3}|$)', re.DOTALL)

    def split_courses(courses_text):
        # Find all course entries
        course_entries = course_pattern.findall(courses_text)
        # Sort the courses into buckets based on the course code
        course_buckets = {}
        for code, number, details in course_entries:
            # Add course string to the correct bucket
            course_description = f"{code} {number}{details.strip()}"
            course_buckets.setdefault(code, []).append(course_description)
        return course_buckets

    # Iterate over each first-level heading
    for major_heading, content_list in data.items():
        if major_heading in level_1_keys and isinstance(content_list, list):
            # Process each item in the content list
            for i, content in enumerate(content_list):
                if isinstance(content, dict):
                    # Iterate over each dictionary
                    for sub_key, sub_value in content.items():
                        if isinstance(sub_value, str):
                            # Apply the split_courses function
                            split_courses_data = split_courses(sub_value)
                            if split_courses_data:
                                content[sub_key] = split_courses_data
                        elif isinstance(sub_value, list):
                            # Process each item in the list
                            content[sub_key] = [split_courses(item) if isinstance(item, str) else item for item in sub_value]
                elif isinstance(content, str):
                    # Apply the split_courses function to strings at the list level
                    split_courses_data = split_courses(content)
                    if split_courses_data:
                        content_list[i] = split_courses_data

    return data

# Assuming 'json_data' is your original JSON data
with open('chunk_1032_1240_new.json', 'r') as file:
    json_data = json.load(file)

reformatted_json = reformat_json(json_data)

# Write the modified JSON to a new file
with open('modified_chunk_1032_1240_new.json', 'w') as file:
    json.dump(reformatted_json, file, indent=4)


In [136]:
import json

def combine_consecutive_strings(json_data):
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            json_data[key] = combine_consecutive_strings(value)
    elif isinstance(json_data, list):
        new_list = []
        accumulated_string = ''
        for item in json_data:
            if isinstance(item, (dict, list)):
                if accumulated_string:
                    new_list.append(accumulated_string.strip())
                    accumulated_string = ''
                new_list.append(combine_consecutive_strings(item))
            elif isinstance(item, str):
                accumulated_string += ' ' + item
            else:
                if accumulated_string:
                    new_list.append(accumulated_string.strip())
                    accumulated_string = ''
                new_list.append(item)
        if accumulated_string:
            new_list.append(accumulated_string.strip())
        return new_list
    return json_data

# Load your JSON data from a file
with open('modified_chunk_1032_1240_new.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

combined_json_data = combine_consecutive_strings(json_data)

# Save the combined JSON data to a new file
with open('modified_chunk_1032_1240_new.json', 'w', encoding='utf-8') as f:
    json.dump(combined_json_data, f, indent=4, ensure_ascii=False)


In [137]:
import json
import re

# List of second-level headings to apply the logic to
second_level_keys = [
    "First-Year Seminar",
    "First-Year Launch"
]

# Updated Regex pattern to match the course code, number, and everything up to the next course number
# The pattern now captures any text after the course number until it encounters another course code and number or the end of the data chunk
course_pattern = re.compile(r'(\b\w{4}\b)\s*(\d{2,3})(.+?)(?=\s*\b\w{4}\b\s*\d{2,3}|$)', re.DOTALL)

def split_courses(courses_text):
    # Find all course entries
    course_entries = course_pattern.findall(courses_text)
    # Sort the courses into buckets based on the course number
    course_buckets = {}
    for code, number, details in course_entries:
        course_number = int(number)
        # Determine the course number range
        range_key = f"{(course_number // 100) * 100} to {((course_number // 100) + 1) * 100 - 1}"
        # Add course string to the correct bucket
        course_description = f"{code} {number}{details.strip()}"
        course_buckets.setdefault(range_key, []).append(course_description)
    return course_buckets

def reformat_recursive(item):
    if isinstance(item, dict):
        # Iterate over each key-value pair in the dictionary
        for key, value in item.items():
            if key in second_level_keys and isinstance(value, str):
                # Apply the split_courses function to strings with course listings under specified headings
                split_courses_data = split_courses(value)
                if split_courses_data:
                    item[key] = split_courses_data
            elif isinstance(value, dict) or isinstance(value, list):
                # Recurse if the value is a dictionary or list
                item[key] = reformat_recursive(value)
    elif isinstance(item, list):
        # If the item is a list, recurse on each element
        return [reformat_recursive(element) for element in item]
    return item

def process_json(data):
    # Apply the reformat_recursive function to the entire JSON data
    return reformat_recursive(data)

# Assuming 'json_data' is your original JSON data
with open('modified_chunk_1032_1240_new.json', 'r') as file:
    json_data = json.load(file)

reformatted_json = process_json(json_data)

# Write the modified JSON to a new file
with open('modified_chunk_1032_1240_new.json', 'w') as file:
    json.dump(reformatted_json, file, indent=4)


In [138]:
import json

def combine_consecutive_strings(json_data):
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            json_data[key] = combine_consecutive_strings(value)
    elif isinstance(json_data, list):
        new_list = []
        accumulated_string = ''
        for item in json_data:
            if isinstance(item, (dict, list)):
                if accumulated_string:
                    new_list.append(accumulated_string.strip())
                    accumulated_string = ''
                new_list.append(combine_consecutive_strings(item))
            elif isinstance(item, str):
                accumulated_string += ' ' + item
            else:
                if accumulated_string:
                    new_list.append(accumulated_string.strip())
                    accumulated_string = ''
                new_list.append(item)
        if accumulated_string:
            new_list.append(accumulated_string.strip())
        return new_list
    return json_data

# Load your JSON data from a file
with open('modified_chunk_1032_1240_new.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

combined_json_data = combine_consecutive_strings(json_data)

# Save the combined JSON data to a new file
with open('modified_chunk_1032_1240_new.json', 'w', encoding='utf-8') as f:
    json.dump(combined_json_data, f, indent=4, ensure_ascii=False)


In [141]:
import json
import re

def reformat_json(data):
    # Regex pattern to match course prefixes, numbers, and descriptions
    course_pattern = re.compile(r'\b(\w+)\s(\d{3})\.\s{2,}(.*?)(?=\s{2,}\w+\s\d{3}\.|\Z)', re.DOTALL)

    def split_courses(courses_text):
        # Find all course entries
        course_entries = course_pattern.findall(courses_text)
        # Sort the courses into buckets based on the course number
        course_buckets = {}
        for prefix, number, description in course_entries:
            course_number = int(number)
            # Determine the course number range
            range_key = f"{(course_number // 100) * 100} to {((course_number // 100) + 1) * 100 - 1}"
            # Add course string to the correct bucket
            course_buckets.setdefault(range_key, []).append(f"{prefix} {number}. {description.strip()}")
        return course_buckets

    # Recursive function to reformat the JSON data
    def reformat_recursive(item):
        if isinstance(item, dict):
            keys_to_delete = []
            new_items = {}
            # If the item is a dictionary, iterate over keys and values
            for key, value in item.items():
                keys = ["Advanced Undergraduate and Graduate-level Courses", "Undergraduate-level Courses", "AMST–American Studies", "CHER–Cherokee",
                "FOLK–Folklore", "ARTH–Art History", "EMES–Earth, Marine, and Environmental Sciences", "CMPL–Comparative Literature", "ENGL–English",
                "CHWA–Chichewa", "LGLA–Lingala", "SWAH–Kiswahili", "WOLO–Wolof", "YORU–Yoruba", "ARTS–Studio Art", "Undergraduate-level", 
                "Advanced Undergraduate and Graduate-level", "CBIO–Cell and Development Biology", "PHYI–Physiology", "CLAR–Classical Archaeology",
                "CLAS–Classics in English/Classical Civilization", "GREK–Greek", "LATN–Latin", "GEOL–Geological Sciences", "MASC–Marine Science",
                "EXSS–Exercise and Sport Science", "LFIT–Lifetime Fitness", "PHYA–Physical Activity", "GSLL–Germanic and Slavic Languages and Literatures",
                "DTCH–Dutch", "GERM–German", "BCS–Bosnian, Croatian, and Serbian", "CZCH–Czech", "HUNG–Hungarian", "MACD–Macedonian", "PLSH–Polish", "RUSS–Russian",
                "SLAV–Slavic", "LING–Linguistics", "Yucatec Maya (MAYA)", "ASTR–Astronomy", "PHYS–Physics", "NSCI-Neuroscience", "PSYC–Psychology", "PORT–Portuguese",
                "FREN–French", "ITAL–Italian", "ROML–Romance Languages", "SPAN–Spanish", "AAAD–African, African American, and Diaspora Studies"]
                if key in keys and isinstance(value, str):
                    # Process and split the courses
                    split_courses_data = split_courses(value)
                    # Add new keys with split courses to new_items
                    for range_key, courses in split_courses_data.items():
                        new_key = f"{key} {range_key}"
                        new_items[new_key] = courses
                    # Mark the original key for deletion
                    keys_to_delete.append(key)
                else:
                    # Recurse into nested dictionaries or lists
                    item[key] = reformat_recursive(value)
            # Delete the original course keys
            for key in keys_to_delete:
                del item[key]
            # Update the dictionary with the new items
            item.update(new_items)
        elif isinstance(item, list):
            # If the item is a list, recurse on each element
            return [reformat_recursive(element) for element in item]
        return item

    return reformat_recursive(data)


# Assuming 'json_data_example' is your original JSON data
with open('chunk_30_1032_new.json', 'r') as file:
    json_data = json.load(file)

reformatted_json = reformat_json(json_data)

with open('modified_chunk_30_1032_new.json', 'w') as file:
    json.dump(reformatted_json, file, indent=4)


In [143]:
import json

def combine_consecutive_strings(json_data):
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            json_data[key] = combine_consecutive_strings(value)
    elif isinstance(json_data, list):
        new_list = []
        accumulated_string = ''
        for item in json_data:
            if isinstance(item, (dict, list)):
                if accumulated_string:
                    new_list.append(accumulated_string.strip())
                    accumulated_string = ''
                new_list.append(combine_consecutive_strings(item))
            elif isinstance(item, str):
                accumulated_string += ' ' + item
            else:
                if accumulated_string:
                    new_list.append(accumulated_string.strip())
                    accumulated_string = ''
                new_list.append(item)
        if accumulated_string:
            new_list.append(accumulated_string.strip())
        return new_list
    return json_data

# Load your JSON data from a file
with open('modified_chunk_1032_1240_new.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

combined_json_data = combine_consecutive_strings(json_data)

# Save the combined JSON data to a new file
with open('modified_chunk_1032_1240_new.json', 'w', encoding='utf-8') as f:
    json.dump(combined_json_data, f, indent=4, ensure_ascii=False)


In [64]:
import json
import ftfy

def clean_json_data(data):
    """Recursively clean the JSON data using ftfy."""
    if isinstance(data, str):
        return ftfy.fix_text(data)
    elif isinstance(data, list):
        return [clean_json_data(item) for item in data]
    elif isinstance(data, dict):
        return {key: clean_json_data(value) for key, value in data.items()}
    else:
        return data

def clean_json_file_with_ftfy(file_path):
    # Load the JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Recursively clean the data
    cleaned_data = clean_json_data(data)

    # Save the cleaned data back to the JSON file
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=4)

# Path to your output.json file
file_path = "modified_chunk_30_1032_new.json"
clean_json_file_with_ftfy(file_path)


In [3]:
import json

class HierarchicalChunkProcessor:
    def __init__(self):
        self.data = {}

    def load_from_json_files(self, json_paths):
        for json_path in json_paths:
            with open(json_path, 'r') as file:
                data_chunk = json.load(file)
                self.data.update(data_chunk)
                print(f"Loaded data from {json_path}. Current number of keys in data: {len(self.data)}")

    def format_chunks(self):
        results = []
        for heading, sub_content_list in self.data.items():
            # Process each item in the sub_content_list
            for item in sub_content_list:
                # Check if the item is a dictionary with subheadings
                if isinstance(item, dict):
                    for sub_key, sub_values in item.items():
                        # If sub_values is a list, it might contain more dictionaries
                        if isinstance(sub_values, list):
                            for value in sub_values:
                                # If the value is a dictionary, it's another subheading
                                if isinstance(value, dict):
                                    for third_level_key, third_level_values in value.items():
                                        if isinstance(third_level_values, list):
                                            # If the third-level values are a list, iterate and format each one
                                            for val in third_level_values:
                                                results.append(f"{heading} - {sub_key} - {third_level_key}: {val}")
                                        else:
                                            # If it's a single value, format it directly
                                            results.append(f"{heading} - {sub_key} - {third_level_key}: {third_level_values}")
                                else:
                                    # If it's just a string, append it with its heading and subheading
                                    results.append(f"{heading} - {sub_key}: {value}")
                        else:
                            # If sub_values is just a string, append it with its heading
                            results.append(f"{heading} - {sub_key}: {sub_values}")
                else:
                    # If the item is just a string, append it with its main heading
                    results.append(f"{heading}: {item}")
        return results

file_list = [
    'all_jsons/chunk_7_10.json', 'all_jsons/chunk_10_13.json', 'all_jsons/chunk_15_17.json', 'all_jsons/chunk_17_19.json', 
    'all_jsons/chunk_20_29.json', 'all_jsons/modified_chunk_30_1032_new.json', 'all_jsons/modified_chunk_1032_1240_new.json', 'all_jsons/chunk_1240_1244.json',
    'all_jsons/chunk_1245_1251.json', 'all_jsons/chunk_1251_1252.json', 'all_jsons/chunk_1252_1254.json', 
    'all_jsons/chunk_1254_1255.json', 'all_jsons/chunk_1255_1256.json', 'all_jsons/chunk_1256_1264.json', 
    'all_jsons/chunk_1264_1270.json', 'all_jsons/chunk_1270_1276.json', 'all_jsons/chunk_1276_1278.json', 
    'all_jsons/chunk_1278_1280.json', 'all_jsons/chunk_1280_1286.json', 'all_jsons/chunk_1286_1287.json', 
    'all_jsons/chunk_1287_1290.json', 'all_jsons/chunk_1290_1291.json'
]  
processor = HierarchicalChunkProcessor()
processor.load_from_json_files(file_list)
formatted_chunks = processor.format_chunks()

Loaded data from all_jsons/chunk_7_10.json. Current number of keys in data: 10
Loaded data from all_jsons/chunk_10_13.json. Current number of keys in data: 25
Loaded data from all_jsons/chunk_15_17.json. Current number of keys in data: 28
Loaded data from all_jsons/chunk_17_19.json. Current number of keys in data: 31
Loaded data from all_jsons/chunk_20_29.json. Current number of keys in data: 57
Loaded data from all_jsons/modified_chunk_30_1032_new.json. Current number of keys in data: 279
Loaded data from all_jsons/modified_chunk_1032_1240_new.json. Current number of keys in data: 344
Loaded data from all_jsons/chunk_1240_1244.json. Current number of keys in data: 358
Loaded data from all_jsons/chunk_1245_1251.json. Current number of keys in data: 370
Loaded data from all_jsons/chunk_1251_1252.json. Current number of keys in data: 371
Loaded data from all_jsons/chunk_1252_1254.json. Current number of keys in data: 376
Loaded data from all_jsons/chunk_1254_1255.json. Current number of 

In [4]:
formatted_chunks

['The University Catalog: content',
 'Accreditation: content',
 'Mission Statement: The University of North Carolina at Chapel Hill: content',
 'UNC’s Commitment to Diversity and Inclusivity: content',
 'Policy on Prohibited Discrimination, Harassment and Related Misconduct Including Sexual and Gender-Based Harassment, Sexual Violence, Interpersonal Violence and Stalking ( https://policies.unc.edu/TDClient/2833/ Portal/Shared/Search/?c=all&s=Policy +on+Prohibited+Discrimination%2C +Harassment+and+Related+Misconduct ): content',
 'Policy Statement on Nondiscrimination: Educational and Employment Decisions: content',
 'Resources for Information and Assistance: content',
 'Reporting Options: content',
 'Conﬁdential Resources: content',
 'Graduation Rate: content',
 'Ofﬁce of the Chancellor: content',
 'Ofﬁce of the Provost: content',
 'College of Arts and Sciences: content',
 'Ofﬁce of Undergraduate Education: content',
 'School Deans: content',
 'Finance and Operations: content',
 'Human

In [5]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def is_within_limit(text):
    tokens = tokenizer.tokenize(text)
    return len(tokens) <= 8192

for r in formatted_chunks:
    if is_within_limit(r) == False:
        print(r)


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


First-Year Seminar and First-Year Launch (FY-SEMINAR) (FY-LAUNCH) - First-Year Seminar: {'0 to 99': ["AAAD 50First-Year Seminar: Defining Blackness 3 AAAD 51First-Year Seminar: Masquerades of Blackness 3 AAAD 53First-Year Seminar: Experimentalism in Global Black Music and Performance Arts 3 AAAD 54First-Year Seminar: African Migrations, Boundaries, Displacements, and Belonging 3 AAAD 55First-Year Seminar: Youth Activism, Citizenship, and Social Change in Africa 3 AAAD 58First-Year Seminar: Health Inequality in Africa and the African Diaspora 3 AAAD 89First Year Seminar: Special Topics 3 AMST 51First-Year Seminar: Navigating America 3 AMST 53First-Year Seminar: The Family and Social Change in AmericaH3 AMST 54First-Year Seminar: The Indians' New Worlds: Southeastern Histories AMST 55First-Year Seminar: Birth and Death in the United StatesH3 AMST 59First-Year Seminar: American Indian Art in the 20th Century 3 AMST 60First-Year Seminar: American Indians in History, Law, and Literature 3 A

In [9]:
weaviate_url = "https://ramvisor-4qeivit4.weaviate.network"
weaviate_key = "TJzjA9UR3jRWUTmI6qWkpt8tgmmzyvfUUEpY"

In [15]:
import weaviate
import json
import requests
import time
from weaviate.exceptions import UnexpectedStatusCodeException

def robust_batch_add(batch, attempts=5, sleep_time=5):
    """
    Attempt to flush the batch with retries on UnexpectedStatusCodeException.

    :param batch: The batch object from Weaviate client.
    :param attempts: Number of attempts to make.
    :param sleep_time: Time to sleep between attempts in seconds.
    """
    for attempt in range(attempts):
        try:
            batch.flush()  # Or any batch operation that needs retry
            break  # Success, exit the loop
        except UnexpectedStatusCodeException as e:
            if attempt < attempts - 1:  # Avoid sleeping on the last attempt
                print(f"Attempt {attempt + 1} failed with error: {e}. Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
            else:
                print("Final attempt failed. Raising exception.")
                raise  # Re-raise the exception after the last attempt


def vector_search(filtered_text_chunks):
    # Define the schema with optional properties for subheading and final_subheading, and add combined_headings
    class_obj = {
        "class": "TextChunk",
        "vectorizer": "text2vec-openai",
        "properties": [
            {"name": "main_heading", "dataType": ["string"], "indexInverted": True},
            {"name": "subheading", "dataType": ["string"], "indexInverted": True, "optional": True},
            {"name": "final_subheading", "dataType": ["string"], "indexInverted": True, "optional": True},
            {"name": "content", "dataType": ["text"], "indexInverted": True},
            {"name": "combined_headings", "dataType": ["string"], "indexInverted": True}  # New field for combined headings
        ],
        "moduleConfig": {
            "text2vec-openai": {},
            "generative-openai": {}
        }
    }

    auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_key)


    client = weaviate.Client(
        url=weaviate_url,
        auth_client_secret= auth_config,
        additional_headers={
            "X-OpenAI-Api-Key": "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",
        }
    )

    # Check if the class exists and delete it if it does
    if client.schema.exists("TextChunk"):
        client.schema.delete_class("TextChunk")

    # Create the new class with the combined_headings field
    client.schema.create_class(class_obj)

    def parse_text_chunk(text_chunk):
        # Find the index of the first colon
        colon_index = text_chunk.find(':')
        # If there's no colon, assume the entire chunk is a heading
        if colon_index == -1:
            headings_combined = text_chunk.strip()
        else:
            # Extract everything before the first colon
            headings_combined = text_chunk[:colon_index].strip()
        
        # Split by dashes to get individual headings
        headings_parts = headings_combined.split(" - ")

        # Initialize the dictionary with combined headings
        headings = {
            "main_heading": None,
            "subheading": None,
            "final_subheading": None,
            "combined_headings": headings_combined
        }
        
        # Dynamically assign the headings based on their count
        if headings_parts:
            headings["main_heading"] = headings_parts[0].strip()
        if len(headings_parts) > 1:
            headings["subheading"] = headings_parts[1].strip()
        if len(headings_parts) > 2:
            headings["final_subheading"] = " - ".join(headings_parts[2:]).strip()  # Join any remaining headings

        content = text_chunk[colon_index + 1:].strip() if colon_index != -1 else ""

        # Only return the headings without content
        return {
            "main_heading": headings["main_heading"],
            "subheading": headings["subheading"],
            "final_subheading": headings["final_subheading"],
            "combined_headings": headings["combined_headings"],
            "content": content
        }

    # Batch import all text chunks
    client.batch.configure(batch_size=200)

    # Adapted batch import to use combine_chunks function and include combined_headings
    with client.batch as batch:
        for i, combined_chunk in enumerate(filtered_text_chunks):
            # print(f"Processing combined text chunk: {i+1}")
            properties = parse_text_chunk(combined_chunk)  # Adapt the parse function if necessary

            # Debug: Print properties to ensure content is not too long
            # print("Properties being sent to Weaviate:", properties)

            try:
                batch.add_data_object(
                    data_object=properties,
                    class_name="TextChunk",
                )
            except Exception as e:
                print(f"Failed to import chunk {i+1}: {e}")
        robust_batch_add(batch)

vector_search(formatted_chunks)

Failed to import chunk 855: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 932: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1009: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1086: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1163: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1240: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1317: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1394: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1471: batch response! Unexpected status code: 502, with response body: None.
Failed to import chunk 1548: batch response! Unexpected status code: 502, with response body: None.
Fa


KeyboardInterrupt



In [16]:
import weaviate

auth_config = weaviate.AuthApiKey(api_key=weaviate_key)

client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers={
        "X-OpenAI-Api-Key": "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",
    }
)

# Define your search query
query = {
    "concepts": ["biology"],
    "properties": ["main_heading", "subheading", "final_subheading"]  # Properties you want to search within
}

# Perform the vector search
result = client.query.get("TextChunk", ["main_heading", "subheading", "final_subheading", "content"]).with_near_text(query).do()

# # Process the results
for hit in result['data']['Get']['TextChunk']:
    # Access the properties of each hit as needed, for example:
    print(hit['main_heading'], hit['subheading'], hit['final_subheading'], hit['content'])




Biology Major, B.S. (BIOL) None None Biology is the study of life from both basic and applied perspectives across a broad range of analytical levels, from the molecule and cell to the organism and ecosystem. This program is designed for students who intend to continue graduate study in biological or health sciences.
Biology Major, B.A. (BIOL) None None Biology is the study of life from both basic and applied perspectives across a broad range of analytical levels, from the molecule and cell to the organism and ecosystem. The major in biology provides a broad education directed toward an appreciation of the complexity of nature and prepares students for careers in the biological, environmental, and medical sciences. This program is designed to provide greater flexibility than the B.S. degree in meeting broad student interests.
Department of Biology (BIOL) (BS, BA, Minor) Introduction None Biology is the study of life from both basic and applied perspectives across a broad range of analyt