# Load dataset in Datasets folder

In [25]:
from datasets import load_dataset

cache_directory  = "../Datasets"

# Now, load_dataset will use the specified cache directory
# It will automatically download from huggingface if the data is not available locally.
ds = load_dataset("microsoft/ms_marco", "v2.1", cache_dir=cache_directory)

## Examine data

In [31]:
print(ds.keys())
print(ds['train'])
print(ds['test'])
print(ds['validation'])

data = ds['validation']

dict_keys(['validation', 'train', 'test'])
Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 808731
})
Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 101092
})
Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 101093
})


## Preparation for conversion

In [60]:
# Number of rows
total_rows = len(data)  # Should be 808731

n = 1 # Number of files. If d = 1, n=80 is the best split. Specifically, n*d >= 80 is best.

d = 1000 # Scale down the number of data points (i.e. total_rows divided by d). It must be scaled down if not it will be too expensive in terms of time and money.

# Calculate how many rows each file should contain
rows_per_file = total_rows // (d*n)  # Floor division

input_files_dir = "GraphRAG/input/"
input_files = []

for i in range(n):
    input_files.append(f"{input_files_dir}dataset{i}.txt")

print(input_files)

['GraphRAG/input/dataset0.txt']


## Conversion to .txt

In [61]:
# Open all files ahead of time
files = [open(input_file, "w") for input_file in input_files]


try:
    # Iterate over the entire dataset row by row
    for idx, row in enumerate(data):
        # Determine which file to write to based on the row index
        file_index = idx // rows_per_file  # Integer division to get the file index (0-3)

        if idx > total_rows / d:
            print(f"number of rows converted: {idx-1}")
            break

        # Ensure the last part takes the remaining rows
        if file_index >= n:
            file_index = n-1

        # Write to the correct file
        f = files[file_index]

        # Extract the query, answers, and passages
        query = row['query']
        query_id = row['query_id']
        query_type = row['query_type']
        answers = row['answers']  # Answers
        wf_answers = row['wellFormedAnswers']  # Well-formed Answers
        passages = row['passages']  # Passages relevant to the query

        # Structure: Query followed by answers and passages
        f.write(f"Query: {query}\n")
        f.write(f"Query_id: {query_id}\n")
        f.write(f"Query_type: {query_type}\n")
        f.write(f"Answers: {' | '.join(answers)}\n")
        f.write(f"Well_Formed_Answers: {' | '.join(wf_answers)}\n")

        for p in passages:
            # Convert all elements to string and join them
            if p == 'passage_text':
                f.write(f"{p}: " + ' '.join(map(str, passages[p])) + "\n")
            elif p == 'is_selected':
                f.write(f"{p}: " + ', '.join(map(str, passages[p])) + "\n")
            elif p == 'url':
                f.write(f"{p}: " + ', '.join(map(str, passages[p])) + "\n")

        f.write("\n" + "-"*10 + "\n")  # Add a separator between entries
finally:
    # Close all files
    for f in files:
        f.close()

print(f"Data successfully split into {n} files!")

number of rows converted: 101
Data successfully split into 1 files!


In [65]:
# Open the file in read mode
input_file = "GraphRAG/input/dataset0.txt"

# Open the file in read mode
with open(input_file, 'r') as file:
    # Read the first N lines (e.g., 5 lines)
    for i in range(10):
        print(file.readline().strip())


Query: . what is a corporation?
Query_id: 1102432
Query_type: DESCRIPTION
Answers: A corporation is a company or group of people authorized to act as a single entity and recognized as such in law.
Well_Formed_Answers:
is_selected: 0, 0, 0, 0, 0, 1, 0, 0, 0, 0
passage_text: A company is incorporated in a specific nation, often within the bounds of a smaller subset of that nation, such as a state or province. The corporation is then governed by the laws of incorporation in that state. A corporation may issue stock, either private or public, or may be classified as a non-stock corporation. If stock is issued, the corporation will usually be governed by its shareholders, either directly or indirectly. Today, there is a growing community of more than 2,100 Certified B Corps from 50 countries and over 130 industries working together toward 1 unifying goal: to redefine success in business. Join the Movement Corporation definition, an association of individuals, created by law or under authori

In [64]:
# For comparison with dataset0.txt
data[101]

{'answers': ['No Answer Present.'],
 'passages': {'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'passage_text': ["Average Administrative Assistant Salaries. The average salary for administrative assistant jobs is $39,000. Average administrative assistant salaries can vary greatly due to company, location, industry, experience and benefits.This salary was calculated using the average salary for all jobs with the term administrative assistant anywhere in the job listing.his free salary calculator uses salary data from millions of job listings indexed by Simply Hired's job search engine. The administrative assistant salary comparisons feature allows job seekers to get the information required to make important career decisions and negotiate salary ranges.",
   "For retirement, eligibility is at age 62. If a person reaches age 62 in 2015, for example, then 2015 is the person's year of eligibility. We always index an individual's earnings to the average wage level two years prior to the 