# Inspect Parquet

In [6]:
import pyarrow.parquet as pq
import pandas as pd

def inspect_parquet(parquet_path, num_rows=5):
    """
    Print a summary of a local Parquet file and export its first rows to CSV.

    The file is loaded with ``pq.read_table``; the schema, the total row
    count and the column names are printed, followed by the first
    ``num_rows`` rows. Those rows are also written (without the index) to a
    CSV file in the same directory as the input:

    * ``<name>_head.csv`` when the input name ends in ``.parquet``
      (any letter case), e.g. ``data/train.parquet`` -> ``data/train_head.csv``;
    * ``<full input name>_head.csv`` otherwise, so the input file itself is
      never used as the CSV destination.

    Args:
        parquet_path: Location of the Parquet file, as a string or an
            ``os.PathLike`` object.
        num_rows: Number of leading rows to print and export (default 5).
    """
    import os  # local import so this function does not rely on the file header

    # Work on a plain string so pathlib.Path arguments are accepted as well.
    source_path = os.fspath(parquet_path)

    print(f"Reading Parquet file: {source_path}")
    table = pq.read_table(source_path)
    print(f"Schema:\n{table.schema}\n")

    df = table.to_pandas()
    print(f"Total rows: {len(df)}")
    print(f"Columns: {df.columns.tolist()}\n")

    # Take the preview once and reuse it for both the printout and the CSV.
    preview = df.head(num_rows)
    print(f"First {num_rows} rows:\n")
    print(preview)

    # Only the extension of the final path component is swapped. A plain
    # str.replace would also rewrite ".parquet" inside directory names and,
    # when the substring is absent, would leave the path unchanged and make
    # to_csv overwrite the Parquet file.
    stem, extension = os.path.splitext(source_path)
    if extension.lower() == ".parquet":
        output_csv_path = f"{stem}_head.csv"
    else:
        output_csv_path = f"{source_path}_head.csv"

    preview.to_csv(output_csv_path, index=False)
    print(f"Saved first {num_rows} rows to CSV: {output_csv_path}")


# Example usage: point these at the Parquet shards on your own machine
parquet_file_1 = "/home/s2652867/llm-self-play-liquidhaskell/data/jtatman-python-code-dataset-500k/train-00000-of-00002.parquet"
parquet_file_2 = "/home/s2652867/llm-self-play-liquidhaskell/data/jtatman-python-code-dataset-500k/train-00001-of-00002.parquet"

# Inspect each shard in order, printing a divider line between reports
for _shard_index, _shard_path in enumerate((parquet_file_1, parquet_file_2)):
    if _shard_index > 0:
        print("\n" + "="*50 + "\n")
    inspect_parquet(_shard_path, num_rows=5)



Reading Parquet file: /home/s2652867/llm-self-play-liquidhaskell/data/jtatman-python-code-dataset-500k/train-00000-of-00002.parquet
Schema:
output: string
instruction: string
system: string
-- schema metadata --
huggingface: '{"info": {"features": {"output": {"dtype": "string", "_type' + 117

Total rows: 279758
Columns: ['output', 'instruction', 'system']

First 5 rows:

                                              output  \
0  Here is an example of a nested loop in Python ...   
1  The given problem can be solved by iterating t...   
2  Here's an example of code that attempts to sol...   
3  Here is an implementation of the function in P...   
4  Here's a possible implementation of the method...   

                                         instruction  \
0  Create a nested loop to print every combinatio...   
1  Write a function to find the number of distinc...   
2  Write code that removes spaces and punctuation...   
3  Write a function that checks if a given number...   
4  Write 

# End