In [1]:
%%pyspark
df = spark.read.load('abfss://precon@synapseinaday.dfs.core.windows.net/demo2/NYTPARQUET.parquet', format='parquet')
display(df.limit(10))

StatementMeta(SparkySpark, 0, 1, Finished, Available)

SynapseWidget(Synapse.DataFrame, c42cf7e0-4125-48c2-a413-6637b19eeaea)

<h1>Using PyArrow we can analyze one specific file</h1>

In [2]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import pyarrow.parquet as pq
from io import BytesIO
import os

# Blob to inspect: one part file stored in the "taxi" container.
filename = "part-00001-82448f9c-6320-4d40-a5e2-a4e44ff498a3-c000.snappy.parquet"
container_name = "taxi"

# The connection string embeds the storage account key, so it must never be
# saved in the notebook. It is read from the environment instead; the demo
# originally targeted the storage account "sqlstijntraining".
CONNECT_STR = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not CONNECT_STR:
    raise RuntimeError(
        "Set the AZURE_STORAGE_CONNECTION_STRING environment variable to the "
        "storage account connection string before running this cell."
    )

# Walk down from the account to the container to the single blob.
blob_service_client = BlobServiceClient.from_connection_string(CONNECT_STR)
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(filename)
streamdownloader = blob_client.download_blob()

# Download the blob into an in-memory buffer and rewind it, so the reader
# starts from the beginning of the file instead of the end-of-write position.
stream = BytesIO()
streamdownloader.download_to_stream(stream)
stream.seek(0)

# Open the buffered bytes as a ParquetFile; the following cells inspect its
# metadata, row groups, schema and column statistics.
FinallyPyArrowDF = pq.ParquetFile(source=stream)


StatementMeta(SparkySpark, 0, 2, Finished, Available)

<h1>We use a data stream to load the specific file into a PyArrow ParquetFile object</h1>

In [3]:
# File-level metadata of the opened Parquet file; the bare expression is
# shown as the cell output (creator, column/row counts, row groups, version).
FinallyPyArrowDF.metadata

StatementMeta(SparkySpark, 0, 3, Finished, Available)

<pyarrow._parquet.FileMetaData object at 0x7ff58dcb8688>
  created_by: parquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)
  num_columns: 20
  num_rows: 8620012
  num_row_groups: 2
  format_version: 1.0
  serialized_size: 5965

<h1>As you can see, we can find the structure of the data body by reading the footer metadata of the file through PyArrow's <code>metadata</code> attribute</h1>

In [6]:
# Metadata of the first row group (row groups are indexed from 0):
# its number of columns, number of rows and total byte size.
FinallyPyArrowDF.metadata.row_group(0)

StatementMeta(SparkySpark, 0, 6, Finished, Available)

<pyarrow._parquet.RowGroupMetaData object at 0x7ff570cde630>
  num_columns: 20
  num_rows: 5200100
  total_byte_size: 182255572

<h1>We can look at a specific row group (the index starts at 0) and see its number of columns, number of rows and total size in bytes</h1>

In [15]:
# Parquet schema of the file: one line per column with its repetition,
# physical type, field id, name and (where present) logical type.
FinallyPyArrowDF.schema

StatementMeta(SparkySpark, 0, 15, Finished, Available)

<pyarrow._parquet.ParquetSchema object at 0x7ff570cf8be0>
required group field_id=0 spark_schema {
  optional int32 field_id=1 Passenger_Count;
  optional double field_id=2 Trip_Distance;
  optional boolean field_id=3 store_and_forward;
  optional int32 field_id=4 Payment_Type;
  optional int64 field_id=5 mta_tax (Decimal(precision=10, scale=2));
  optional int32 field_id=6 vendorID;
  optional int32 field_id=7 PULocationID;
  optional int32 field_id=8 DOLocationID;
  optional int96 field_id=9 tpep_pickup_datetime;
  optional int96 field_id=10 tpep_dropoff_datetime;
  optional int32 field_id=11 RatecodeID;
  optional int64 field_id=12 Fare_amount (Decimal(precision=10, scale=2));
  optional double field_id=13 Extra;
  optional int64 field_id=14 Tip_amount (Decimal(precision=10, scale=2));
  optional int64 field_id=15 Tolls_amount (Decimal(precision=10, scale=2));
  optional int64 field_id=16 Total_amount (Decimal(precision=10, scale=2));
  optional int64 field_id=17 Improvement_surchar

<h1>We can read the schema and see the data types and the kinds of fields using the <code>schema</code> attribute</h1>

In [8]:
# Column-chunk metadata for column index 6 of row group 0 (PULocationID in
# this file): offsets, physical type, encodings, compression and statistics.
FinallyPyArrowDF.metadata.row_group(0).column(6)

StatementMeta(SparkySpark, 0, 8, Finished, Available)

<pyarrow._parquet.ColumnChunkMetaData object at 0x7ff570cde510>
  file_offset: 13098788
  file_path: 
  physical_type: INT32
  num_values: 5200100
  path_in_schema: PULocationID
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x7ff570cde480>
      has_min_max: True
      min: 2
      max: 3
      null_count: 0
      distinct_count: 0
      num_values: 5200100
      physical_type: INT32
      logical_type: None
      converted_type (legacy): NONE
  compression: SNAPPY
  encodings: ('RLE', 'BIT_PACKED', 'PLAIN_DICTIONARY')
  has_dictionary_page: False
  dictionary_page_offset: None
  data_page_offset: 13098788
  total_compressed_size: 1251
  total_uncompressed_size: 1209

<h1>We can also look at the column chunks and find out the file offset, physical type, compression type, etc.</h1>

In [12]:
# Only the statistics of column index 6 in row group 0:
# min/max, null count, distinct count and number of values.
FinallyPyArrowDF.metadata.row_group(0).column(6).statistics

StatementMeta(SparkySpark, 0, 12, Finished, Available)

<pyarrow._parquet.Statistics object at 0x7ff570cc8240>
  has_min_max: True
  min: 2
  max: 3
  null_count: 0
  distinct_count: 0
  num_values: 5200100
  physical_type: INT32
  logical_type: None
  converted_type (legacy): NONE

<h1>We can also focus on the statistics (if they were written) of a column chunk, which show us the min and max values, null count, etc.</h1>

In [9]:
# Column-chunk metadata for column index 1 of row group 0 (Trip_Distance in
# this file), for comparison with the PULocationID chunk inspected earlier.
FinallyPyArrowDF.metadata.row_group(0).column(1)

StatementMeta(SparkySpark, 0, 9, Finished, Available)

<pyarrow._parquet.ColumnChunkMetaData object at 0x7ff570cde6c0>
  file_offset: 1883426
  file_path: 
  physical_type: DOUBLE
  num_values: 5200100
  path_in_schema: Trip_Distance
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x7ff570cdefc0>
      has_min_max: True
      min: 0.0
      max: 6805400.0
      null_count: 0
      distinct_count: 0
      num_values: 5200100
      physical_type: DOUBLE
      logical_type: None
      converted_type (legacy): NONE
  compression: SNAPPY
  encodings: ('RLE', 'BIT_PACKED', 'PLAIN_DICTIONARY')
  has_dictionary_page: False
  dictionary_page_offset: None
  data_page_offset: 1883426
  total_compressed_size: 7830206
  total_uncompressed_size: 7843705