In [1]:

# The objective of this notebook is to practice data lake access and Git integration.

import io
import sys
import os

import numpy as np
import pandas as pd
import seaborn as sns

from azure.identity import AzureCliCredential
from azure.storage.filedatalake import DataLakeServiceClient, DataLakeFileClient

In [2]:
# Connection setup: this cell must be run before any file can be read or written.
# It only needs to be run once per Python script.
storage_account = "datalakestoragenfsk522vz"
file_system = "datalakegen2filesystem"

# Endpoint of the storage account that the service client talks to.
account_url = f"https://{storage_account}.dfs.core.windows.net/"

# Credential object handed to the service client for authentication.
cred = AzureCliCredential()

# Service-level client; the cells below derive their per-file clients from it.
service_client = DataLakeServiceClient(account_url, credential=cred)

In [3]:
# Confirm that the file we want exists (i.e. that the path is correct).
iris_csv_path = "analytics/auto/commercial/00001_TestConnectivity/iris.csv"
new_file_client = service_client.get_file_client(file_system, iris_csv_path)

# Last expression of the cell, so the boolean result is displayed.
new_file_client.exists()

True

In [4]:
# Ingest a simple .csv (no unusual delimiters involved).
# Step 1: pull the whole file into memory as bytes.
iris_csv_bytes = new_file_client.download_file().readall()

# Step 2: wrap the bytes in a file-like buffer so pandas can parse them.
test_iris = pd.read_csv(io.BytesIO(iris_csv_bytes))
test_iris.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,1,5.1,3.5,1.4,0.2,setosa
1,2,4.9,3.0,1.4,0.2,setosa
2,3,4.7,3.2,1.3,0.2,setosa
3,4,4.6,3.1,1.5,0.2,setosa
4,5,5.0,3.6,1.4,0.2,setosa


In [5]:
# Ingest a Parquet file: point the file client at the .parquet path and
# switch to the matching pandas reader.
iris_parquet_path = "analytics/auto/commercial/00001_TestConnectivity/iris.parquet"
new_file_client = service_client.get_file_client(file_system, iris_parquet_path)

# Download the full file into memory, then parse it from an in-memory buffer.
iris_parquet_bytes = new_file_client.download_file().readall()
test1_iris = pd.read_parquet(io.BytesIO(iris_parquet_bytes))
test1_iris.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,1,5.1,3.5,1.4,0.2,setosa
1,2,4.9,3.0,1.4,0.2,setosa
2,3,4.7,3.2,1.3,0.2,setosa
3,4,4.6,3.1,1.5,0.2,setosa
4,5,5.0,3.6,1.4,0.2,setosa


In [6]:
# Practice writing to the data lake: the target path includes the format
# extension, and pandas produces the file content.
iris_csv_out_path = "analytics/auto/commercial/00001_TestConnectivity/iris_leigh.csv"
new_file_client = service_client.get_file_client(file_system, iris_csv_out_path)

# Serialise the frame to CSV text (without the index) and upload it,
# replacing any existing file at that path.
iris_csv_text = test_iris.to_csv(index=False)
new_file_client.upload_data(iris_csv_text, overwrite=True)

{'date': datetime.datetime(2023, 4, 13, 18, 4, 33, tzinfo=datetime.timezone.utc),
 'etag': '"0x8DB3C4989513979"',
 'last_modified': datetime.datetime(2023, 4, 13, 18, 4, 33, tzinfo=datetime.timezone.utc),
 'content_length': 0,
 'client_request_id': 'a44f4d70-da25-11ed-ba0a-000d3a041eaa',
 'request_id': '19d0391a-801f-001a-7332-6e86ac000000',
 'version': '2021-12-02',
 'request_server_encrypted': False,
 'encryption_key_sha256': None,
 'lease_renewed': None}

In [8]:
# Repeat the upload using a different storage format (.parquet).
iris_parquet_out_path = "analytics/auto/commercial/00001_TestConnectivity/iris_leigh.parquet"
new_file_client = service_client.get_file_client(file_system, iris_parquet_out_path)

# With no path argument, to_parquet() hands back the serialised content,
# which is uploaded as-is, replacing any existing file at that path.
iris_parquet_payload = test1_iris.to_parquet(index=False)
new_file_client.upload_data(iris_parquet_payload, overwrite=True)

{'date': datetime.datetime(2023, 4, 13, 18, 6, 56, tzinfo=datetime.timezone.utc),
 'etag': '"0x8DB3C49DE97C0F2"',
 'last_modified': datetime.datetime(2023, 4, 13, 18, 6, 56, tzinfo=datetime.timezone.utc),
 'content_length': 0,
 'client_request_id': 'f9aaa2d8-da25-11ed-ba0a-000d3a041eaa',
 'request_id': '19d04436-801f-001a-1432-6e86ac000000',
 'version': '2021-12-02',
 'request_server_encrypted': False,
 'encryption_key_sha256': None,
 'lease_renewed': None}

Note that both files are now in the data lake at the specified path; you can use the explorer to verify this.