### CONFIGURATION ###

In [11]:
from dotenv import load_dotenv
import os

# Carregar o arquivo .env
load_dotenv()

# Configurações
account_name = os.getenv("account_name")
account_key = os.getenv("account_key")
file_system_name = os.getenv("file_system_name")
local_file_path = os.getenv("local_file_path")
dest_file_path = os.getenv("dest_file_path")

# Imprimir os valores
print(f"account_name: {account_name}")
print(f"account_key: {'***' if account_key else None}")
print(f"file_system_name: {file_system_name}")
print(f"local_file_path: {local_file_path}")
print(f"dest_file_path: {dest_file_path}")

account_name: codegroup
account_key: ***
file_system_name: seven
local_file_path: data/desafio/raw
dest_file_path: desafio/raw


In [12]:
from azure.storage.filedatalake import DataLakeServiceClient

# Configurar a conexão com o Data Lake
def initialize_storage_account(account_name, account_key):
    service_client = DataLakeServiceClient(
        account_url=f"https://{account_name}.dfs.core.windows.net",
        credential=account_key
    )
    return service_client

# Inicializar e enviar
service_client = initialize_storage_account(account_name, account_key)

# List all files and directories in the folder
files = os.listdir(local_file_path)

# Filter out only files (optional)
only_files = [f for f in files if os.path.isfile(os.path.join(local_file_path, f))]

### UPLOAD TO DATA LAKE - EXTRA

In [13]:
# Fazer o upload do arquivo
def upload_to_datalake(service_client, file_system_name, local_file_path, dest_file_path):
    # Obter referência ao sistema de arquivos
    file_system_client = service_client.get_file_system_client(file_system=file_system_name)
    file_client = file_system_client.get_file_client(dest_file_path)

    # Ler o arquivo local e enviá-lo ao Data Lake
    with open(local_file_path, "rb") as file:
        file_contents = file.read()
        file_client.upload_data(file_contents, overwrite=True)

    print(f"Arquivo '{local_file_path}' enviado para '{dest_file_path}' no Data Lake.")


print("All files in the folder:", only_files)

for file_name in only_files:
    upload_to_datalake(service_client, file_system_name,f"{local_file_path}/{file_name}", f"{dest_file_path}/{file_name}")
    print("" + file_name + " enviado com sucesso!")
    print("-----------------------------------------------------")

All files in the folder: ['pedidos_raw.csv', 'produtos_raw.csv', 'user_raw.csv']
Arquivo 'data/desafio/raw/pedidos_raw.csv' enviado para 'desafio/raw/pedidos_raw.csv' no Data Lake.
pedidos_raw.csv enviado com sucesso!
-----------------------------------------------------
Arquivo 'data/desafio/raw/produtos_raw.csv' enviado para 'desafio/raw/produtos_raw.csv' no Data Lake.
produtos_raw.csv enviado com sucesso!
-----------------------------------------------------
Arquivo 'data/desafio/raw/user_raw.csv' enviado para 'desafio/raw/user_raw.csv' no Data Lake.
user_raw.csv enviado com sucesso!
-----------------------------------------------------


### INGESTÃO DE DADOS

In [None]:
from azure.storage.filedatalake import DataLakeServiceClient
import pandas as pd
from io import StringIO

# Ler dados do Data Lake
def read_from_datalake(service_client, file_system_name, file_path):
    # Obter referência ao sistema de arquivos
    file_system_client = service_client.get_file_system_client(file_system=file_system_name)
    
    # Obter o cliente do arquivo
    file_client = file_system_client.get_file_client(file_path)
    
    # Ler o conteúdo do arquivo
    download = file_client.download_file()
    file_contents = download.readall()
    
    # Retornar como string ou processar como necessário
    return file_contents.decode('utf-8')

# Inicializar e ler os dados
service_client = initialize_storage_account(account_name, account_key)

dict_files_contents = {}
for file_name in only_files:
    print("" + file_name + " em processamento...")
    dict_files_contents[file_name] = read_from_datalake(service_client, file_system_name, f"{dest_file_path}/{file_name}")
    print("" + file_name + " em memória")
    print(dict_files_contents[file_name])
    print("-----------------------------------------------------")

pedidos_raw.csv em processamento...
pedidos_raw.csv em memória
user_id,created_at,items,total,payment_status,payment_method,payment_date,shipping_status,shipping_status_date_awaitin
g_payment,shipping_status_date_preparing,shipping_status_date_sent,shipping_status_date_deliver
ed
1,2018-01-01,1,100.90,paid,credit_card,2018-01-01,preparing,2018-01-01,2018-01-01,2018-01-01,2018-01-01
-----------------------------------------------------
produtos_raw.csv em processamento...
produtos_raw.csv em memória
product_id,name,price,stock,created_at,description
1,camiseta,100.90,10,2018-01-01,camiseta branca
-----------------------------------------------------
user_raw.csv em processamento...
user_raw.csv em memória
user_id,name,entry_date,entry_time,update_date,e-mail,cpf
1,joao,2018-01-01,10:00:00,2018-01-01,joao@gmail.com,365.767105-84
-----------------------------------------------------


### 