# Get Pipelines

First, let's make sure the libraries needed to run this notebook are installed.

In [None]:
#%pip install azure-identity azure-synapse-artifacts azure-mgmt-synapse

The cell above only needs to be run the first time; after that, the modules are downloaded and installed.

Next, we need to define which environment we would like to run against:

In [None]:
# Only one of these can be active at a time
# environment = 'dev'
# environment = 'test'
environment = 'prod'

# Fail fast on a mistyped environment name: the value is interpolated into the
# workspace URL and into output file names, so a typo would otherwise only
# surface later as an opaque connection or authentication error.
VALID_ENVIRONMENTS = ('dev', 'test', 'prod')
if environment not in VALID_ENVIRONMENTS:
    raise ValueError(
        f"Unknown environment {environment!r}; expected one of {VALID_ENVIRONMENTS}"
    )

First, let's get a token we can use.

In [None]:
from azure.identity import InteractiveBrowserCredential
from azure.synapse.artifacts import ArtifactsClient
import os

# Synapse Analytics workspace URL for the selected environment
workspace_url = f"https://synw-insight-{environment}.dev.azuresynapse.net"

# Create a credential using interactive (browser) login
credential = InteractiveBrowserCredential()

# Create an ArtifactsClient for the workspace
client = ArtifactsClient(endpoint=workspace_url, credential=credential)

# Acquire a bearer token for the Synapse development endpoint
token = credential.get_token("https://dev.azuresynapse.net/.default")

# Never print the token itself: cell output is stored in the notebook file and
# would leak a usable credential to anyone the notebook is shared with.
# Only the (non-secret) expiry timestamp is shown as confirmation.
print("Bearer token acquired; expires at (Unix time):", token.expires_on)

Then we use this token to get the pipelines.

In [None]:
import requests
import json

def fetch_all_pipelines(endpoint, token, timeout=60):
    """Fetch every pipeline definition from a Synapse workspace.

    Requests ``{endpoint}/pipelines?api-version=2020-12-01`` and keeps
    following the ``nextLink`` field of each response until it is absent,
    collecting the items of every page's ``value`` list.

    Args:
        endpoint: Base URL of the workspace development endpoint. A trailing
            slash is tolerated.
        token: Bearer token used for the ``Authorization`` header. Either the
            raw token string or an object exposing the string as a ``.token``
            attribute.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list: The concatenated ``value`` entries of all pages.

    Raises:
        requests.HTTPError: If any page request returns an error status
            (for example an expired or unauthorized token).
    """
    # Use the argument that was passed in (not a notebook-level global), and
    # accept both a plain string and a token object with a `.token` attribute.
    bearer = getattr(token, 'token', token)
    headers = {
        'Authorization': f'Bearer {bearer}'
    }

    pipelines = []
    url = f"{endpoint.rstrip('/')}/pipelines?api-version=2020-12-01"

    while url:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Surface auth/HTTP failures instead of silently returning an empty list.
        response.raise_for_status()
        data = response.json()
        pipelines.extend(data.get('value', []))
        # The service returns 'nextLink' only while more pages remain.
        url = data.get('nextLink')

    return pipelines

# Usage
endpoint = workspace_url
# Take the token string from the credential result at runtime; never paste a
# literal token into the notebook (not even in a comment) — it is a credential
# and also discloses tenant/application identifiers.
acces_token = token.token  # (variable name kept as-is for compatibility with existing cells)
# Pass the token *string*, not the AccessToken object.
pipelines = fetch_all_pipelines(endpoint, acces_token)

# Make sure the output directory exists (no-op when it is already there)
directory = './data/pipelines'
os.makedirs(directory, exist_ok=True)

# Save the raw pipeline definitions to a JSON file
with open(f'{directory}/pipelines_{environment}.json', 'w', encoding='utf-8') as f:
    json.dump(pipelines, f, indent=4)

print(f"Pipelines have been saved to pipelines_{environment}.json")

Then we pick out some of the elements of the pipelines data (saved above as `pipelines_{environment}.json`), normalize them, and save the result again.

In [None]:
# This cell works on the in-memory `pipelines` list produced by the fetch cell.
# To run it without re-fetching, first load
# ./data/pipelines/pipelines_<environment>.json into `pipelines` with json.load.


def _summarize_pipeline(raw):
    """Reduce one raw pipeline definition to the fields reported on.

    Returns a dict with the pipeline name, the timeout of the last activity
    that declares one (``policy``), the last publish time, a ``state`` of
    'Inactive' when at least one activity is inactive (otherwise None), and
    the name/description of every inactive activity.
    """
    props = raw.get('properties', {})
    activities = props.get('activities', [])

    # Every activity explicitly marked inactive, in definition order.
    inactive = [
        {
            'name': act.get('name'),
            'description': act.get('description', '')
        }
        for act in activities
        if act.get('state') == 'Inactive'
    ]

    # The last activity carrying a policy timeout wins.
    timeout = None
    for act in activities:
        if 'policy' in act and 'timeout' in act['policy']:
            timeout = act['policy']['timeout']

    return {
        'name': raw.get('name'),
        'policy': timeout,
        'lastPublishTime': props.get('lastPublishTime'),
        'state': 'Inactive' if inactive else None,
        'inactive_objects': inactive
    }


# Normalize every fetched pipeline
normalized_data = [_summarize_pipeline(raw) for raw in pipelines]

# Persist the normalized records as JSON
with open(f'./data/pipelines/normalized_pipelines_{environment}.json', 'w') as out_file:
    json.dump(normalized_data, out_file, indent=4)

print(f"Normalized data has been saved to normalized_pipelines_{environment}.json")

Then we filter the normalized data (saved above as `normalized_pipelines_{environment}.json`) to only see the pipelines where something is "disabled".

In [None]:
# This cell works on the in-memory `normalized_data` list from the previous cell.
# To run it on its own, first load
# ./data/pipelines/normalized_pipelines_<environment>.json into `normalized_data`.

# Keep only the records whose state is 'Inactive'
inactive_pipelines = list(
    filter(lambda entry: entry['state'] == 'Inactive', normalized_data)
)

# Print one text section per inactive pipeline, closed by a separator line
for entry in inactive_pipelines:
    report_lines = [
        f"Name: {entry['name']}",
        f"Last Publish Time: {entry['lastPublishTime']}",
        f"Policy Timeout: {entry['policy']}",
    ]
    report_lines.extend(
        f"  - Name: {obj['name']}, Description: {obj['description']}"
        for obj in entry['inactive_objects']
    )
    report_lines.append("-" * 40)
    print("\n".join(report_lines))

# Save the filtered records to their own JSON file
with open(f'./data/pipelines/inactive_pipelines_{environment}.json', 'w') as out_file:
    json.dump(inactive_pipelines, out_file, indent=4)

print(f"Filtered data has been saved to inactive_pipelines_{environment}.json")