In [None]:
"""
This Jupyter notebook visualizes AWS HealthOmics data stores. It generates charts showing how the stores are distributed across regions and store types, how store creation trends over time, and how store sizes are distributed.
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import boto3


# Select seaborn's "whitegrid" style for the charts in this notebook.
sns.set(style="whitegrid")


In [None]:
import botocore.exceptions  # Import botocore exceptions

def get_omics_regions():
    """Return the regions a fresh boto3 session reports for the "omics" service.

    Returns:
        The value of ``Session.get_available_regions("omics")``.
    """
    return boto3.session.Session().get_available_regions("omics")

# Resolve the region list once; the cells below iterate over it.
omics_regions = get_omics_regions()
print(f"Available Omics regions: {omics_regions}")


In [None]:
def get_stores(client, store_type, region):
    """Fetch every HealthOmics store of one type that ``client`` can list.

    All result pages are followed via ``nextToken``, so accounts with more
    stores than fit in a single response are reported completely.

    Args:
        client: Object exposing ``list_sequence_stores``,
            ``list_annotation_stores`` and ``list_variant_stores`` plus
            ``meta.region_name`` (a boto3 "omics" client).
        store_type: One of "sequence", "annotation" or "variant".
        region: Region name recorded on every returned store.

    Returns:
        A list of store dicts, each with "type" and "region" added and its
        ARN field removed. An empty list is returned when the service call
        fails with a ``ClientError`` (the error is printed, not raised).

    Raises:
        ValueError: If ``store_type`` is not one of the supported types.
    """
    # store type -> (client method name, response key holding the stores)
    listers = {
        "sequence": ("list_sequence_stores", "sequenceStores"),
        "annotation": ("list_annotation_stores", "annotationStores"),
        "variant": ("list_variant_stores", "variantStores"),
    }
    if store_type not in listers:
        raise ValueError(f"Invalid store type: {store_type}")
    method_name, key = listers[store_type]

    stores = []
    try:
        list_page = getattr(client, method_name)
        response = list_page()
        while True:
            stores.extend(response.get(key, []))
            next_token = response.get("nextToken")
            if not next_token:
                break
            # More results are available: request the following page.
            response = list_page(nextToken=next_token)
    except botocore.exceptions.ClientError as error:
        error_code = error.response['Error']['Code']
        if error_code == 'UnrecognizedClientException':
            print(f"ERROR: UnrecognizedClientException for {store_type} stores in {client.meta.region_name}. This region may not support HealthOmics.")
        else:
            print(f"ERROR: Unexpected error retrieving {store_type} stores in {client.meta.region_name}: {error}")
        return []

    for store in stores:
        store["type"] = store_type
        store["region"] = region  # Add region manually to each store
        # Sequence stores expose the ARN as "arn"; annotation and variant
        # store items use "storeArn". Drop whichever is present.
        store.pop('arn', None)
        store.pop('storeArn', None)
    return stores

# Gather the stores of every type from every region into one flat list.
all_stores = []

for region in omics_regions:
    client = boto3.client("omics", region_name=region)

    sequence_stores = get_stores(client, "sequence", region)
    annotation_stores = get_stores(client, "annotation", region)
    variant_stores = get_stores(client, "variant", region)

    region_stores = sequence_stores + annotation_stores + variant_stores
    all_stores.extend(region_stores)

stores_df = pd.DataFrame(all_stores)

# reindex (rather than stores_df[[...]]) so that an account with no stores
# yields an empty frame with these columns instead of raising KeyError.
stores_df_filtered = stores_df.reindex(columns=['type', 'name', 'region'])
stores_df_filtered.head()


In [None]:
# Count stores per (region, type); one column per store type, 0 where a
# region has no store of that type.
region_type_distribution = stores_df_filtered.groupby(['region', 'type']).size().unstack().fillna(0)

# Create the axes explicitly and hand them to pandas: DataFrame.plot opens
# its own figure when no ax is given, which would leave a blank 12x6 figure
# behind and draw the chart at the default size.
fig, ax = plt.subplots(figsize=(12, 6))
region_type_distribution.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of Omics Data Stores by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Number of Stores')
plt.xticks(rotation=45)
ax.legend(title='Store Type')
plt.show()


In [None]:
# List the columns of the collected store data, then preview the first rows.
print("Available columns in the data:", stores_df.columns, sep="\n")
stores_df.head()


In [None]:
# Time-based trend analysis: how many data stores were created on each date.

# Parse both timestamp columns in place (only creationTime is plotted here).
stores_df['creationTime'] = pd.to_datetime(stores_df['creationTime'])
stores_df['updateTime'] = pd.to_datetime(stores_df['updateTime'])

# Number of stores per calendar date of creation.
creation_trend = stores_df.groupby(stores_df['creationTime'].dt.date).size()

fig, ax = plt.subplots(figsize=(10, 6))
creation_trend.plot(kind='line', marker='o', ax=ax)
ax.set_title('Trend of Data Store Creation Over Time')
ax.set_xlabel('Creation Date')
ax.set_ylabel('Number of Stores Created')
plt.xticks(rotation=45)
ax.grid(True)
plt.show()


In [None]:
# Store size in GB, taking 1 GB as 1024**3 bytes.
stores_df['storeSizeGB'] = stores_df['storeSizeBytes'].div(1024**3)

# Histogram of store sizes across all collected stores.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(stores_df['storeSizeGB'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Data Store Sizes (in GB)')
ax.set_xlabel('Store Size (GB)')
ax.set_ylabel('Number of Stores')
ax.grid(True)
plt.show()
