In [None]:
import io
import os

import boto3
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from botocore import UNSIGNED
from botocore.config import Config

## **Data Engineering**

Data Engineering involves the collection, transformation, and management of data to facilitate analytics and machine learning applications. 
In this notebook, data engineering principles are applied to extract and process genomic data stored in AWS S3.

Key components covered in this notebook include:

- Data Extraction from AWS S3: The dataset is retrieved using Boto3, allowing anonymous access to S3 buckets.

- Data Processing: The extracted GTF data follows a structured format with standardized columns such as seqname, source, feature, start, end, score, strand, and frame. This structure ensures consistency and usability in downstream analysis.

- Transformation & Visualization: The dataset is processed using Pandas, and key insights are visualized using Matplotlib. This step is crucial for understanding data distributions and patterns.

In [None]:
# Create the shared S3 client used by the rest of the notebook.
# Requests are sent unsigned (anonymously), so no AWS credentials are required.
s3 = boto3.client(
    service_name="s3",
    config=Config(signature_version=UNSIGNED),
)

In [None]:
# Download one object from S3 and return its contents as a single UTF-8 decoded string.
#   bucket_name: intended to identify the S3 bucket to read from
#   file_key:    intended to identify the object (key) inside that bucket
# Returns the decoded text, or None if anything goes wrong (the error is printed, not raised).
# Relies on the module-level `s3` client, which must be created before this function is called.
def load_gtf_data(bucket_name, file_key): # Load in the information using bucket and key information on AWS
    try:
        response = s3.get_object(Bucket= ---- , Key= ----) # Notice the '----' signs? That's code for you to fill in!
        # Fill in the get_object call with the appropriate variables
        # The whole response body is read into memory and decoded in one go
        gtf_data = response['Body'].read().decode('utf-8')
        return gtf_data
    except Exception as e:
        # Broad catch: any failure in the block above is reported and turned into None,
        # so callers must check the return value before using it
        print(f"Error fetching data from S3: {e}")
        return None

The GTF data has the following standardised columns, each described below:

1. `seqname`: Name of the sequence (e.g., chromosome).
2. `source`: Source of the annotation (e.g., a specific database or tool).
3. `feature`: Type of feature (e.g., gene, exon).
4. `start`: Start position of the feature.
5. `end`: End position of the feature.
6. `score`: Score associated with the feature (often a placeholder).
7. `strand`: Strand of the feature (+ or -).
8. `frame`: Reading frame (0, 1, 2).
9. `attribute`: Additional attributes in a key-value format.

In [None]:
# Parse raw GTF text into a pandas DataFrame.
#   gtf_data: the GTF contents as one string
# Fields are split on tabs, column names come from `col_names` (the text is read without a header row),
# and everything from a '#' character to the end of a line is ignored (comment="#").
def parse_gtf_data(gtf_data): # Turn the raw text into a DataFrame (nothing is written to disk in this step)
    col_names = [
        ----, ----, ----, "start", "end", # some of the column names here are empty --> fill them in as need be!!
        "score", "strand", "frame", "attribute"
    ]
    gtf_df = pd.read_csv(io.StringIO(gtf_data), sep="\t", comment="#", names=col_names)
    # io.StringIO wraps the in-memory string in a file-like object, so pandas can read it directly
    # without first saving the text to a temporary file
    return gtf_df

In [None]:
# Keep only the rows matching one feature value and add a `gene_id` column parsed from `attribute`.
# The selection is copied (`.copy()`), so the new column is not written into the input DataFrame.
def filter_genes(gtf_df): # Attains the features of interest and gets the relevant ID information
  genes_df = gtf_df[gtf_df[----] == ----].copy() # Which column and value should be filtered on to get the relevant rows?
  # The pattern captures the text between the double quotes that follow `gene_id`; rows with no match get NaN
  genes_df["gene_id"] = genes_df["attribute"].str.extract('gene_id "([^"]+)"') # Regex is employed here - but can you think of other ways
  # that the code can be amended while being functional?
  return genes_df

In [None]:
# Add columns derived from the `attribute` text to `df` and return it.
# The assignments write straight into the DataFrame that was passed in, so the caller's object changes too.
def extract_attributes(df): # Amend the code as would appear logically!
    df[----] = df['attribute'].astype(str) # string-typed version of the attribute column
    df[----] = df['attribute'].str.extract('gene_name "([^"]+)"') # text inside the quotes that follow gene_name
    df[----] = df['attribute'].str.extract('gene_biotype "([^"]+)"') # text inside the quotes that follow gene_biotype
    return df

In [None]:
# Tidy the DataFrame and return it.
# Both calls pass inplace=True, so the DataFrame handed in by the caller is modified as well.
def clean_data(df):
    df.----(inplace=True) # A method is being used here to remove values that occur more than once -> implement that via code!
    # 'ffill' (forward fill) carries the last valid value forward into the gaps that follow it
    # NOTE(review): the `method=` keyword is deprecated in recent pandas releases - confirm against the version used here
    df.----(method='ffill', inplace=True) # A method is being used here to fill up missing values -> implement that via code!
    return df

In [None]:
# Add min-max scaled copies of the coordinate columns: (value - min) / (max - min), i.e. rescaled to [0, 1].
# The new columns are written into `df` itself, and the same object is returned.
# If all values in a column are identical, max - min is 0 and the result is NaN rather than a number.
def normalize_data(df):
    df['start_norm'] = (df['start'] - df['start'].min()) / (df['start'].max() - df['start'].min()) # the start col has been normalised
    df['end_norm'] = ---- # Try it out for the end column as well!
    return df

In [None]:
def validate_data(df):
    """Print a quick data-quality report for the DataFrame.

    Four sections are printed, in this order: missing-value counts per column,
    column dtypes, the distinct values of ``feature``, and the min/max of
    ``start`` and ``end``. Nothing is returned and ``df`` is left untouched.
    """
    # Each check is wrapped in a lambda so that it is only evaluated right
    # before its section is printed, one section at a time.
    report_sections = (
        ("Missing values:\n", lambda: df.isnull().sum()),
        ("\nData types:\n", lambda: df.dtypes),
        ("\nUnique values in 'feature':\n", lambda: df['feature'].unique()),
        ("\nRange of 'start' and 'end':\n", lambda: df[['start', 'end']].agg(['min', 'max'])),
    )
    for heading, compute in report_sections:
        print(heading, compute())

In [None]:
# Summarise the coordinate columns per biotype.
# Rows are grouped by `gene_biotype`, the chosen statistics are applied to `start` and `end`,
# and the result is printed and returned with `gene_biotype` restored as a regular column.
# Because each column is given a list of statistics, the result has two-level column labels.
def aggregate_data(df): # Group data together by specific biotypes
    agg_df = df.groupby('gene_biotype').agg({
        'start': [----, ----], # Get useful metrics for attaining statistical information for the start and end columns 
        'end': [----, ----] # Try out 2 metrics each - though feel free to include more!!
    }).reset_index()
    print("Aggregated data:\n", agg_df)
    return agg_df

In [None]:
def explore_data(df):
    """Print summary statistics and draw a bar chart of feature counts.

    The summary comes from ``df.describe()``; the chart shows how many rows
    each distinct value of the ``feature`` column has. Nothing is returned.
    """
    summary = df.describe()
    print("Summary statistics:\n", summary)

    # Count rows per feature value, then label the axes the bars were drawn on.
    feature_counts = df['feature'].value_counts()
    ax = feature_counts.plot(kind='bar')
    ax.set_title('Distribution of Features')
    ax.set_xlabel('Feature')
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Select the long genes from `genes_df`, using `min_length` (default 1000) as the threshold.
# A `length` column (end - start) is first added to `genes_df` itself, so the caller's DataFrame
# gains that column too; the returned frame is the filtered selection.
# NOTE(review): GTF coordinates are 1-based and inclusive, so the full span would be end - start + 1 -
# confirm whether that off-by-one matters for the chosen threshold.
def filter_long_genes(genes_df, min_length=1000):
    genes_df['length'] = genes_df['end'] - genes_df['start']
    long_genes_df = genes_df[----] # given the min_length parameter - > how would you filter the dataframe to only get long genes?
    return long_genes_df

In [None]:
def save_data(df, filename):
    """Write ``df`` to ``filename`` as CSV (without the index) and report it.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to save.
    filename : str or os.PathLike (or anything ``DataFrame.to_csv`` accepts)
        Destination of the CSV file.

    When ``filename`` is a path, any missing parent directories are created
    first. Without this, saving to e.g. ``data/out.csv`` fails with an
    ``OSError`` on a fresh checkout where ``data/`` does not exist yet.
    """
    # Only paths have a directory to create; file-like targets are passed through untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:  # empty when the file goes into the current working directory
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(filename, index=False)
    print(f"Data saved as '{filename}'")

In [None]:
# Main pipeline
# Runs the steps defined above in order: load -> parse -> clean -> validate -> explore ->
# filter genes -> enrich -> normalise -> aggregate -> filter long genes -> save.
# Bucket and object key of the annotation file to fetch
bucket_name = "sg-nex-data"
file_key = "data/data_tutorial/annotations/hg38_chr22.gtf"

# Fill in the method names below accordingly to create a useful pipeline structure!!
gtf_data = -----(bucket_name, file_key)
# load_gtf_data returns None on failure, so the rest only runs when text was actually received
if gtf_data:
    gtf_df = -----(gtf_data)
    gtf_df = clean_data(gtf_df)
    validate_data(gtf_df)
    explore_data(gtf_df)
    genes_df = filter_genes(gtf_df)
    genes_df = -------(genes_df)
    genes_df = normalize_data(genes_df)
    agg_df = aggregate_data(genes_df)
    long_genes_df = ------(genes_df)
    # Decide which DataFrame belongs in each output file
    save_data(----, "data/filtered_chr22_genes.csv")
    save_data(-----, "data/long_chr22_genes.csv")

## **Conclusion**

We hope this notebook serves as a testament to the power of modern data engineering. By seamlessly integrating cloud-based storage, efficient retrieval strategies, and structured data transformations, we unlock the potential hidden within various datasets.

The techniques showcased here lay the groundwork for further analytics and machine learning to be performed, pushing the boundaries of what’s possible in genomic research and beyond. With the right tools and methodologies, data engineers wield the power to turn raw information into groundbreaking discoveries.