In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
from collections import defaultdict
import warnings

# Suppress specific warnings from matplotlib if needed (optional)
warnings.filterwarnings("ignore", message="Glyph .* missing from current font.")
warnings.filterwarnings("ignore", message="FixedFormatter should only be used together with FixedLocator")


def parse_attributes(attr_string):
    """Parses the GTF attributes string into a dictionary."""
    attributes = {}
    # Use regex to handle variations like spaces, quotes, etc.
    # Handles 'key "value";' or 'key=value;' or 'key value;' (common in some GTFs)
    # Adjusted to be more robust with NCBI/Ensembl GTF variations
    for match in re.finditer(r'([\w.-]+)\s+(?:"([^"]+)"|([^\s;]+))\s*;?', attr_string):
        key = match.group(1)
        # Value can be in group 2 (quoted) or group 3 (unquoted)
        value = match.group(2) if match.group(2) else match.group(3)
        attributes[key] = value
    return attributes

# ADD THIS IMPORT AT THE TOP OF YOUR SCRIPT
import matplotlib.patches as mpatches
# Make sure other imports like pandas, plt, re, os, defaultdict, warnings are also present

# ... (Keep the parse_attributes function as it is) ...

def process_and_plot_single_gene(gene_df, gene_id, outdir, show_labels=False, colors=None):
    """
    Processes data for a single gene, calculates stats, and generates the plot
    with a legend and adjusted x-axis spacing.

    Args:
        gene_df (pd.DataFrame): DataFrame containing GTF entries ONLY for the target gene.
        gene_id (str): The ID of the gene to process.
        outdir (str): Output directory path.
        show_labels (bool): Whether to show labels on wide segments.
        colors (dict): Dictionary mapping feature types to colors.
    """
    # --- Color Definition (ensure consistent use) ---
    if colors is None:
        colors = {
            'exon': '#4169e1',        # RoyalBlue
            'intron': '#d3d3d3',      # LightGray
            'start_codon': '#32cd32', # LimeGreen
            'stop_codon': '#ff6347',  # Tomato
            # UTR is not explicitly handled in plotting based on request, but keep color if needed elsewhere
            # 'UTR': '#add8e6'
        }
    # Features we will actually plot and include in the legend
    plot_feature_types = ['exon', 'intron', 'start_codon', 'stop_codon']

    # --- Initial Checks and Setup ---
    if gene_df.empty:
        print(f"Gene {gene_id}: No entries found in GTF.")
        return

    try:
        gene_df['start'] = pd.to_numeric(gene_df['start'])
        gene_df['end'] = pd.to_numeric(gene_df['end'])
    except ValueError as e:
        print(f"Error converting start/end to numeric for {gene_id}: {e}. Check GTF format.")
        return

    # --- 1. Find the longest transcript AND store parsed attributes ---
    transcripts = defaultdict(lambda: {'start': float('inf'), 'end': float('-inf'), 'exons': []})
    transcript_id_keys = ['transcript_id', 'TranscriptID']
    has_transcript_id = False
    parsed_attrs_list = []
    longest_transcript_id = None
    gene_df_reset = gene_df.reset_index(drop=True)

    for index, row in gene_df_reset.iterrows():
        attrs = parse_attributes(row['attributes'])
        parsed_attrs_list.append(attrs)
        transcript_id = None
        for key in transcript_id_keys:
            if key in attrs:
                transcript_id = attrs[key]
                break

        if not transcript_id: continue
        has_transcript_id = True

        if row['feature'] == 'exon':
            transcripts[transcript_id]['start'] = min(transcripts[transcript_id]['start'], row['start'])
            transcripts[transcript_id]['end'] = max(transcripts[transcript_id]['end'], row['end'])
            # transcripts[transcript_id]['exons'].append((row['start'], row['end'])) # Not strictly needed anymore

    if not has_transcript_id:
        print(f"Gene {gene_id}: No 'transcript_id' found. Attempting fallback using all exons/codons.")
        longest_transcript_id = f"implicit_{gene_id}"
        transcript_df = gene_df_reset[gene_df_reset['feature'].isin(plot_feature_types)].copy()
        if transcript_df.empty:
            print(f"Gene {gene_id}: No exon/codon features found for fallback.")
            return
        transcript_df['transcript_id_parsed'] = longest_transcript_id
    else:
        if not transcripts:
            print(f"Gene {gene_id}: No transcripts with exons found.")
            return

        max_len = -1
        for tid, data in transcripts.items():
            if data['start'] != float('inf'):
                length = data['end'] - data['start'] + 1
                if length > max_len:
                    max_len = length
                    longest_transcript_id = tid

        if longest_transcript_id is None:
            print(f"Gene {gene_id}: Could not determine longest transcript ID.")
            return

        print(f"\nGene {gene_id}: Selected longest transcript '{longest_transcript_id}' (Approx Span: {max_len} bp)")

        # --- 2. Filter data for the longest transcript ---
        transcript_rows = []
        if len(gene_df_reset) != len(parsed_attrs_list):
             print(f"CRITICAL WARNING: Length mismatch rows/attributes for {gene_id}. Skipping.")
             return

        for i in range(len(gene_df_reset)):
            row = gene_df_reset.iloc[i]
            attrs = parsed_attrs_list[i]
            tid_found = False
            for key in transcript_id_keys:
                if key in attrs and attrs[key] == longest_transcript_id:
                    tid_found = True
                    break
            if tid_found and row['feature'] in plot_feature_types:
                transcript_rows.append(row.copy())

        if not transcript_rows:
            print(f"Gene {gene_id}: No plottable features (exon/codon) found for transcript '{longest_transcript_id}'.")
            return
        transcript_df = pd.DataFrame(transcript_rows)

    # --- 3. Extract Features and Infer Introns ---
    if transcript_df.empty:
        print(f"Gene {gene_id}: Transcript DataFrame is empty. Cannot plot.")
        return
    if 'feature' not in transcript_df.columns:
        print(f"Gene {gene_id}: Filtered DataFrame lacks 'feature' column.")
        return

    exons = transcript_df[transcript_df['feature'] == 'exon'].sort_values('start').reset_index(drop=True)
    start_codons = transcript_df[transcript_df['feature'] == 'start_codon']
    stop_codons = transcript_df[transcript_df['feature'] == 'stop_codon']

    if exons.empty:
        print(f"Gene {gene_id}: No exons found for '{longest_transcript_id}'. Cannot plot structure.")
        return

    strand = exons['strand'].iloc[0]
    features = []

    # Add valid exons
    for _, exon in exons.iterrows():
        if exon['end'] < exon['start']: continue
        features.append({'start': exon['start'], 'end': exon['end'], 'length': exon['end'] - exon['start'] + 1, 'type': 'exon', 'strand': exon['strand']})

    # Add valid start codon
    start_codon = None
    if not start_codons.empty:
        sc = start_codons.sort_values('start').iloc[0]
        if sc['end'] >= sc['start']:
            start_codon = {'start': sc['start'], 'end': sc['end'], 'length': sc['end'] - sc['start'] + 1, 'type': 'start_codon', 'strand': sc['strand']}
            if start_codon['length'] != 3: print(f"Warning: Gene {gene_id} start_codon length is {start_codon['length']} bp.")
            features.append(start_codon)

    # Add valid stop codon
    stop_codon = None
    if not stop_codons.empty:
        stc = stop_codons.sort_values('start').iloc[0]
        if stc['end'] >= stc['start']:
            stop_codon = {'start': stc['start'], 'end': stc['end'], 'length': stc['end'] - stc['start'] + 1, 'type': 'stop_codon', 'strand': stc['strand']}
            if stop_codon['length'] != 3: print(f"Warning: Gene {gene_id} stop_codon length is {stop_codon['length']} bp.")
            features.append(stop_codon)

    # Infer Introns from valid exons
    introns = []
    valid_exons_for_introns = sorted([f for f in features if f['type'] == 'exon'], key=lambda x: x['start'])
    for i in range(len(valid_exons_for_introns) - 1):
        exon1_end = valid_exons_for_introns[i]['end']
        exon2_start = valid_exons_for_introns[i+1]['start']
        if exon1_end < exon2_start - 1:
             intron_start, intron_end = exon1_end + 1, exon2_start - 1
             intron_len = intron_end - intron_start + 1
             if intron_len > 0: introns.append({'start': intron_start, 'end': intron_end, 'length': intron_len, 'type': 'intron', 'strand': strand})
    features.extend(introns)
    features.sort(key=lambda x: x['start'])


    # --- 4. Calculate and Print Statistics ---
    # (Using a simplified approach focusing on the features list)
    stats = defaultdict(lambda: {'count': 0, 'lengths': [], 'total_length': 0})
    print(f"--- Statistics for {gene_id} (Based on features from transcript: {longest_transcript_id}) ---")
    # Calculate stats directly from the 'features' list which contains exons, inferred introns, and codons
    for f in features:
        f_type = f['type']
        if f_type in plot_feature_types: # Only count types we care about
             stats[f_type]['count'] += 1
             stats[f_type]['lengths'].append(f['length'])
             stats[f_type]['total_length'] += f['length']

    total_structure_length = sum(stats[f_type]['total_length'] for f_type in plot_feature_types)

    for f_type in plot_feature_types:
        if f_type in stats:
            print(f"  {f_type.capitalize().replace('_',' ')}s:")
            print(f"    Count: {stats[f_type]['count']}")
            print(f"    Total Length: {stats[f_type]['total_length']} bp")
            print(f"    Lengths: {stats[f_type]['lengths']}")
        else:
            print(f"  {f_type.capitalize().replace('_',' ')}s: Not present")
    print(f"  Total Structure Length (Sum of plotted types): {total_structure_length} bp")
    print("--------------------------------------------------")


    # --- 5. Prepare Features for Plotting (Handle Codon Embedding) ---
    plot_segments = []
    processed_exons = set()
    plot_start_codon = next((f for f in features if f['type'] == 'start_codon'), None)
    plot_stop_codon = next((f for f in features if f['type'] == 'stop_codon'), None)

    # Ensure features are sorted by start for reliable processing
    features.sort(key=lambda x: x['start'])

    for feature in features:
        if feature['type'] == 'exon':
            exon_start, exon_end = feature['start'], feature['end']
            exon_len = feature['length']
            exon_ident = (exon_start, exon_end)
            if exon_ident in processed_exons: continue

            contained_codons = []
            if plot_start_codon and plot_start_codon['start'] >= exon_start and plot_start_codon['end'] <= exon_end: contained_codons.append(plot_start_codon)
            if plot_stop_codon and plot_stop_codon['start'] >= exon_start and plot_stop_codon['end'] <= exon_end: contained_codons.append(plot_stop_codon)

            if not contained_codons:
                plot_segments.append({'start': exon_start, 'length': exon_len, 'type': 'exon'})
            else:
                contained_codons.sort(key=lambda x: x['start'])
                current_pos = exon_start
                for codon in contained_codons:
                    if codon['start'] > current_pos: plot_segments.append({'start': current_pos, 'length': codon['start'] - current_pos, 'type': 'exon'})
                    plot_segments.append({'start': codon['start'], 'length': codon['length'], 'type': codon['type']})
                    current_pos = codon['end'] + 1
                if current_pos <= exon_end: plot_segments.append({'start': current_pos, 'length': exon_end - current_pos + 1, 'type': 'exon'})
                processed_exons.add(exon_ident)

        elif feature['type'] == 'intron':
            plot_segments.append({'start': feature['start'], 'length': feature['length'], 'type': 'intron'})
        # Codons are handled via exons

    plot_segments.sort(key=lambda x: x['start'])


    # --- 6. Plotting ---
    fig, ax = plt.subplots(figsize=(12, 2)) # Increased height slightly for legend/padding
    current_plot_pos = 0
    # Use total_structure_length calculated from stats for consistency in % label check
    total_plot_length = sum(s['length'] for s in plot_segments if s['length'] > 0) # Use this for xlim

    if total_plot_length == 0:
         print(f"Warning: Gene {gene_id} has total plot segment length of 0. Skipping plot.")
         plt.close(fig)
         return

    plotted_types = set() # Keep track of types actually plotted to include in legend

    for segment in plot_segments:
        length = segment['length']
        if length <= 0: continue
        f_type = segment['type']
        color = colors.get(f_type, '#808080')
        plotted_types.add(f_type) # Mark this type as plotted

        ax.barh(0, length, left=current_plot_pos, height=0.3, color=color, edgecolor='black', linewidth=0.5)

        # Add label if requested and segment is wide enough
        # Use total_structure_length (sum of exon/intron/codon) for the 5% check
        if show_labels and total_structure_length > 0 and (length / total_structure_length >= 0.05):
            label_text = f_type.upper().replace('_CODON','')
            try:
                r, g, b = plt.cm.colors.to_rgb(color)
                brightness = (r * 299 + g * 587 + b * 114) / 1000
                text_color = 'white' if brightness < 0.5 else 'black'
            except ValueError: text_color = 'black'
            ax.text(current_plot_pos + length / 2, 0, label_text, ha='center', va='center', color=text_color, fontsize=8, weight='bold')

        current_plot_pos += length

    ax.set_yticks([])
    ax.set_xlabel("Length (bp)")
    ax.xaxis.labelpad = 15 # *** INCREASED PADDING HERE *** (Adjust value as needed)

    title_transcript_part = f"(Transcript: {longest_transcript_id})" if longest_transcript_id and not longest_transcript_id.startswith("implicit_") else "(Transcript ID not found or fallback)"
    ax.set_title(f"Gene Structure: {gene_id} {title_transcript_part}", loc='left')

    ax.set_xlim(0, total_plot_length) # Set x-limits based on plotted segments
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

    # *** ADD LEGEND HERE ***
    legend_patches = []
    # Create patches only for types that were actually plotted
    type_order_for_legend = ['exon', 'intron', 'start_codon', 'stop_codon'] # Control legend order
    for f_type in type_order_for_legend:
         if f_type in plotted_types:
            label = f_type.replace('_', ' ').capitalize()
            patch = mpatches.Patch(color=colors[f_type], label=label)
            legend_patches.append(patch)

    if legend_patches: # Only add legend if there's something to show
        ax.legend(handles=legend_patches, loc='best', bbox_to_anchor=(1.0, 1.15), ncol=len(legend_patches), fontsize='small', frameon=False)
        # Adjust bbox_to_anchor y-value (1.15) to move legend slightly above the plot if needed
        # Adjust ncol based on how many items you expect, or remove it for vertical stacking.


    # Adjust layout AFTER adding all elements (including legend)
    plt.tight_layout(rect=[0, 0, 1, 0.95]) # Add rect to maybe prevent title overlap with moved legend
    # plt.subplots_adjust(top=0.85) # Alternative/additional spacing adjustment if needed

    # --- 7. Save Output ---
    os.makedirs(outdir, exist_ok=True)
    base_filename = os.path.join(outdir, gene_id)
    png_path = f"{base_filename}.png"
    pdf_path = f"{base_filename}.pdf"

    try:
        plt.savefig(png_path, dpi=300, bbox_inches='tight')
        plt.savefig(pdf_path, bbox_inches='tight')
        print(f"Saved plots to {png_path} and {pdf_path}")
    except Exception as e:
        print(f"Error saving plots for {gene_id}: {e}")

    plt.close(fig)

def plot_genes(gtf_path, gene_list, outdir, show_labels=False):
    """
    Main function to read GTF and plot structures for a list of genes.

    Args:
        gtf_path (str): Path to the Arabidopsis GTF file.
        gene_list (list): List of gene IDs (e.g., ["AT3G50410", "AT5G62940"]).
        outdir (str): Path to the output directory for plots.
        show_labels (bool): Whether to show labels on plot segments.
    """
    print(f"Starting gene structure plotting...")
    print(f"GTF File: {gtf_path}")
    print(f"Genes: {gene_list}")
    print(f"Output Directory: {outdir}")

    # Create output directory if it doesn't exist
    os.makedirs(outdir, exist_ok=True)

    # --- Read GTF File Once ---
    try:
        print("Reading GTF file...")
        # Define column names for clarity
        col_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
        gtf_df = pd.read_csv(
            gtf_path,
            sep='\t',
            comment='#',
            header=None,
            names=col_names,
            usecols=['seqname', 'feature', 'start', 'end', 'strand', 'attributes'], # Read only necessary columns
            # Specify dtype for potentially large coordinate columns if needed, otherwise pandas infers
            dtype={'seqname': str, 'feature': str, 'start': int, 'end': int, 'strand': str, 'attributes': str}
        )
        print(f"Successfully read {len(gtf_df)} lines from GTF.")

    except FileNotFoundError:
        print(f"Error: GTF file not found at {gtf_path}")
        return
    except Exception as e:
        print(f"Error reading GTF file: {e}")
        return

    # --- Pre-filter GTF for rows potentially related to the target genes ---
    # This requires parsing attributes, which can be slow.
    # Optimization: Check if 'gene_id' or 'GeneID' (common variations) followed by any of the gene IDs exists in the attribute string.
    # This is still a string search but targets the likely key more specifically.
    print("Filtering GTF for target genes (this may take a moment)...")
    # Make the pattern more robust: look for gene_id key variations and handle quotes/no quotes
    # Example pattern: (?:gene_id|GeneID)\s+(?:"AT3G50410"|AT3G50410)|(?:gene_id|GeneID)\s+(?:"AT5G62940"|AT5G62940)
    patterns = []
    gene_id_keys = ['gene_id', 'GeneID'] # Add other variations if needed
    for g in gene_list:
        # Create pattern part for one gene ID, checking both keys and quoted/unquoted value
        patterns.append(f"(?:{'|'.join(gene_id_keys)})\\s+(?:\"{re.escape(g)}\"|{re.escape(g)})")
    combined_pattern = '|'.join(patterns)

    try:
        relevant_df = gtf_df[gtf_df['attributes'].str.contains(combined_pattern, regex=True, na=False)].copy()
    except Exception as e:
         print(f"Error during initial filtering with regex pattern: {combined_pattern}")
         print(f"Error message: {e}")
         print("Proceeding without pre-filtering (may be slower).")
         relevant_df = gtf_df # Fallback to using the full dataframe if regex fails

    # Check if relevant_df is empty after filtering
    if relevant_df.empty and gtf_df['attributes'].str.contains(combined_pattern, regex=True, na=False).any():
        # This condition shouldn't normally happen if the filter works, but as a safeguard
        print(f"Warning: Initial filtering resulted in an empty DataFrame, but matches might exist. Check filter pattern and GTF format.")
        # Proceed with the unfiltered dataframe to be safe, though it will be slower.
        relevant_df = gtf_df
    elif relevant_df.empty:
         print(f"Warning: No GTF entries found containing the specified gene IDs patterns: {gene_list}")
         # Still proceed, process_and_plot_single_gene will handle empty dataframes per gene

    else:
         print(f"Found {len(relevant_df)} potentially relevant GTF entries after initial filtering.")


    # --- Process Each Gene ---
    processed_count = 0
    gene_id_key_to_use = None # Determine which gene ID key is present in the filtered data

    # Determine the actual gene_id key used in this GTF subset
    if not relevant_df.empty:
        first_attrs = parse_attributes(relevant_df['attributes'].iloc[0])
        for key in gene_id_keys:
            if key in first_attrs:
                gene_id_key_to_use = key
                print(f"Detected '{gene_id_key_to_use}' as the key for gene identifiers.")
                break
        if not gene_id_key_to_use:
            print("Warning: Could not automatically detect a standard gene ID key (gene_id, GeneID) in the filtered attributes. Filtering per gene might fail.")
            # Fallback: try searching for the gene ID value itself within the attributes string
            gene_id_key_to_use = None # Signal to use direct string search below


    for gene_id in gene_list:
        print(f"\nProcessing Gene: {gene_id}...")

        # Filter the `relevant_df` for the specific gene_id more accurately
        gene_specific_df = pd.DataFrame() # Initialize empty DataFrame

        if not relevant_df.empty:
             # Dynamically build the search pattern based on detected key or fallback
            if gene_id_key_to_use:
                 # Handles key "value" or key value
                search_pattern = f'{gene_id_key_to_use}\\s+(?:"{re.escape(gene_id)}"|{re.escape(gene_id)})'
            else:
                 # Fallback: search for the gene ID value anywhere, potentially less specific
                 # Requires quotes around it or space/semicolon boundary to reduce false positives
                 search_pattern = f'(?:\"{re.escape(gene_id)}\"|\\s{re.escape(gene_id)}[\\s;])'
                 print(f"Warning: Using fallback search pattern for {gene_id}: {search_pattern}")

            try:
                gene_specific_df = relevant_df[relevant_df['attributes'].str.contains(search_pattern, regex=True, na=False)].copy()
            except Exception as e:
                 print(f"Error filtering for specific gene {gene_id} with pattern '{search_pattern}': {e}")
                 gene_specific_df = pd.DataFrame() # Ensure it's empty on error


        if gene_specific_df.empty:
            # Try a simple, direct string search as a last resort, in case regex was too complex/failed
            gene_specific_df_fallback = relevant_df[relevant_df['attributes'].str.contains(f'"{gene_id}"', na=False) | relevant_df['attributes'].str.contains(f' {gene_id};', na=False) | relevant_df['attributes'].str.contains(f' {gene_id} ', na=False)]
            if not gene_specific_df_fallback.empty:
                 print(f"Info: Found entries for {gene_id} using simple string search after regex failed.")
                 gene_specific_df = gene_specific_df_fallback.copy()
            else:
                 print(f"Gene {gene_id}: No entries found after specific filtering (checked multiple patterns). Check if '{gene_id}' exists in the GTF with expected formatting.")
                 continue # Skip to the next gene

        process_and_plot_single_gene(gene_specific_df, gene_id, outdir, show_labels=show_labels)
        processed_count += 1

    print(f"\nFinished processing. Plotted structures for {processed_count} out of {len(gene_list)} requested genes.")

In [20]:
# --- Example Usage ---
if __name__ == "__main__":
    # --- Configuration ---
    # USE YOUR PROVIDED VALUES HERE
    GTF_FILE_PATH = r"C:\Users\Lenovo\Downloads\ncbi_dataset (1)\ncbi_dataset\data\GCF_000001735.4\genomic.gtf"
    GENES_TO_PLOT = ['AT1G07640', 'AT4G24060', 'AT5G60850', 'AT1G64620', 'AT3G45610', 'AT5G60200', 'AT1G47655', 'AT3G50410', 'AT5G66940']
    OUTPUT_DIRECTORY = "gene_plots"
    SHOW_LABELS_ON_PLOT = True # Set to False to hide labels like EXON, INTRON

    # --- Run the plotting function ---
    plot_genes(GTF_FILE_PATH, GENES_TO_PLOT, OUTPUT_DIRECTORY)

    print(f"\nScript finished. Check the '{OUTPUT_DIRECTORY}' folder for plots.")

Starting gene structure plotting...
GTF File: C:\Users\Lenovo\Downloads\ncbi_dataset (1)\ncbi_dataset\data\GCF_000001735.4\genomic.gtf
Genes: ['AT1G07640', 'AT4G24060', 'AT5G60850', 'AT1G64620', 'AT3G45610', 'AT5G60200', 'AT1G47655', 'AT3G50410', 'AT5G66940']
Output Directory: gene_plots
Reading GTF file...
Successfully read 806166 lines from GTF.
Filtering GTF for target genes (this may take a moment)...
Found 76 potentially relevant GTF entries after initial filtering.
Detected 'gene_id' as the key for gene identifiers.

Processing Gene: AT1G07640...

Gene AT1G07640: Selected longest transcript 'NM_001035911.2' (Approx Span: 3383 bp)
--- Statistics for AT1G07640 (Based on features from transcript: NM_001035911.2) ---
  Exons:
    Count: 2
    Total Length: 3097 bp
    Lengths: [2669, 428]
  Introns:
    Count: 1
    Total Length: 286 bp
    Lengths: [286]
  Start codons:
    Count: 1
    Total Length: 3 bp
    Lengths: [3]
  Stop codons:
    Count: 1
    Total Length: 3 bp
    Length

In [29]:
# Demonstration of the five Python idioms mentioned by the user

from collections import defaultdict
import re

# --- helper used in snippet 1
def parse_attributes(attr: str):
    """A minimal version of parse_attributes that extracts key \"value\" pairs."""
    pat = re.compile(r'([\\w.-]+)\\s+"([^"]+)"')
    return {k: v for k, v in pat.findall(attr)}

# 1️⃣ tid = parse_attributes(row["attributes"]).get("transcript_id")
row = {"attributes": 'gene_id "AT1G01010"; transcript_id "AT1G01010.1";'}
tid = parse_attributes(row["attributes"]).get("transcript_id")
print("1) transcript_id extracted from attributes:", tid)

# 2️⃣ spans: dict[str, list[int]] = defaultdict(lambda: [10**12, -10**12])
spans: dict[str, list[int]] = defaultdict(lambda: [10**12, -10**12])
spans["T1"][0] = 100         # update min start
spans["T1"][1] = 105         # update max end
spans["T2"][0] = 8000
spans["T2"][1] = 300
print("\\n2) defaultdict with initial huge/small numbers:", dict(spans))

# 3️⃣ max(spans, key=lambda k: spans[k][1] - spans[k][0])
longest_tid = max(spans, key=lambda k: spans[k][1] - spans[k][0])
print("3) longest transcript id:", longest_tid)

# 4️⃣ zip(df['start'], df['end'], df['feature'])
starts = [100, 200, 300]
ends = [150, 260, 320]
features = ["exon", "intron", "exon"]
zipped = list(zip(starts, ends, features))
print("\\n4) result of zip on three lists:", zipped)

# 5️⃣ next((f for f in feats if f['type']=='start_codon'), None)
feats = [{"type": "exon"}, {"type": "start_codon"}, {"type": "stop_codon"}]
start_codon = next((f for f in feats if f["type"] == "start_codon"), None)
print("\\n5) first item matching 'start_codon':", start_codon)


1) transcript_id extracted from attributes: None
\n2) defaultdict with initial huge/small numbers: {'T1': [100, 105], 'T2': [8000, 300]}
3) longest transcript id: T1
\n4) result of zip on three lists: [(100, 150, 'exon'), (200, 260, 'intron'), (300, 320, 'exon')]
\n5) first item matching 'start_codon': {'type': 'start_codon'}


In [34]:
nums=1

In [36]:
square = lambda x: x ** 2
print(square(5))    

25


In [42]:
ba=lambda x,y: [x+1,y+3]

In [43]:
ba(1,1)

[2, 4]