In [2]:
import pandas as pd
import glob
import os
import csv

## Generating a summary of references

In [None]:
# Path to the folder with the interproscan outputs for the references
path = "./interproscan_refs"
files = glob.glob(os.path.join(path, "*.tsv"))

In [None]:
# Columns according to the InterProScan manual
columns = [
    "protein_accession",
    "md5",
    "seq_length",
    "analysis",
    "signature_accession",
    "signature_description",
    "start",
    "stop",
    "score",
    "status",
    "date",
    "ipr_accession",
    "ipr_description",
    "go_terms",
    "pathways"
]

dataframes = []

for file in files:
    # Read the file
    df = pd.read_csv(file, sep="\t", names=columns, comment="#", dtype=str)

    # Extract gene name from the file
    gene_name = os.path.basename(file).split("_")[0]
    df["Gene"] = gene_name

    # Select only the desired columns
    df = df[[
        "Gene",
        "protein_accession",
        "analysis",
        "signature_accession",
        "signature_description",
        "ipr_accession",
        "ipr_description"
    ]]

    # Remove exact duplicates (optional, but avoids repetition)
    df = df.drop_duplicates()

    dataframes.append(df)

# Concatenate all genes
final_df = pd.concat(dataframes, ignore_index=True)

# Save final result with quotes in all cells
final_df.to_csv(
    "interproscan_ref_summary.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    encoding="utf-8"
)

## Identifying common signatures among references

In [None]:
# Read the CSV file
df = pd.read_csv("interproscan_ref_summary.csv", delimiter=',')

# Ensure column names have no extra spaces
df.columns = df.columns.str.strip()

# Function to find signatures common to all proteins of a gene
def common_signatures(subdf):
    # Group by protein_accession and collect the sets
    sets_per_protein = subdf.groupby("protein_accession")["signature_accession"].apply(set)
    # Intersection among all sets
    return set.intersection(*sets_per_protein) if len(sets_per_protein) > 0 else set()

# Group by Gene and apply the function
result = df.groupby("Gene").apply(common_signatures).reset_index()
result.columns = ["Gene", "common_signatures"]

# Expand each signature into one row (if you prefer long format)
exploded_result = result.explode("common_signatures").reset_index(drop=True)

# Save to CSV
exploded_result.to_csv("common_signatures_per_gene.csv", index=False)

print(exploded_result.head())

   Gene   common_signatures
0  SIAE            SSF52266
1  SIAE  G3DSA:3.40.50.1110
2  cpsK             PF07922
3  kpsC             cd16439
4  kpsC             PF05159


  resultado = df.groupby("Gene").apply(signatures_comuns).reset_index()
