In [None]:
from transformers import BertModel, BertTokenizer
import re
# Load the ProtBERT tokenizer (case preserved via do_lower_case=False) and encoder.
# NOTE(review): a later cell loads the same "Rostlab/prot_bert" tokenizer and
# model again and rebinds these two names, so this cell looks redundant.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model = BertModel.from_pretrained("Rostlab/prot_bert")


In [None]:
import torch
from Bio import SeqIO
import pandas as pd

In [1]:
# %pip installs into the environment of the running kernel; the pin matches the
# version recorded in this cell's captured output.
%pip install biopython==1.85

Collecting biopython
  Downloading biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 28.1 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import torch
from transformers import BertModel, BertTokenizer
from Bio import SeqIO
import pandas as pd
import re

# ProtBERT checkpoint shared by the tokenizer and the encoder below.
MODEL_NAME = "Rostlab/prot_bert"

# Tokenizer is created with lower-casing disabled.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

# Load the encoder and switch it to evaluation mode in one step
# (Module.eval() returns the module itself).
model = BertModel.from_pretrained(MODEL_NAME).eval()

def extract_features(sequence):
    """Embed a single protein sequence with ProtBERT and mean-pool the result.

    The letters U, Z, O and B are replaced by X before tokenisation. The first
    element of the model output is averaged over dimension 1, squeezed, and
    returned as a NumPy array.

    NOTE(review): a later cell defines another ``extract_features`` with a
    different signature; whichever cell ran last wins.
    """
    cleaned = re.sub(r"[UZOB]", "X", sequence)
    encoded = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        hidden_states = model(**encoded)[0]
        pooled = hidden_states.mean(dim=1)

    return pooled.squeeze().numpy()

def process_fasta(input_fasta, output_excel):
    """Embed every sequence of a FASTA file and save the vectors to an Excel file.

    For each record the residues are joined with single spaces, passed to the
    one-argument ``extract_features(sequence)``, and stored as one row: the
    record id followed by the feature values.

    An input without any record no longer fails on an unbound variable: a
    message is printed and a sheet containing only the ``Sequence_ID`` header
    is written.

    NOTE: a later cell redefines ``extract_features`` with the signature
    ``(fasta_file, output_excel)``. Call this function before running that
    cell, otherwise the wrong function is invoked.

    Args:
        input_fasta: Source handed to ``SeqIO.parse(..., "fasta")``.
        output_excel: Destination handed to ``DataFrame.to_excel``.
    """
    rows = []
    n_features = 0  # stays 0 when the FASTA file holds no records

    for record in SeqIO.parse(input_fasta, "fasta"):
        # Space-separate the residues before tokenisation.
        spaced_sequence = " ".join(str(record.seq))
        vector = extract_features(spaced_sequence).tolist()
        n_features = len(vector)
        rows.append([record.id] + vector)

    if not rows:
        print(f"No sequences found in {input_fasta}; writing an empty table.")

    # Convert to DataFrame
    column_names = ["Sequence_ID"] + [f"Feature_{i}" for i in range(n_features)]
    df = pd.DataFrame(rows, columns=column_names)

    # Save to Excel
    df.to_excel(output_excel, index=False)
    print(f"Feature extraction completed. Results saved to {output_excel}")

# Example usage: embed the sequences of the Kaggle input FASTA and write the
# features to protbert_features.xlsx (relative path, i.e. the working directory).
process_fasta("/kaggle/input/hypo-2aaa/gene_protein_sequences (1).fasta", "protbert_features.xlsx")

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install biopython pandas openpyxl PyBioMed


In [None]:
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd
import itertools
from collections import Counter
from PyBioMed.PyProtein import CTD, AAComposition

# One-letter codes of the 20 standard amino acids.
AA_LIST = "ACDEFGHIKLMNPQRSTVWY"

# Every ordered pair of residues (400 entries), enumerated in AA_LIST order.
DIPEPTIDES = [first + second for first in AA_LIST for second in AA_LIST]

# Every ordered triplet of residues (8000 entries), enumerated in AA_LIST order.
TRIPEPTIDES = [
    first + second + third
    for first in AA_LIST
    for second in AA_LIST
    for third in AA_LIST
]

# Function to clean sequences by replacing ambiguous amino acids
def clean_sequence(sequence):
    """Normalise a protein sequence to the 20 standard residues.

    Ambiguous or rare codes are substituted (X→L, B→D, Z→E, U→C, O→K),
    standard residues from ``AA_LIST`` pass through unchanged, and every other
    character is dropped.
    """
    substitutions = {
        "X": "L",  # unknown residue -> leucine
        "B": "D",  # Asp or Asn -> aspartic acid
        "Z": "E",  # Glu or Gln -> glutamic acid
        "U": "C",  # selenocysteine -> cysteine
        "O": "K",  # pyrrolysine -> lysine
    }

    kept = []
    for residue in sequence:
        if residue in substitutions:
            kept.append(substitutions[residue])
        elif residue in AA_LIST:
            kept.append(residue)
        # anything else is silently discarded
    return "".join(kept)

# Function to compute Amino Acid Composition (AAC)
def compute_aac(sequence):
    """Return the amino acid composition for the 20 residues of ``AA_LIST``.

    Values come from ``ProteinAnalysis.amino_acids_percent``, rounded to five
    decimals; residues missing from that mapping are reported as 0.

    NOTE(review): the scale of ``amino_acids_percent`` (0-1 vs 0-100) depends
    on the Biopython release, whereas the DPC/TPC features are fractions.
    Confirm the intended scale.
    """
    composition = ProteinAnalysis(str(sequence)).amino_acids_percent

    result = {}
    for residue in AA_LIST:
        result[residue] = round(composition.get(residue, 0), 5)
    return result

# Function to compute Dipeptide Composition (DPC)
def compute_dpc(sequence):
    """Return the dipeptide composition of ``sequence``.

    Every overlapping 2-mer is counted. Only 2-mers listed in ``DIPEPTIDES``
    contribute to the total, and each frequency is that 2-mer's share of the
    total, rounded to five decimals. If no valid 2-mer exists, every value is 0.

    The result is keyed by all entries of ``DIPEPTIDES``, in that order.
    """
    # Count all 2-mers in one pass and look up the valid ones afterwards. This
    # avoids scanning the 400-entry DIPEPTIDES list once per sequence position.
    counts = Counter(sequence[i:i + 2] for i in range(len(sequence) - 1))
    total = sum(counts[dipeptide] for dipeptide in DIPEPTIDES)

    if total == 0:
        return {dipeptide: 0 for dipeptide in DIPEPTIDES}
    return {dipeptide: round(counts[dipeptide] / total, 5) for dipeptide in DIPEPTIDES}

# Function to compute Tripeptide Composition (TPC)
def compute_tpc(sequence):
    """Return the tripeptide composition of ``sequence``.

    Every overlapping 3-mer is counted. Only 3-mers listed in ``TRIPEPTIDES``
    contribute to the total, and each frequency is that 3-mer's share of the
    total, rounded to five decimals. If no valid 3-mer exists, every value is 0.

    The result is keyed by all entries of ``TRIPEPTIDES``, in that order.
    """
    # Count all 3-mers in one pass and look up the valid ones afterwards. This
    # avoids scanning the 8000-entry TRIPEPTIDES list once per sequence position.
    counts = Counter(sequence[i:i + 3] for i in range(len(sequence) - 2))
    total = sum(counts[tripeptide] for tripeptide in TRIPEPTIDES)

    if total == 0:
        return {tripeptide: 0 for tripeptide in TRIPEPTIDES}
    return {tripeptide: round(counts[tripeptide] / total, 5) for tripeptide in TRIPEPTIDES}

# Function to compute Physicochemical Properties
def compute_physicochemical(sequence):
    """Compute global physicochemical descriptors of one protein sequence.

    Returns a dict with the keys ``Molecular_Weight``, ``Isoelectric_Point``,
    ``Aromaticity``, ``Instability_Index``, ``Aliphatic_Index``,
    ``Flexibility`` and ``Hydrophobicity``.

    Notes on the derived values:
      * Residue fractions are computed from ``count_amino_acids()`` and the
        sequence length, so the scale does not depend on how a given Biopython
        release scales ``amino_acids_percent``.
      * ``Aliphatic_Index`` follows Ikai's definition:
        X(Ala) + 2.9 * X(Val) + 3.9 * (X(Ile) + X(Leu)), X in mole percent.
      * ``Hydrophobicity`` is the mean Kyte-Doolittle hydropathy (GRAVY). The
        scale is stored per residue; the earlier positional list paired
        Ile/Lys and Val/Trp with each other's values.
      * ``Flexibility`` is the sum of the per-window flexibility values, or 0
        when ``flexibility()`` returns nothing.
    """
    # Kyte-Doolittle hydropathy scale, keyed by residue to rule out ordering slips.
    kyte_doolittle = {
        "A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5,
        "Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5,
        "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6,
        "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2,
    }

    analyzed_seq = ProteinAnalysis(str(sequence))

    # Mole fractions (0-1) of the standard residues.
    counts = analyzed_seq.count_amino_acids()
    length = len(str(sequence))
    fraction = {
        aa: (counts.get(aa, 0) / length if length else 0.0)
        for aa in kyte_doolittle
    }

    aliphatic_index = 100 * (
        fraction["A"] + 2.9 * fraction["V"] + 3.9 * (fraction["I"] + fraction["L"])
    )
    gravy = sum(fraction[aa] * weight for aa, weight in kyte_doolittle.items())

    # Evaluate the sliding-window flexibility profile only once.
    flexibility_profile = analyzed_seq.flexibility()

    properties = {
        "Molecular_Weight": round(analyzed_seq.molecular_weight(), 2),
        "Isoelectric_Point": round(analyzed_seq.isoelectric_point(), 2),
        "Aromaticity": round(analyzed_seq.aromaticity(), 5),
        "Instability_Index": round(analyzed_seq.instability_index(), 2),
        "Aliphatic_Index": round(aliphatic_index, 2),
        "Flexibility": round(sum(flexibility_profile), 2) if flexibility_profile else 0,
        "Hydrophobicity": round(gravy, 2),
    }

    return properties

# Function to process the FASTA file and extract features
def extract_features(fasta_file, output_excel):
    """Build a composition/physicochemical feature table for a FASTA file.

    Each record is upper-cased and cleaned with ``clean_sequence``. Records
    whose cleaned sequence has fewer than three residues are reported and
    skipped. For the rest, the AAC, DPC, TPC and physicochemical features are
    merged into one row keyed by ``Sequence_ID``. The table is written to
    ``output_excel`` without the index.

    NOTE(review): this reuses the name of the ProtBERT ``extract_features(sequence)``
    defined in an earlier cell and replaces it once this cell has run.
    """
    parsed_records = list(SeqIO.parse(fasta_file, "fasta"))
    rows = []

    for entry in parsed_records:
        cleaned = clean_sequence(str(entry.seq).upper())

        if len(cleaned) >= 3:
            # Later groups override earlier ones on a key clash, matching
            # successive dict.update calls.
            rows.append({
                "Sequence_ID": entry.id,
                **compute_aac(cleaned),
                **compute_dpc(cleaned),
                **compute_tpc(cleaned),
                **compute_physicochemical(cleaned),
            })
        else:
            print(f"Skipping {entry.id} (sequence too short)")

    table = pd.DataFrame(rows)
    table.to_excel(output_excel, index=False)
    print(f"Feature extraction complete. Data saved to {output_excel}")

# Example usage: compute the feature table for the Kaggle input FASTA and write
# it to protein_features.xlsx (relative path, i.e. the working directory).
extract_features("/kaggle/input/hypo-stral/sequence (3).fasta", "protein_features.xlsx")


In [None]:
import pandas as pd
from Bio import Entrez, SeqIO

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install biopython

In [None]:

# Location of the gene table on the Kaggle input mount.
file_path = "/kaggle/input/jfgjfghhb/Human_HP_GenBank_gene_result.csv"

# Load the table; `pd` is imported in a cell above.
df = pd.read_csv(file_path)

# Preview: as the cell's last expression, this tuple (first rows, column index)
# is what the notebook displays.
df.head(), df.columns


In [None]:
from Bio import Entrez, SeqIO
import pandas as pd

# Contact address sent with Entrez requests (required for NCBI).
# NOTE(review): this is a personal address hardcoded in a shared notebook;
# consider reading it from an environment variable instead.
Entrez.email = "705607145v@gmail.com"

# Function to fetch protein sequence using GeneID
def fetch_protein_fasta(gene_id):
    """Fetch the FASTA text of the first protein linked to an NCBI Gene ID.

    Step 1 queries ``Entrez.elink`` (gene -> protein) and collects the ids from
    the first link group of every returned link set. Step 2 downloads the
    first of those ids with ``Entrez.efetch`` in FASTA text format.

    Args:
        gene_id: Gene identifier passed as ``id`` to ``Entrez.elink``.

    Returns:
        The FASTA text, or ``None`` when the gene has no linked protein or any
        error occurs. A message is printed in both cases, so one failing gene
        does not stop a batch download.
    """
    try:
        # Step 1: Find linked protein sequences
        handle = Entrez.elink(dbfrom="gene", db="protein", id=gene_id)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()  # release the connection even if parsing fails

        # Extract linked protein IDs. A gene without protein links has an empty
        # "LinkSetDb" list; indexing [0] unconditionally would raise IndexError
        # and skip the "no protein record" branch below.
        protein_ids = []
        for linkset in record:
            link_groups = linkset.get("LinkSetDb") or []
            if not link_groups:
                continue
            protein_ids.extend(link["Id"] for link in link_groups[0]["Link"])

        if not protein_ids:
            print(f"No protein record found for GeneID {gene_id}")
            return None

        protein_id = protein_ids[0]  # Take the first available protein

        # Step 2: Fetch sequence in FASTA format
        handle = Entrez.efetch(db="protein", id=protein_id, rettype="fasta", retmode="text")
        try:
            fasta_data = handle.read()
        finally:
            handle.close()

        return fasta_data
    except Exception as e:
        # Deliberate best effort: report the failure and let the caller go on.
        print(f"Error fetching protein sequence for GeneID {gene_id}: {e}")
        return None

# Load the GeneID column as strings. This is the same CSV path that an earlier
# cell reads into `df`.
# NOTE(review): astype(str) turns missing values into the text "nan", and a
# float-typed column would yield ids like "123.0"; verify the column dtype.
df = pd.read_csv("/kaggle/input/jfgjfghhb/Human_HP_GenBank_gene_result.csv")  # Modify as needed
gene_ids = df["GeneID"].astype(str).tolist()

# Fetch one protein FASTA entry per gene and append it to a single output file.
# fetch_protein_fasta returns None on failure, so those genes are skipped.
with open("/kaggle/working/gene_protein_sequences.fasta", "w") as fasta_file:
    for gene_id in gene_ids:
        fasta_seq = fetch_protein_fasta(gene_id)
        if fasta_seq:
            # Extra newline separates consecutive entries.
            fasta_file.write(fasta_seq + "\n")

print("Protein FASTA sequences saved to gene_protein_sequences.fasta")
