### Protein Feature Extraction Script

**This script extracts protein features from a DataFrame of protein sequences using the Biopython and propy libraries. For each sequence it computes the molecular weight, instability index, isoelectric point, GRAVY, secondary structure fractions (helix, turn, sheet), molar extinction coefficients, amino acid composition (propy's `GetAAComp`), and composition, transition, and distribution (CTD) descriptors. The extracted features are then combined into a new DataFrame (`features_df`), which includes the `id` column from the original DataFrame for easy reference. The DataFrame is saved to a CSV file called `features_df.csv`.**

In [50]:
# Load the protein table (the cells below use its 'id' and 'sequence' columns)
# from the CSV in the current working directory, then display it.
uniprot_df = pd.read_csv(filepath_or_buffer="uniprot_df.csv")
uniprot_df

Unnamed: 0,id,sequence
0,A0A068FVC1,MNIIKTAIPDVHIFEPKVFFDERGFFFESFNHKLFEEAVGYSVNFV...
1,A0A068FZD0,MTTQSSKSRVFVAGHRGMVGSAICRQLAQRTDIELVVRSRSELDLT...
2,A0A068FZK6,METSGLVAFVGTALAIACLRPLSAKLQLVDLPNQRKQHVGAIPLIG...
3,A0A075P9Z7,MNLTELKQKPITDLLQLAEEMGIENMARSRKQDVIFSLLKKHAKSG...
4,A0A075PBX8,MQISVNEFLTPRHIDVQVVSPTRAKITLEPLERGFGHTLGNALRRI...
...,...,...
29771,Q7BJX9,MDIYMSRYEEITQQLIFSPKTWLITGVAGFIGSNLLEKLLKLNQVV...
29772,Q8UVZ1,MEQANLYEVAPRPLMTSLVQNQQNPYIYKDTAGDLSEICENENSID...
29773,Q9L5A4,MKQTSLALAITALLSTLPSALVQANEGCAPLTGKESGMDIGRSSTE...
29774,R1GTS7,MFARLEGRPVLLVGGGEVALRKARLLLAAGARLTLVSPVLASEFDE...


In [51]:
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from propy import PyPro

def compute_biopython_features(sequence):
    """Compute Biopython ProtParam features for one protein sequence.

    Args:
        sequence: Protein sequence string passed to ``ProteinAnalysis``.

    Returns:
        A ``pd.Series`` with nine entries: ``Molecular_Weight``,
        ``Instability_Index``, ``Isoelectric_Point``, ``GRAVY``, the three
        ``Secondary_Structure_Fraction_*`` values (helix, turn, sheet) and
        the two ``Extinction_Coefficient_*`` values.
    """
    analysis = ProteinAnalysis(sequence)

    # Scalar descriptors, evaluated in the same order as the keys appear.
    features = {
        'Molecular_Weight': analysis.molecular_weight(),
        'Instability_Index': analysis.instability_index(),
        'Isoelectric_Point': analysis.isoelectric_point(),
        'GRAVY': analysis.gravy(),
    }

    # secondary_structure_fraction() yields a 3-tuple; its elements are
    # stored, in order, as the helix, turn and sheet fractions.
    helix, turn, sheet = analysis.secondary_structure_fraction()
    features['Secondary_Structure_Fraction_Helix'] = helix
    features['Secondary_Structure_Fraction_Turn'] = turn
    features['Secondary_Structure_Fraction_Sheet'] = sheet

    # molar_extinction_coefficient() yields a 2-tuple.
    # NOTE(review): per the Biopython docs these are the values with reduced
    # cysteines and with disulfide bridges, respectively - confirm.
    ext_first, ext_second = analysis.molar_extinction_coefficient()
    features['Extinction_Coefficient_1'] = ext_first
    features['Extinction_Coefficient_2'] = ext_second

    return pd.Series(features)


def compute_propy_features(sequence):
    """Compute propy descriptors for one protein sequence.

    Args:
        sequence: Protein sequence string passed to ``PyPro.GetProDes``.

    Returns:
        A single dict holding the entries of ``GetAAComp()`` followed by the
        entries of ``GetCTD()``. If both results shared a key, the CTD value
        would win.
    """
    descriptor = PyPro.GetProDes(sequence)

    # Amino acid composition first.
    # NOTE(review): GetAAComp is propy's plain (per-residue) amino acid
    # composition, not a grouped composition - confirm against propy docs.
    features = dict(descriptor.GetAAComp())

    # Then the composition / transition / distribution (CTD) descriptors.
    features.update(descriptor.GetCTD())

    return features


# Compute features for each sequence.
# compute_biopython_features returns a pd.Series per sequence, so Series.apply
# already produces a DataFrame with one column per feature; a further
# `.apply(pd.Series)` pass over that DataFrame is redundant.
biopython_features_df = uniprot_df['sequence'].apply(compute_biopython_features)

# compute_propy_features returns a plain dict per sequence. Building the
# DataFrame once from the list of dicts avoids constructing one pd.Series per
# row (which is what `.apply(pd.Series)` does and is slow on ~30k rows).
# The original index is reused so rows stay aligned with uniprot_df below.
# NOTE: column dtypes are inferred per column by the DataFrame constructor.
propy_features_df = pd.DataFrame(
    uniprot_df['sequence'].map(compute_propy_features).tolist(),
    index=uniprot_df.index,
)

# Combine the computed features side by side into a single DataFrame
features_df = pd.concat([biopython_features_df, propy_features_df], axis=1)

# Add the "id" column from the original DataFrame at the beginning
# (values are aligned on the shared index)
features_df.insert(0, 'id', uniprot_df['id'])


# Display the resulting feature table
features_df


Unnamed: 0,id,Molecular_Weight,Instability_Index,Isoelectric_Point,GRAVY,Secondary_Structure_Fraction_Helix,Secondary_Structure_Fraction_Turn,Secondary_Structure_Fraction_Sheet,Extinction_Coefficient_1,Extinction_Coefficient_2,...,_HydrophobicityD2001,_HydrophobicityD2025,_HydrophobicityD2050,_HydrophobicityD2075,_HydrophobicityD2100,_HydrophobicityD3001,_HydrophobicityD3025,_HydrophobicityD3050,_HydrophobicityD3075,_HydrophobicityD3100
0,A0A068FVC1,20646.9604,36.467654,6.705830,-0.427374,0.346369,0.240223,0.173184,32430.0,32430.0,...,3.352,29.609,51.955,74.860,100.000,0.559,16.760,43.017,65.363,98.883
1,A0A068FZD0,36185.9306,27.469538,6.064857,-0.147077,0.298462,0.209231,0.280000,34380.0,34755.0,...,0.615,24.308,47.077,72.615,100.000,0.308,25.846,50.462,75.077,99.385
2,A0A068FZK6,39049.7153,41.351811,8.919346,0.847911,0.417827,0.208914,0.325905,37930.0,38430.0,...,0.836,22.841,48.189,74.652,99.164,0.279,25.070,47.911,71.031,97.493
3,A0A075P9Z7,46953.7505,47.560143,7.054878,-0.295704,0.298329,0.198091,0.298329,12950.0,12950.0,...,0.955,23.628,46.778,70.883,99.045,0.239,26.492,49.642,75.418,99.284
4,A0A075PBX8,36645.2664,46.763964,4.906251,-0.202703,0.315315,0.207207,0.303303,12950.0,13075.0,...,1.201,21.021,45.345,71.171,100.000,0.300,21.321,43.544,74.474,97.598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29771,Q7BJX9,38948.8632,35.989565,6.713674,-0.203478,0.347826,0.220290,0.220290,51800.0,51925.0,...,1.159,30.435,48.406,68.406,100.000,0.290,19.420,45.797,72.754,99.420
29772,Q8UVZ1,32647.1731,49.491667,6.809448,-0.938542,0.211806,0.260417,0.246528,17880.0,18130.0,...,1.389,28.819,49.653,64.236,100.000,0.347,19.444,46.528,81.597,99.653
29773,Q9L5A4,66634.5645,32.068606,5.699496,-0.329647,0.262821,0.307692,0.246795,74370.0,74620.0,...,0.641,22.756,49.359,71.474,100.000,0.160,24.679,52.083,78.526,98.878
29774,R1GTS7,51789.6640,40.356485,6.169952,-0.064644,0.313808,0.219665,0.326360,34950.0,35075.0,...,0.628,25.105,54.393,74.477,100.000,0.209,21.757,44.770,72.594,99.791


In [58]:
# Persist the feature table as CSV; index=False leaves the row index out of
# the file. NOTE: the destination is a machine-specific absolute path.
features_df.to_csv(
    path_or_buf="/Users/zaidur/Documents/Sequence_Project/aeromonasBact/features_df.csv",
    index=False,
)