In [1]:
#amino acid composition 
AALetter=["A","R","N","D","C","E","Q","G","H","I","L","K","M","F","P","S","T","W","Y","V"]

def CalculateAAComposition(seq):
    """
    ########################################################################
    Calculate the percentage composition of the 20 amino acids listed in
    AALetter for a given protein sequence.

    Usage:

    result=CalculateAAComposition(protein)

    Input: seq is a protein sequence string in one-letter codes. Matching is
    case-sensitive against the upper-case letters in AALetter. Characters
    that are not in AALetter get no entry of their own but still count
    toward the sequence length used as the denominator.

    Output: result is a dict whose keys are the letters of AALetter (in that
    order) and whose values are the percentage of the sequence made up by
    that letter, rounded to 3 decimals. An empty sequence returns 0.0 for
    every letter instead of raising ZeroDivisionError.
    ########################################################################
    """
    LengthSequence=len(seq)

    # Guard the division below: an empty sequence has no composition, so
    # report 0.0 everywhere rather than crashing the caller's loop.
    if LengthSequence==0:
        return {aa: 0.0 for aa in AALetter}

    Result={}
    for i in AALetter:
        # count / length * 100 -> percentage, kept to 3 decimal places
        Result[i]=round(float(seq.count(i))/LengthSequence*100,3)
    return Result

In [2]:
import pandas as pd
# Load the input table from a semicolon-separated CSV file; the cells below
# read protein sequences from its 'Sequence' column.
data = pd.read_csv('seqUNi.csv', sep=';')
# Bare expression so the notebook displays the loaded frame as the cell output.
data

Unnamed: 0,Entry,Sequence
0,P09798,MKFCLYCCHCYIVICGKATHYYKSSKATSNLKSSNRVLMRNPMSPS...
1,P38717,MSVHGRDPKKRQLRLISVAFKEASIDSPSFRASVNFFQTRVDALED...
2,P07342,MIRQSTLKNFAIKRCFQHIAYRNTPAMRSVALAQRFYSSSSRYYSA...
3,P15790,MKCRVWSEARVYTNINKQRTEEYWDYENTVIDWSTNTKDYEIENKV...
4,P22035,MSNISTKDIRKSKPKRGSGFDLLEVTESLGYQTHRKNGRNSWSKDD...
...,...,...
4419,P41733,MDSTALKVALGCIAIRLAVNSLFPSLQQQLDQSVEFSTPVTSFRSL...
4420,P47013,MVDGLNTSNIRKRARTLSNPNDFQEPNYLLDPGNHPSDHFRTRMSK...
4421,P43636,MIEKDKRTIAFIHPDLGIGGAERLVVDAALGLQQQGHSVIIYTSHC...
4422,P38137,MTSAATVTASFNDTFSVSDNVAVIVPETDTQVTYRDLSHMVGHFQT...


In [4]:
# Print the composition dict of every sequence, one dict per line, in row order.
# map() is lazy, so each composition is computed right before it is printed.
for AAC in map(CalculateAAComposition, data['Sequence'].values):
    print(AAC)

{'A': 7.5, 'R': 3.69, 'N': 9.286, 'D': 6.548, 'C': 1.905, 'E': 5.238, 'Q': 3.571, 'G': 3.214, 'H': 2.024, 'I': 4.286, 'L': 11.071, 'K': 7.381, 'M': 2.619, 'F': 4.048, 'P': 3.214, 'S': 9.167, 'T': 5.952, 'W': 0.714, 'Y': 4.286, 'V': 4.286}
{'A': 4.15, 'R': 4.231, 'N': 4.475, 'D': 5.614, 'C': 1.465, 'E': 7.079, 'Q': 3.417, 'G': 2.766, 'H': 1.302, 'I': 7.16, 'L': 11.147, 'K': 9.927, 'M': 2.36, 'F': 6.021, 'P': 3.092, 'S': 10.171, 'T': 5.37, 'W': 1.221, 'Y': 4.312, 'V': 4.719}
{'A': 9.461, 'R': 4.367, 'N': 4.367, 'D': 4.658, 'C': 0.437, 'E': 5.677, 'Q': 4.658, 'G': 7.86, 'H': 2.329, 'I': 5.386, 'L': 7.569, 'K': 6.55, 'M': 3.057, 'F': 3.93, 'P': 6.405, 'S': 6.114, 'T': 6.259, 'W': 1.019, 'Y': 2.038, 'V': 7.86}
{'A': 2.688, 'R': 5.376, 'N': 5.108, 'D': 7.796, 'C': 0.538, 'E': 6.72, 'Q': 2.688, 'G': 3.495, 'H': 3.495, 'I': 8.065, 'L': 8.602, 'K': 10.215, 'M': 3.226, 'F': 5.376, 'P': 4.301, 'S': 3.763, 'T': 4.57, 'W': 1.882, 'Y': 5.914, 'V': 6.183}
{'A': 3.946, 'R': 4.686, 'N': 8.261, 'D': 6.1

In [5]:
# One composition dict per sequence, in the same order as the rows of `data`
aac_list = [CalculateAAComposition(seq) for seq in data['Sequence'].values]

# Each dict becomes a row; its keys (the letters of AALetter) become the columns
aac_df = pd.DataFrame(aac_list)

# Label the columns 'AAC_<letter>' and reuse the row labels of the input frame
aac_df.columns = ['AAC_' + residue for residue in AALetter]
aac_df.index = data.index

# Show the resulting feature table as plain text
print(aac_df)

      AAC_A  AAC_R  AAC_N  AAC_D  AAC_C  AAC_E  AAC_Q  AAC_G  AAC_H   AAC_I  \
0     7.500  3.690  9.286  6.548  1.905  5.238  3.571  3.214  2.024   4.286   
1     4.150  4.231  4.475  5.614  1.465  7.079  3.417  2.766  1.302   7.160   
2     9.461  4.367  4.367  4.658  0.437  5.677  4.658  7.860  2.329   5.386   
3     2.688  5.376  5.108  7.796  0.538  6.720  2.688  3.495  3.495   8.065   
4     3.946  4.686  8.261  6.165  0.370  5.672  3.822  6.289  3.206   4.932   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...     ...   
4419  5.838  2.538  4.061  2.030  1.015  1.777  3.046  4.569  1.523  11.168   
4420  3.423  5.623  3.912  3.423  3.178  3.667  1.467  6.601  2.445   5.623   
4421  5.964  2.982  4.970  4.374  1.193  7.356  3.777  4.374  1.988   8.549   
4422  6.446  4.052  6.446  4.420  0.921  4.604  2.394  5.893  2.210   5.341   
4423  3.018  4.225  9.054  5.433  1.408  5.634  2.414  3.823  1.006   8.853   

       AAC_L   AAC_K  AAC_M  AAC_F  AAC_P   AAC_S  

In [6]:
# Save the composition table as CSV. No `index` argument is given, so the row
# index is written too, as an extra leading column without a header name.
aac_df.to_csv('aa_compositionSC.csv')
# Read the saved file back; the stored index shows up as the 'Unnamed: 0'
# column, which the following cell removes.
dfaac = pd.read_csv('aa_compositionSC.csv', sep=',')
# Bare expression so the notebook displays the reloaded frame.
dfaac

Unnamed: 0.1,Unnamed: 0,AAC_A,AAC_R,AAC_N,AAC_D,AAC_C,AAC_E,AAC_Q,AAC_G,AAC_H,...,AAC_L,AAC_K,AAC_M,AAC_F,AAC_P,AAC_S,AAC_T,AAC_W,AAC_Y,AAC_V
0,0,7.500,3.690,9.286,6.548,1.905,5.238,3.571,3.214,2.024,...,11.071,7.381,2.619,4.048,3.214,9.167,5.952,0.714,4.286,4.286
1,1,4.150,4.231,4.475,5.614,1.465,7.079,3.417,2.766,1.302,...,11.147,9.927,2.360,6.021,3.092,10.171,5.370,1.221,4.312,4.719
2,2,9.461,4.367,4.367,4.658,0.437,5.677,4.658,7.860,2.329,...,7.569,6.550,3.057,3.930,6.405,6.114,6.259,1.019,2.038,7.860
3,3,2.688,5.376,5.108,7.796,0.538,6.720,2.688,3.495,3.495,...,8.602,10.215,3.226,5.376,4.301,3.763,4.570,1.882,5.914,6.183
4,4,3.946,4.686,8.261,6.165,0.370,5.672,3.822,6.289,3.206,...,8.015,7.398,1.973,1.973,6.412,13.194,7.275,1.233,1.726,3.453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,4419,5.838,2.538,4.061,2.030,1.015,1.777,3.046,4.569,1.523,...,16.751,3.046,1.269,9.391,4.822,9.898,3.807,2.030,5.076,6.345
4420,4420,3.423,5.623,3.912,3.423,3.178,3.667,1.467,6.601,2.445,...,12.225,4.890,3.178,7.090,5.379,6.357,5.134,2.445,5.868,8.068
4421,4421,5.964,2.982,4.970,4.374,1.193,7.356,3.777,4.374,1.988,...,10.537,7.356,1.590,7.356,3.777,6.362,4.573,1.392,4.970,6.561
4422,4422,6.446,4.052,6.446,4.420,0.921,4.604,2.394,5.893,2.210,...,7.919,7.366,2.578,6.446,6.446,6.630,7.551,0.368,3.315,8.656


In [7]:
# Remove the 'Unnamed: 0' column (the row index that was saved into the CSV),
# leaving only the AAC_* feature columns. `dfaac` itself is not modified.
dfaac2 = dfaac.drop(columns=['Unnamed: 0'])

# Write the cleaned feature table to its own file, without the row index
dfaac2.to_csv('AAC.csv', index=False)
dfaac2

Unnamed: 0,AAC_A,AAC_R,AAC_N,AAC_D,AAC_C,AAC_E,AAC_Q,AAC_G,AAC_H,AAC_I,AAC_L,AAC_K,AAC_M,AAC_F,AAC_P,AAC_S,AAC_T,AAC_W,AAC_Y,AAC_V
0,7.500,3.690,9.286,6.548,1.905,5.238,3.571,3.214,2.024,4.286,11.071,7.381,2.619,4.048,3.214,9.167,5.952,0.714,4.286,4.286
1,4.150,4.231,4.475,5.614,1.465,7.079,3.417,2.766,1.302,7.160,11.147,9.927,2.360,6.021,3.092,10.171,5.370,1.221,4.312,4.719
2,9.461,4.367,4.367,4.658,0.437,5.677,4.658,7.860,2.329,5.386,7.569,6.550,3.057,3.930,6.405,6.114,6.259,1.019,2.038,7.860
3,2.688,5.376,5.108,7.796,0.538,6.720,2.688,3.495,3.495,8.065,8.602,10.215,3.226,5.376,4.301,3.763,4.570,1.882,5.914,6.183
4,3.946,4.686,8.261,6.165,0.370,5.672,3.822,6.289,3.206,4.932,8.015,7.398,1.973,1.973,6.412,13.194,7.275,1.233,1.726,3.453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,5.838,2.538,4.061,2.030,1.015,1.777,3.046,4.569,1.523,11.168,16.751,3.046,1.269,9.391,4.822,9.898,3.807,2.030,5.076,6.345
4420,3.423,5.623,3.912,3.423,3.178,3.667,1.467,6.601,2.445,5.623,12.225,4.890,3.178,7.090,5.379,6.357,5.134,2.445,5.868,8.068
4421,5.964,2.982,4.970,4.374,1.193,7.356,3.777,4.374,1.988,8.549,10.537,7.356,1.590,7.356,3.777,6.362,4.573,1.392,4.970,6.561
4422,6.446,4.052,6.446,4.420,0.921,4.604,2.394,5.893,2.210,5.341,7.919,7.366,2.578,6.446,6.446,6.630,7.551,0.368,3.315,8.656
