In [184]:
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Reading the files

#### USPs

In [185]:
# Reading the usp text file as a csv file with the delimiter \t(tab) 
usp_data = pd.read_csv('data/usp.txt', delimiter='\t', header=None)
# Renaming the column and adding a new column as a label
usp_data.rename(columns={0:'sequence'}, inplace=True)
usp_data['label'] = 'usp'
usp_data.head()

Unnamed: 0,sequence,label
0,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp
1,MTIVVGYLAGKVGPSALHLAVRVARMHKTSLTVATIVRRHWPTPSL...,usp
2,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp
3,MSKPRKQHGVVVGVDGSLESDAAACWGATDAAMRNIPLTVVHVVNA...,usp
4,MSSGNSSLGIIVGIDDSPAAQVAVRWAARDAELRKIPLTLVHAVSP...,usp


#### Non-USPs

In [186]:
# Reading the usp text file as a csv file with the delimiter \t(tab) 
non_usp_data = pd.read_csv('data/non_usp.txt', delimiter='\t', header=None)
# Renaming the column and adding a new column as a label
non_usp_data.rename(columns={0:'sequence'}, inplace=True)
non_usp_data['label'] = 'non_usp'
non_usp_data.head()

Unnamed: 0,sequence,label
0,MSTTEFPTTTKRLMGWGRTAPTVASVLSTSDPEVIVRAVTRAAEEG...,non_usp
1,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,non_usp
2,MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF...,non_usp
3,MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,non_usp
4,MPELPEVETSRRGIEPHLVGATILHAVVRNGRLRWPVSEEIYRLSD...,non_usp


### Merging the two data together vertically

In [187]:
df = pd.concat([usp_data, non_usp_data], ignore_index=True)
df.sample(5)

Unnamed: 0,sequence,label
401,MPPAKKGPATSARKGQKXRRREKKNVPHGAAHIKSTFNNTIVTITD...,non_usp
633,DPDAMMRPSSSRDTAFFWDGVKAHELRIQRLADGSLRHPPVPAVWQ...,non_usp
69,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...,non_usp
108,MPELPEVEVVRRGLAEHVTGKTITGVRVHHPRAVRRHEAGPADLTA...,non_usp
208,MNTLDFVDQASLRDDVPNFGPGDTVNVHVKVIEGSKERIQVFKGVV...,non_usp


In [188]:
df['sequence'][0]

'MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPPVITAPEGWAFEYSRFQEAQKREIVEHSYLVAQAHQIVEQAHKVALEASSSGRAAQITGEVLHGQIVPTLTNISRQVAMVVLGYRGQGAVAGALLGSVSSSLVRHAHGPVAVIPEEPRPARPPHAPVVVGIDGSPTSGLAAEIAFDEASRRGVDLVALHAWSDMGPLDFPRLNWAPIEWRNLEDEQEKMLARRLSGWQDRYPDVVVHKVVVCDRPAPRLLELAQTAQLVVVGSHGRGGFPGMHLGSVSRAVVNSGQAPVIVARIPQDPAVPA'

#### Checking the uniqueness of the sequence

In [189]:
alpha = np.unique(list(df['sequence'][0]))
alpha

array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
       'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype='<U1')

In [190]:
df['sequence'][0].count('A')/len(df['sequence'][0])

0.12618296529968454

#### Creating a column for each of the alphabets

In [191]:
for letter in alpha:
    df[letter] = 0

In [192]:
df.head()

Unnamed: 0,sequence,label,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
0,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,MTIVVGYLAGKVGPSALHLAVRVARMHKTSLTVATIVRRHWPTPSL...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,MSKPRKQHGVVVGVDGSLESDAAACWGATDAAMRNIPLTVVHVVNA...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,MSSGNSSLGIIVGIDDSPAAQVAVRWAARDAELRKIPLTLVHAVSP...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Creating a column each for the di- alphabets
    using product from itertools
    and merging with the mono alphabets

In [193]:
# This was hardcoded to remove unnecesary characters
# that might make it hard to access each column
combos = product(alpha, repeat=2)
for combo in combos:
    df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0

  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replace(' ', '').replace(')', '').replace('(', '')] = 0
  df[str(combo).replace(',', '').replace("'",'').replac

In [194]:
df

Unnamed: 0,sequence,label,A,C,D,E,F,G,H,I,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
0,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,MTIVVGYLAGKVGPSALHLAVRVARMHKTSLTVATIVRRHWPTPSL...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,MSKPRKQHGVVVGVDGSLESDAAACWGATDAAMRNIPLTVVHVVNA...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,MSSGNSSLGIIVGIDDSPAAQVAVRWAARDAELRKIPLTLVHAVSP...,usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654,MTRAGDDAVNLTLVTGAPANGGSCVAHHEGRVVFVRYALPGERVRA...,non_usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655,MGAQGYLRRLTRRLTEDLEQRDVEELSDEVLNAGAQRAIDCQRGQE...,non_usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
656,MFESLSDRLTAALQGLRGKGRLTDADIDATTREIRLALLEADVSLP...,non_usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
657,MTEALIPAPSQISLTRDEVRRYSRHLIIPDIGVNGQQRLKDARVLC...,non_usp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [195]:
non_alpha = ['sequence', 'label']
columns = df.columns
columns = [column for column in columns if column not in non_alpha]
len(columns)

420

### Calculating the values to be inserted in the mono alphabets and di alphabets
    creating a function for calculating the value

In [197]:
for column in columns:
    df[column] = df['sequence'].apply(lambda x: x.count(column)/len(x))


In [202]:
df.head()

Unnamed: 0,sequence,label,A,C,D,E,F,G,H,I,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
0,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp,0.126183,0.006309,0.037855,0.056782,0.015773,0.082019,0.037855,0.041009,...,0.0,0.0,0.003155,0.0,0.003155,0.003155,0.0,0.0,0.0,0.0
1,MTIVVGYLAGKVGPSALHLAVRVARMHKTSLTVATIVRRHWPTPSL...,usp,0.116438,0.010274,0.037671,0.054795,0.013699,0.078767,0.030822,0.034247,...,0.0,0.0,0.003425,0.0,0.003425,0.006849,0.003425,0.0,0.0,0.0
2,MSAQQTNLGIVVGVDGSPCSHTAVEWAARDAQMRNVALRVVQVVPP...,usp,0.129338,0.006309,0.037855,0.056782,0.015773,0.082019,0.037855,0.041009,...,0.0,0.0,0.003155,0.0,0.003155,0.003155,0.0,0.0,0.0,0.0
3,MSKPRKQHGVVVGVDGSLESDAAACWGATDAAMRNIPLTVVHVVNA...,usp,0.125424,0.010169,0.050847,0.061017,0.010169,0.081356,0.023729,0.027119,...,0.0,0.0,0.00678,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MSSGNSSLGIIVGIDDSPAAQVAVRWAARDAELRKIPLTLVHAVSP...,usp,0.117845,0.006734,0.057239,0.053872,0.003367,0.077441,0.030303,0.040404,...,0.0,0.0,0.003367,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Doing some analyses
    - correlation of the variables
    - relationship of the variables with the outcome/label