In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from Bio import pairwise2
import os
import AnOxPePred_funcs as AOf
from importlib import reload

## Load in data

In [2]:
# Folder names used throughout the notebook: CSV inputs and intermediate tables
# live under `data_path`; summary tables are written under `result_path`.
data_path, result_path = 'Data', 'Result'

# Read the AO database; the first CSV column becomes the DataFrame index.
AO_db = os.path.join(data_path, '01_AO_db.csv')
AO_df = pd.read_csv(AO_db, index_col=0)

In [3]:
# Keep only entries whose Sequence has at most 30 characters, then renumber
# the rows from 0 so later positional lookups are contiguous.
within_length_limit = AO_df.Sequence.apply(len) <= 30
AO_df = AO_df[within_length_limit].reset_index(drop=True)

In [4]:
# Summarize the length-filtered database under the row label 'AOdb'; the
# displayed table reports counts per class (FRS, CHEL, FRS/CHEL, NON-AO) and TOTAL.
AOf.visualize_data(AO_df, 'AOdb')

Unnamed: 0,FRS,CHEL,FRS/CHEL,NON-AO,TOTAL
AOdb,615,11,70,218,914


## Pre-process data

In [5]:
# Homology-reduce the database with a threshold of 0.9.
# NOTE(review): presumably 0.9 is a sequence-identity cutoff (the result is
# labelled 'AOdb <90%' in the next cell) — confirm in AnOxPePred_funcs.
AO_2_df = AOf.homology_reduction(AO_df, 0.9)

In [6]:
# Persist the homology-reduced table (re-read in the partitioning section),
# then show its class summary as the cell output.
processed_csv = os.path.join(data_path, '02_process_AO_db.csv')
AO_2_df.to_csv(processed_csv)
AOf.visualize_data(AO_2_df, 'AOdb <90%')

Unnamed: 0,FRS,CHEL,FRS/CHEL,NON-AO,TOTAL
AOdb <90%,606,11,70,217,904


## Retrieve random negatives

In [7]:
random_pep_file = os.path.join(data_path,'randompeptide30000.fsa')
# Generate candidate peptides from the random-peptide file.
# NOTE(review): the meaning of the trailing 200 (and of 100 in reduce_df below)
# is defined in AnOxPePred_funcs — confirm there before changing them.
r_df = AOf.pep_generator(AO_2_df, random_pep_file, 200)
# Remove duplicates and peptides identical with known ones
r_df = AOf.reduce_df(r_df, AO_2_df, 100)
# Draw random negatives so that their lengths follow the length distribution
# of AO_2_df: each candidate is weighted by how many AO_2_df sequences share
# its length.
keys, values = np.unique(AO_2_df.Sequence.apply(len).values, return_counts=True)
my_dict = dict(zip(keys, values))
# Use .get(..., 0) so a candidate whose length never occurs in AO_2_df gets
# weight 0 (it can never be drawn) instead of aborting the cell with KeyError.
r_df['distribution'] = r_df.Sequence.apply(lambda x: my_dict.get(len(x), 0))
# Fixed random_state keeps the 500 sampled negatives reproducible; the helper
# weight column is dropped again before the frame is used downstream.
r_df = r_df.sample(n=500, weights='distribution', random_state=10).drop(['distribution'], axis=1).reset_index(drop=True)

In [8]:
# Build one summary row per dataset stage and stack them into a single table.
stages = (
    (AO_df, 'AOdb'),
    (AO_2_df, 'AOdb <90%'),
    (r_df, 'Random'),
)
t1, t2, t3 = [AOf.visualize_data(frame, label) for frame, label in stages]
pd.concat([t1, t2, t3])

Unnamed: 0,FRS,CHEL,FRS/CHEL,NON-AO,TOTAL
AOdb,615,11,70,218,914
AOdb <90%,606,11,70,217,904
Random,0,0,0,500,500


In [9]:
# Persist the sampled random negatives; this file is read back in the
# partitioning section together with '02_process_AO_db.csv'.
r_df.to_csv(os.path.join(data_path, '02_random_AO_db.csv'))

## Partition data into folds

In [10]:
# Re-import the helper module so that edits made to it since the first import
# are picked up before partitioning.
reload(AOf)

# Rebuild the working table from the files saved above: processed database
# rows first, random negatives after, with a fresh 0..n-1 index.
t_df = pd.read_csv(os.path.join(data_path, '02_process_AO_db.csv'), index_col=0)
r_df = pd.read_csv(os.path.join(data_path, '02_random_AO_db.csv'), index_col=0)
data_df = pd.concat([t_df, r_df], sort=False).reset_index(drop=True)


def _partition_at(identity):
    """Split all sequences of data_df into 5 parts at the given identity threshold."""
    return AOf.homology_partition(data_df['Sequence'], ident=identity, parts=5)


# Each call returns the partitions plus a second value that is stored as
# gr<NN>_nr (reported as 'Clusters' in the Data_info table further down).
hom_part60, gr60_nr = _partition_at(0.6)
hom_part70, gr70_nr = _partition_at(0.7)
hom_part80, gr80_nr = _partition_at(0.8)
hom_part90, gr90_nr = _partition_at(0.9)

In [11]:
# One overview row per identity threshold; the labels are the thresholds in percent.
partitions_by_identity = [hom_part60, hom_part70, hom_part80, hom_part90]
identity_labels = [60, 70, 80, 90]
AO_overview = AOf.hc_part_visualizer(data_df, partitions_by_identity, identity_labels)
AO_overview

Unnamed: 0,Sum_P1,Sum_P2,Sum_P3,Sum_P4,Sum_P5,chel_P1,chel_P2,chel_P3,chel_P4,chel_P5,frs_P1,frs_P2,frs_P3,frs_P4,frs_P5
60,701,179,177,174,173,45,6,10,9,11,367,82,78,75,74
70,281,281,281,281,280,24,14,16,16,11,149,141,126,129,131
80,281,281,281,281,280,19,16,14,16,16,139,138,134,132,133
90,281,281,281,281,280,16,17,16,16,16,136,135,135,135,135


In [12]:
# Positional columns 10..14 of AO_overview are frs_P1..frs_P5 (see the table
# displayed by the previous cell); AOf.gini is applied once per threshold row
# to those five per-partition counts.
frs_counts = AO_overview.iloc[:, 10:15]
gini_per_threshold = frs_counts.T.apply(AOf.gini).values
cluster_counts = [gr60_nr, gr70_nr, gr80_nr, gr90_nr]

Data_info = pd.DataFrame(
    [cluster_counts, gini_per_threshold],
    index=['Clusters', 'Gini'],
    columns=['AO_p60', 'AO_p70', 'AO_p80', 'AO_p90'],
)

# Save the summary next to the other results and show it as the cell output.
Data_info.to_csv(os.path.join(result_path, '03_Data_Info.csv'))
Data_info

Unnamed: 0,AO_p60,AO_p70,AO_p80,AO_p90
Clusters,621.0,1271.0,1351.0,1404.0
Gini,0.350888,0.03432,0.011243,0.001183


In [13]:
# For every identity threshold, write a copy of data_df with an extra
# 'partition' column: the 0-based position of the first partition that
# contains the row's index label. The `[0]` raises IndexError if a row is in
# no partition. Thresholds are processed in ascending order, so `full_df`
# ends up holding the 90% version, as before.
partition_outputs = (
    ('03_p60_AO_db.csv', hom_part60),
    ('03_p70_AO_db.csv', hom_part70),
    ('03_p80_AO_db.csv', hom_part80),
    ('03_p90_AO_db.csv', hom_part90),
)
for out_name, folds in partition_outputs:
    full_df = data_df.copy()
    full_df['partition'] = full_df.apply(
        lambda row: [fold_no for fold_no, members in enumerate(folds) if row.name in members][0],
        axis=1,
    )
    full_df.to_csv(os.path.join(data_path, out_name))