In [None]:
import grand_scripts
import pandas as pd
from sequence_pairs import SampleList

# *Arabidopsis thaliana*

This section loads pairs of interacting proteins from a .csv file, along with the results of CD-HIT clustering on the coding regions that encode those proteins. It runs GRAND to eliminate redundancy in the data, then generates an equal number of non-interacting ("negative") pairs. Finally, it creates Chaos Game Representations for each sequence and saves the data.

In [None]:
# --- Configuration for the Arabidopsis thaliana dataset ---------------------
# Single source of truth for the dataset directory: every input file is read
# from here and the final output is written back here, so the paths below
# cannot drift apart.
data_dir_at = 'data/0001_build_grand_databases/Arabidopsis_thaliana'
# Column names in the pairs table that hold the two protein identifiers.
id_col_at_1 = 'Protein_1'
id_col_at_2 = 'Protein_2'
# k-mer sizes for the chaos game representations: range(3, 5) -> 3 and 4.
kmer_sizes_at = range(3, 5)

# --- Load inputs -------------------------------------------------------------
# Clustering results (.clstr file) for all sequences.
cluster_dict_at = grand_scripts.read_cluster_file(data_dir_at + '/all_sequences_clustered.clstr')
# Positive (interacting) protein pairs.
pairs_df_at = pd.read_csv(data_dir_at + '/all_positive_pairs.csv')
# Sequences from the FASTA file.
sequence_mapping = grand_scripts.load_sequences(data_dir_at + '/all_sequences.fasta')

# --- Build the positive / negative pair sets --------------------------------
# Run GRAND on the positive (PPI) pairs using the clustering results.
positive_df_at = grand_scripts.run_grand(cluster_dict_at, pairs_df_at, id_col_at_1, id_col_at_2)
# Generate an equal number of negative (non-PPI) pairs.
# NOTE(review): if get_negative_pairs samples randomly, seed its RNG so that
# Restart & Run All reproduces the same negatives -- confirm in grand_scripts.
negative_df_at = grand_scripts.get_negative_pairs(cluster_dict_at, positive_df_at, pairs_df_at, id_col_at_1, id_col_at_2)

# --- Assemble and save -------------------------------------------------------
# Combine both pair sets and the sequences into a "sample list" object for
# processing PPI data.
sample_list_at = SampleList.make_from_dataframes(positive_df_at, negative_df_at, sequence_mapping, id_col_at_1, id_col_at_2)
# Make chaos game representations (3-mers and 4-mers) and save them under the
# dataset directory with the 'AtGrand' label.
sample_list_at.split_and_save(kmer_sizes_at, data_dir_at, 'AtGrand')