In [78]:
import pandas as pd 


In [79]:
# Build one table that maps every sequence id to its cross-validation fold and
# write it to folds.tsv.
# Each entry is (file name, fold label, set name); tuples are used because the
# list of fold files is fixed.
fold_files = [
    ("fold_aa", "aa", "train"),
    ("fold_ab", "ab", "train"),
    ("fold_ac", "ac", "train"),
    ("fold_ad", "ad", "train"),
    ("fold_ae", "ae", "train"),
    ("fold_bench_aa", "aa", "bench"),
    ("fold_bench_ab", "ab", "bench"),
    ("fold_bench_ac", "ac", "bench"),
    ("fold_bench_ad", "ad", "bench"),
    ("fold_bench_ae", "ae", "bench"),
]

# Every fold file is read as plain text, one sequence id per line.
rows = []
for file_name, num, setname in fold_files:
    # `with` closes the handle as soon as the file has been read, so no file
    # stays open for the lifetime of the kernel.
    with open(file_name, 'r') as f:
        for line in f:
            sequence_id = line.strip()
            # Skip empty lines (e.g. a blank line at the end of the file) so
            # they do not turn into rows with an empty id.
            if not sequence_id:
                continue
            rows.append([sequence_id, num, setname])

folds = pd.DataFrame(rows, columns=["id", "Fold", "Set"])

# Persist the mapping as a tab-separated file without the index column.
folds.to_csv("folds.tsv", sep="\t", index=False)

In [80]:
# Load the tab-separated information tables for the positive and the negative set.
positive, negative = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("positive_info.tsv", "neg_info.tsv")
)

In [85]:
# Tag every row with the dataset it came from before the two tables are stacked.
# The assignment happens in place, so `positive` and `negative` keep the new column.
positive["label"] = "positive"
negative["label"] = "negative"

# Stack both tables into one frame; ignore_index=True renumbers the rows from 0.
pos_neg = pd.concat((positive, negative), ignore_index=True)

# Attach the fold assignment to every sequence by matching "UniprotAccession"
# (pos_neg) against "id" (folds).
# how="left" keeps every row of pos_neg: a sequence that has no entry in folds
# is kept with missing values in "Fold" and "Set".
# After the join "id" only repeats "UniprotAccession", so it is dropped.
all_info = pd.merge(
    pos_neg,
    folds,
    how="left",
    left_on="UniprotAccession",
    right_on="id",
).drop(columns="id")

# Replace missing "TMHelixFirst90" values with the text placeholder "N/A"
# (presumably these are the rows from the positive table - TODO confirm).
# NOTE(review): pandas.read_csv parses the literal "N/A" as missing by default,
# so readers of all.tsv need keep_default_na=False to see the placeholder.
all_info = all_info.assign(
    TMHelixFirst90=all_info["TMHelixFirst90"].fillna("N/A")
)

# Write the combined table as a tab-separated file without the index column.
all_info.to_csv("all.tsv", sep="\t", index=False)