In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"]="expandable_segments:True"
import argparse
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import random
# from create_embeddings import create_embeddings
import numpy as np
from sklearn.model_selection import train_test_split
from Create_simple_train_tsv import get_gene_fam_per_gene, create_similar_genes, create_train_test_val
import math

In [2]:
def create_segment_5(df: pd.DataFrame, flip=True):
    """Build synthetic segments by concatenating the sequences of five rows.

    The rows of ``df`` are shuffled (fixed seed 42) and cut into five equally
    sized, non-overlapping chunks. Rows left over after the division by five
    are not used. Row ``i`` of the result combines row ``i`` of every chunk.

    Args:
        df (pd.DataFrame): frame with at least the columns ``seq_x`` and
            ``seq_y``. It is not modified.
        flip (bool): accepted but currently unused.

    Returns:
        pd.DataFrame: one row per segment. The columns of the first chunk keep
        their names; the columns of the other four chunks carry the suffixes
        ``0`` .. ``3``. Every chunk also contributes its former index as a
        column (via ``reset_index``). ``seq_x`` / ``seq_y`` hold the
        concatenation of the five chunks' sequences.
    """
    n_chunks = 5

    # Cast on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()
    df["seq_x"] = df.seq_x.astype("string")
    df["seq_y"] = df.seq_y.astype("string")

    shuffled = df.sample(frac=1, random_state=42)
    chunk_size = math.floor(shuffled.shape[0] / n_chunks)

    # Five disjoint, equally long slices of the shuffled frame, each
    # re-indexed 0..chunk_size-1 so they line up row by row below.
    chunks = [
        shuffled.iloc[i * chunk_size:(i + 1) * chunk_size].reset_index()
        for i in range(n_chunks)
    ]

    # The first chunk stays unsuffixed, the remaining four get suffix 0..3.
    # (Previously only four chunks were built and the first one was used both
    # as the base and as suffix 0, so every segment contained it twice.)
    base, others = chunks[0], chunks[1:]
    merged = pd.concat(
        [base] + [chunk.add_suffix(str(i)) for i, chunk in enumerate(others)],
        axis=1,
    )

    merged["seq_x"] = (
        merged.seq_x0 + merged.seq_x1 + merged.seq_x2 + merged.seq_x3 + merged.seq_x
    )
    merged["seq_y"] = (
        merged.seq_y0 + merged.seq_y1 + merged.seq_y2 + merged.seq_y3 + merged.seq_y
    )
    return merged

def create_segment_2(df: pd.DataFrame, flip=True):
    """Combine the rows of ``df`` pairwise into two-gene segments.

    Half of the rows are sampled (seed 42) as the "a" side and the remaining
    rows form the "b" side. Both halves are re-indexed and placed side by side,
    so each output row carries the columns of one "a" row (suffix ``a``) and
    one "b" row (suffix ``b``). The former (unnamed) index of the "a" row is
    kept as ``segment_id_y``, that of the "b" row as ``segment_id_x``.
    ``seq_x`` is always ``seq_xa + seq_xb``.

    Args:
        df (pd.DataFrame): frame with ``seq_x`` and ``seq_y`` columns. Side
            effect: both columns are cast to string dtype on this frame.
            If the two halves differ in length (odd number of rows), the
            unmatched row ends up with missing values.
        flip (bool): if True, half of the paired rows (sampled with seed 42)
            get ``seq_y = seq_ya + seq_yb`` and the other half get the
            reversed order ``seq_yb + seq_ya``. If False, every row gets
            ``seq_ya + seq_yb``.

    Returns:
        pd.DataFrame: the paired frame without the four per-gene sequence
        columns. With ``flip=True`` it additionally starts with a ``level_0``
        column (row position inside its flip group), a by-product of the
        second ``reset_index``.
    """
    # Written back to the caller's frame on purpose (kept from the original).
    df.seq_x = df.seq_x.astype("string")
    df.seq_y = df.seq_y.astype("string")

    side_a = df.sample(frac=0.5, random_state=42)
    side_b = df[~df.index.isin(side_a.index)]

    side_a = (
        side_a.add_suffix("a")
        .reset_index()
        .rename(columns={"index": 'segment_id_y'})
    )
    side_b = (
        side_b.add_suffix("b")
        .reset_index()
        .rename(columns={"index": 'segment_id_x'})
    )
    paired = pd.concat([side_a, side_b], axis=1)

    gene_seq_columns = ['seq_xa', 'seq_ya', 'seq_xb', 'seq_yb']

    if not flip:
        paired["seq_x"] = paired.seq_xa + paired.seq_xb
        paired["seq_y"] = paired.seq_ya + paired.seq_yb
        return paired.drop(columns=gene_seq_columns)

    # Split the pairs in two groups: one keeps the y order, one reverses it.
    same_order = paired.sample(frac=0.5, random_state=42)
    reversed_order = paired[~paired.index.isin(same_order.index)].copy()
    same_order = same_order.reset_index()
    reversed_order = reversed_order.reset_index()

    same_order["seq_x"] = same_order.seq_xa + same_order.seq_xb
    same_order["seq_y"] = same_order.seq_ya + same_order.seq_yb

    reversed_order["seq_x"] = reversed_order.seq_xa + reversed_order.seq_xb
    reversed_order["seq_y"] = reversed_order.seq_yb + reversed_order.seq_ya

    # "index" already exists here, so this reset_index adds "level_0" instead.
    combined = pd.concat([same_order, reversed_order]).reset_index()
    return combined.drop(columns=gene_seq_columns + ["index"])


def create_negative_segment(df: pd.DataFrame):
    """Create negative (not similar) samples by swapping in unrelated y-sides.

    For every ``(familya, familyb)`` group, as many donor rows as the group has
    members are drawn without replacement (seed 1) from the rows that differ
    in both ``familya`` and ``familyb``. The donors' ``segment_id_y``,
    ``gene_ya``, ``gene_yb`` and ``seq_y`` replace those of the group members.
    A donor row is used at most once over the whole call.

    Args:
        df (pd.DataFrame): segment frame with the columns ``familya``,
            ``familyb``, ``segment_id_y``, ``gene_ya``, ``gene_yb`` and
            ``seq_y``. It is not modified.

    Returns:
        pd.DataFrame: the rows of ``df``, ordered by family group, with the
        y-side columns taken from the donors.

    Raises:
        ValueError: from ``sample`` when fewer eligible donors remain than a
            group needs, or from ``pd.concat`` when ``df`` has no groups.
    """
    y_side = ["segment_id_y", "gene_ya", "gene_yb", "seq_y"]
    donors_left = df.copy()
    negatives = []

    # The names fama / famb are referenced from inside the query string.
    for (fama, famb), members in df.groupby(["familya", "familyb"]):
        negative = members.copy()
        donors = donors_left.query(
            '(familya != @fama) & (familyb != @famb)'
        ).sample(n=len(negative), random_state=1)

        # Used donors leave the pool so no y-side is handed out twice.
        still_free = ~donors_left.index.isin(donors.index)
        donors_left = donors_left[still_free]

        # Positional copy (.values) so the donors' index labels are ignored.
        negative[y_side] = donors[y_side].values
        negatives.append(negative)

    return pd.concat(negatives)

def create_train_test_val(df):
    """Split ``df`` into train, test and validation frames.

    30% of the rows are held out as validation set; the remaining 70% are
    re-indexed from 0 and split 80/20 into train and test. Both splits use
    ``random_state=42``.

    The frame itself is handed to ``train_test_split`` so rows are selected by
    position. The previous version split the index *labels* and fed them to
    ``.iloc``, which is only correct for a default 0..n-1 index; for such an
    index the result is unchanged.

    NOTE(review): this definition shadows the ``create_train_test_val``
    imported from ``Create_simple_train_tsv`` at the top of the notebook.

    Args:
        df (pd.DataFrame): frame with a ``similar`` column that can be cast to
            int.

    Returns:
        tuple: ``(train, test, val)`` DataFrames. ``val`` keeps the index of
        ``df``; ``train`` and ``test`` carry the re-numbered index of the 70%
        remainder.
    """
    # The labels are only carried through the split: no ``stratify=`` is
    # passed, so the class balance of the individual sets is not enforced.
    remainder, df_val, _, _ = train_test_split(
        df, df.similar.astype(int).values, test_size=0.30, random_state=42
    )

    remainder = remainder.reset_index(drop=True)
    df_train, df_test, _, _ = train_test_split(
        remainder, remainder.similar.astype(int).values, test_size=0.20, random_state=42
    )
    return df_train, df_test, df_val

In [24]:
# --- Input / output configuration ---
trans_p = "/home/jong505/thesis/iadh/data/"

# One transcript annotation table per species code, built directly as Paths.
refseqs = [
    Path(trans_p) / f"annotation.all_transcripts.{species}.csv.gz"
    for species in ("aar", "ath", "bol", "chi", "cpa", "tha")
]
gene_fam = Path(trans_p) / "gene_fam_parsed.tsv"

output_prefix = Path("data/aar_ath_bol_chi_cpa_tha/medium_2g_500")
output_prefix_raw = Path("data/aar_ath_bol_chi_cpa_tha/medium_2g_500")
sample_size = 500

# Fail early instead of after the (slow) loading step.
if not output_prefix.parent.is_dir():
    raise ValueError(f"Output folder {output_prefix.parent} doesn't exist")

# NOTE(review): the call below passes 700 although `sample_size` is set to 500
# above (and the output prefix ends in _500) -- confirm which value is intended.
df0 = get_gene_fam_per_gene(gene_fam, refseqs, sample_size=700)

starting with /home/jong505/thesis/iadh/data/annotation.all_transcripts.aar.csv.gz
starting with /home/jong505/thesis/iadh/data/annotation.all_transcripts.ath.csv.gz


  dfn = pd.read_csv(path, compression='gzip', sep='\t', header=0, skiprows=8)


starting with /home/jong505/thesis/iadh/data/annotation.all_transcripts.bol.csv.gz
starting with /home/jong505/thesis/iadh/data/annotation.all_transcripts.chi.csv.gz
starting with /home/jong505/thesis/iadh/data/annotation.all_transcripts.cpa.csv.gz
starting with /home/jong505/thesis/iadh/data/annotation.all_transcripts.tha.csv.gz


In [28]:
df1 = create_similar_genes(df0)

# create_segment_2 pairs the rows of two equally sized halves, so a leftover
# row of an odd-sized frame is dropped first.
if len(df1) % 2 == 1:
    df1.drop(index=df1.index[-1:], inplace=True)

# Positive pairs, and negatives derived from them by swapping the y-side.
df_P = create_segment_2(df1, flip=True)
df_negatives = create_negative_segment(df_P)

# Label both sets at the same column position.
df_P.insert(7, "similar", True)
df_negatives.insert(7, "similar", False)

df = pd.concat([df_P, df_negatives]).reset_index(drop=True)

# Row counts: similar genes, positive segments, negative segments, combined.
print(len(df1))
print(len(df_P))
print(len(df_negatives))
print(len(df))

2848
1424
1424
2848


In [29]:
# Sanity check: count missing `seq_y` values in each intermediate frame
# (similar genes, positive segments, negative segments, combined frame).
# A non-zero count would point at rows that lost their sequence on the way.
# print(df0.seq_y.isna().sum())
print(df1.seq_y.isna().sum())
print(df_P.seq_y.isna().sum())
print(df_negatives.seq_y.isna().sum())
print(df.seq_y.isna().sum())

0
0
0
0


In [35]:
# Debug check: rebuild the expected two-gene sequence by hand (sequence of the
# first gene + sequence of the second gene) and print it right before the
# value stored in the first row of `test`, once for the x side, once for y.
# NOTE(review): `test` is not defined in any visible earlier cell, and `df` is
# queried by `family_id` / `gene_id` / `seq` here -- presumably this cell ran
# against leftover kernel state; confirm before relying on it in a fresh run.
print(df.query('family_id == "HOM05D000003" & gene_id =="Aa31sc146G90"').seq.values[0] + df.query('family_id == "HOM05D000040" & gene_id =="Aa31LG1G5240"').seq.values[0])
print(test.iloc[0].seq_x)
print("-"*20)
print(df.query('family_id == "HOM05D000003" & gene_id =="BolC3t20183H"').seq.values[0] + df.query('family_id == "HOM05D000040" & gene_id =="AT1G31450"').seq.values[0])
print(test.iloc[0].seq_y)
print("-"*20)

ATGCTTAGAACACATGTACATGAAGAAGACTTGGAATGGGTTAAGCAGGTAATTCCGGGTATTGATTCATCAAATGACTCCTACGTGTGGAATTATTCAAAGGATGGAGAATACTCTGTAAGGTCTGGAACATGGGTATACAAAAATGAAGGACAAAATCAACGCAGAACAGAGAACCCGAACCTACAACCTTTGTACAAATCAATATGGGAATCAAGAACAATACCTAAGATCAAAAACTTCATGTGGAGAGTGGTCTCAAATTGCTTAGCAGTAGGAGGGAATATGAAAAGGAAGAAGCTAACAAAGGAAGGTTTATGCCCGTTTTGTAAAACACAGGAAGAAACGGTGAATCACCTGTTGTTCACATGTCCTTTTGCAAGGTTAGTCTGGGCGGTGAGCGGAATACCGGGAATACAGAGCTCTCCCTTTAGTGAGTCTGTGTACTCCAACATAGCACGTTTGCTATGGCAACAAGCACCTGAAAAGGAAATAAGAGAACTGTTGTTAAAAGGACCCTGGATACTATGGGGGATTTGGAAATCAAGGAATGATGCAGTATTCAAAGGACTGCCACCATCAGAGATGAAGATCAGAACCAAAGCAGATATCGGAGCTGAGGAATGGAGAAGTACTGAACAAGAAAAGGCACAAGAGCAAACCAGAATGCAGGGTCCGAGAAATCGTGACCCGATCCGAGTAGATCAGGCTCGTAGAGCATGGAGACCACCGAGAGAAGGTACCTGGAAATGTAATATAGACGGTGCATGTGAGGATGGAGGAGCGAGTGCAGTAGGATGGATTCTCAGAGATTGGAGAGGCCAAACGAGAATGTGGGGAGGCAAAAAGCTTGAAATCACAAAACTGATCCAGCATCAAACAGTCCTATCGGTTGAGGCAGAGGCACTAACCTGGGCTATACTACAGATCAAGGATCTAGGTCTCGGGAGGGTTGAAATAGAATCTGACTGTGGCGATCTGGTTAAAGGACTCACAATTG

In [42]:
# Debug lookup: show the row(s) of `test` whose y side is built from these two genes.
# NOTE(review): `test` is not defined in any visible earlier cell -- confirm its origin.
test.query('gene_ya == "THA.LOC104809906" and gene_yb == "BolC7t45641H"')

Unnamed: 0,level_0,segment_id_y,familya,gene_xa,gene_ya,segment_id_x,familyb,gene_xb,gene_yb,seq_x,seq_y
56,56,2481,HOM05D000017,Aa31LG11G4650,THA.LOC104809906,2969,HOM05D000048,AT3G23560,BolC7t45641H,ATGACTGATTTTGATGATGAAGCTGAAGATAGAGACCAGAATCTCG...,ATGAGGAAAATGATTGTAGTTGATTGTACATTCTTGACTGGCAAGT...


In [51]:
# Debug check: print `seq_y` for segment_id_x 2969 from the negative samples
# and from `test`, one after the other, to compare them by eye.
# NOTE(review): `test` is not defined in any visible earlier cell -- confirm its origin.
print(df_negatives.query('segment_id_x == 2969').seq_y.values[0])
print(test.query('segment_id_x == 2969').seq_y.values[0])

ATGCTACTTAGAAACATAGATCCTAAAGAGGGTTTGTGCAATGGTACAAGACTTCAGGTTACGAAATTAGCTAACCATGTCACTGAAGCAGAGATCATAACAACAGAAGATGATAAGGTTAAAAAGGTCATGATTCCTAGGATATTTCTTTCTCCATTGGATACAAAGTTTCCTTTCCAGATGAGGCGTCAGCAATTTCCCGTCGCATTAGCATTCATGATGACAATCAACAAAAGCCAAGGTCAAACTTTATCAAGAGTCGGACTCTATCTTCCCCGTCCAGTTTTCTCACACGGTCAGTTGTATGTGGCTCTTTTACGGGTTAAGTCAAAAGATGGTTTGAAGATTTTGATATTAGATGATAAAGGTAAATGTCAACAAAGTTTGGTGCATGGAGCAGATGGTGAAGGCAGGGAAAAACAGCGTAGAAAAGAGGAAGATAGTAAGAAAACAAAGGATGAGGAGCTTGAAGATGGATAAATGTCGACGGAACGAGAAGTAGTCTGCGTCACCGGCGCCAGTGGGGGCATCGGCTCGTGGCTAGTCCATCTACTCCTGCACCGCGGCTACTCCGTTCACGCAACCGTGAAAAACCTCCAGGACGAGAAAGAGACAAAACATCTGGAAGCTCTAGAAGGTGCAGCCACGCGCCTCCATTTATTCGAGATGGATCTCCTCCAATACGACACCGTTTTAACCGCCGTCAACGGATGTTCCGGCGTATTCCATCTCGCATCGCCTTGTATCGTCGACGAAGTCCAAGATCCTCAGAAACAACTACTTGACCCGGCGGTTAAAGGGACACTAAATGTACTAACGGCGGCGAAAGAAGCCGGCGTTAAAAGAGTTGTTGTAACGTCTTCGATATCGGCGATAACTCCAAGTCCTAACTGGCCTGTCGATAAGATCAAGAATGAGGAATGTTGGGCTGATCAAGACTACTGTAACCAACATAAATTGTGGTATCCACTGTCGAAGATACTTGCTGAGAAAGCAGCTT

In [None]:
# Split the combined, labelled frame `df` (positives + negatives, built in the
# pairing cell above) into train / test / validation sets.
#
# The earlier version of this cell first re-inserted the `similar` column into
# `df_negatives` (it already exists there, so `insert` raises ValueError),
# inserted it into `test` before `test` was defined, and concatenated
# `df_negatives` onto `df` a second time, which would duplicate every negative
# row. None of that is needed: `df` is already complete.
train, test, val = create_train_test_val(df)

# Every row must end up in exactly one of the three sets.
assert len(train) + len(test) + len(val) == len(df)

(1429, 9)

Maybe the segment_id values are getting duplicated when the negative and the normal (positive) matrices are concatenated?