In [1]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn.objects as so
from seaborn import axes_style

from IPython.display import display

In [10]:
gff3_col = [
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
    "irl read number",
    "irr read number",
    "norm irl",
    "norm irr",
    "SampleID",
    "treatment",
    "mouseID",
]

# insertion_dir = Path("/home/fisch872/mat/projects/Laura-SB-Analysis/2020_SB-output/GRCm39/results-insertions")
# insertion_dir = Path("/home/fisch872/mat/projects/Laura-SB-Analysis/2020_SB-output/GRCm38/results-insertions")

insertion_dir = Path("/home/fisch872/mat/projects/Laura-SB-Analysis/2023-SB-screen/output/GRCm39/results-insertions")
# case = "ACF"
# control = "SCF"
case = "CAR"
control = "NoCAR"

In [11]:
insert_list = []
for file in insertion_dir.iterdir():
    tmp_df = pd.read_csv(file, sep="\t")
    tmp_meta = file.name.split(".")[0].split("-")
    if len(tmp_meta) == 3:  # 2020 SB
        tmp_df["treatment"] = tmp_meta[2]
        tmp_df["sampleID"] = tmp_meta[1]
        tmp_df["cell_type"] = tmp_meta[0]
    elif len(tmp_meta) == 2:  # 2023 SB
        tmp_df["treatment"] = tmp_meta[0]
        tmp_df["sampleID"] = tmp_meta[1]
    tmp_df["sample_sub_id"] = range(len(tmp_df))
    insert_list.append(tmp_df)
inserts_df = pd.concat(insert_list, ignore_index=True)
display(inserts_df)

Unnamed: 0,chr,pos,strand,ref_length,query_length,read_length,mapping_quality,read_name,TA_location,read_first_last,ref_first_last,tpn_promoter_orient,library,treatment,sampleID,sample_sub_id
0,chr1,13352291,+,36,148,148,2,VH00510:284:AACNNW5M5:1:1201:38114:31670,none,GGACCCAAAT-CGTCGTATGG,TACAAGCCTT-ACACACACAC,+,IRR,NoCAR,4_4_5,0
1,chr1,31486190,-,41,148,148,2,VH00510:284:AACNNW5M5:1:1506:34648:54293,none,CAAATTTGTG-CGTCGTATGG,CTTCTCTCTC-ACCTGAGTTA,-,IRR,NoCAR,4_4_5,1
2,chr1,31486190,-,41,148,148,2,VH00510:284:AACNNW5M5:1:1611:26979:6717,none,ACAAATTTGT-ACTCGTATGG,CTTCTCTCTC-ACCTGAGTTA,-,IRR,NoCAR,4_4_5,2
3,chr1,31486190,-,41,148,148,2,VH00510:284:AACNNW5M5:1:2301:72557:24873,none,CAAATTTGTG-CGTCGTATGG,CTTCTCTCTC-ACCTGAGTTA,-,IRR,NoCAR,4_4_5,3
4,chr1,31486190,-,41,148,148,2,VH00510:284:AACNNW5M5:1:2309:11244:29757,first,TAAATTTGTG-CGTCGTATGG,CTTCTCTCTC-ACCTGAGTTA,-,IRR,NoCAR,4_4_5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52547,chr9,5100841,-,54,75,75,9,VH00510:284:AACNNW5M5:1:2208:51141:10845,first,TACCTAAAAT-GAGTCTGGGA,CCATCTACTC-ATTTTAGGTA,+,IRL,NoCAR,3_20_17,23
52548,chr9,64562157,-,67,67,67,36,VH00510:284:AACNNW5M5:1:1510:33096:38655,first,TAATCATTAA-ATTAATAATG,CATTATTAAT-TTAATGATTA,-,IRR,NoCAR,3_20_17,24
52549,chr9,85877626,-,47,71,71,22,VH00510:284:AACNNW5M5:1:2308:52145:49144,first,TATATGGTCT-AGTCTGGGAG,CTGAGCATTC-AGACCATATA,-,IRR,NoCAR,3_20_17,25
52550,chr9,85991174,-,38,71,71,22,VH00510:284:AACNNW5M5:1:1604:49872:42745,first,TATGTCAGCT-GTATGGTTAC,GACATCACCA-AGCTGACATA,+,IRL,NoCAR,3_20_17,26


In [12]:
out_df = pd.DataFrame(inserts_df["chr"])
out_df.columns = ["seqid"]
out_df["source"] = "T2/Onc3"
out_df["type"] = "insertion site"
out_df["start"] = inserts_df["pos"]
out_df["end"] = inserts_df["pos"]
out_df["score"] = inserts_df["mapping_quality"]
out_df["strand"] = inserts_df["strand"]
out_df["phase"] = "."
out_df["attributes"] = inserts_df.apply(lambda x: f"ID={x['treatment']}:{x['sampleID']}*{x['sample_sub_id']};color=#{'154360' if x['tpn_promoter_orient'] == '+' else 'D35400'}", axis=1)
# strand is still the strand that the insertion was on
# tpn_promoter_orient is the orientation w.r.t. the IRL or IRR library
# the color in attributes is based on tpn_promoter_orient
out_df = pd.concat((out_df, inserts_df.iloc[:, 11:-1]), axis=1)
out_df = out_df[out_df["treatment"].isin([case, control])]
out_df.to_csv(insertion_dir.parent / f"{case}_{control}-insertions.gff3", sep="\t", header=True, index=False)
display(out_df)

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,tpn_promoter_orient,library,treatment,sampleID
0,chr1,T2/Onc3,insertion site,13352291,13352291,2,+,.,ID=NoCAR:4_4_5*0;color=#154360,+,IRR,NoCAR,4_4_5
1,chr1,T2/Onc3,insertion site,31486190,31486190,2,-,.,ID=NoCAR:4_4_5*1;color=#D35400,-,IRR,NoCAR,4_4_5
2,chr1,T2/Onc3,insertion site,31486190,31486190,2,-,.,ID=NoCAR:4_4_5*2;color=#D35400,-,IRR,NoCAR,4_4_5
3,chr1,T2/Onc3,insertion site,31486190,31486190,2,-,.,ID=NoCAR:4_4_5*3;color=#D35400,-,IRR,NoCAR,4_4_5
4,chr1,T2/Onc3,insertion site,31486190,31486190,2,-,.,ID=NoCAR:4_4_5*4;color=#D35400,-,IRR,NoCAR,4_4_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52547,chr9,T2/Onc3,insertion site,5100841,5100841,9,-,.,ID=NoCAR:3_20_17*23;color=#154360,+,IRL,NoCAR,3_20_17
52548,chr9,T2/Onc3,insertion site,64562157,64562157,36,-,.,ID=NoCAR:3_20_17*24;color=#D35400,-,IRR,NoCAR,3_20_17
52549,chr9,T2/Onc3,insertion site,85877626,85877626,22,-,.,ID=NoCAR:3_20_17*25;color=#D35400,-,IRR,NoCAR,3_20_17
52550,chr9,T2/Onc3,insertion site,85991174,85991174,22,-,.,ID=NoCAR:3_20_17*26;color=#154360,+,IRL,NoCAR,3_20_17
