In [1]:
import os
import glob
import itertools
import re
import git
import numpy as np
import pandas as pd
import skbio

# Import this project's library
import wgregseq 

# Seaborn, useful for graphics
import seaborn as sns

# Import Interactive plot libraries
import bokeh.plotting
import bokeh.layouts
from bokeh.themes import Theme
import holoviews as hv
import hvplot
import hvplot.pandas
import iqplot

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib_venn as mpl_venn

# Locate the repository root so data paths can be built relative to it
repo = git.Repo("./", search_parent_directories=True)
homedir = repo.working_dir

# Render bokeh / holoviews output inline in the notebook
bokeh.io.output_notebook()
hv.extension('bokeh')



In [3]:
# Matplotlib: apply the PBoC style first, then raise the figure dpi on top of it
wgregseq.viz.pboc_style_mpl()
mpl.rcParams.update({'figure.dpi': 110})

# Bokeh / holoviews: build the PBoC theme once and register it with both
theme = Theme(json=wgregseq.viz.pboc_style_bokeh())
bokeh.io.curdoc().theme = theme
hv.renderer('bokeh').theme = theme

AttributeError: module 'wgregseq' has no attribute 'viz'

In [7]:
# Define data directory
datadir = f"{homedir}/data/seq/processed_sequencing/"

# List all fastq.gz files of the grouped mapping run.
# os.path.join avoids the doubled "/" that plain string formatting produces
# with the trailing slash of `datadir`. glob returns matches in arbitrary
# order, so sort them to make `fastq_files[0]` reproducible between runs.
fastq_files = sorted(
    glob.glob(os.path.join(datadir, "20211201_grouped_mapping", "*.fastq.gz"))
)

fastq_files

['/Users/tomroschinger/git/Reg-Seq2/data/seq/processed_sequencing//20211201_grouped_mapping/scrambles_merged.fastq.gz']

In [8]:
# Use skbio to get a generator that iterates over the fastq records
seqs = skbio.io.read(
    fastq_files[0],
    format="fastq",
    # Must be the boolean False: the string "false" is truthy and would
    # leave format verification switched on.
    verify=False,
    variant="illumina1.8",
)

# Number of reads to sample from the start of the file
n_samples = 10000

# Collect one [sequence, quality, total_quality] record per read
seq_list = []
for seq in itertools.islice(seqs, n_samples):
    # Per-base quality scores, read through the public accessor
    quality = seq.positional_metadata["quality"].values
    # Building a DNA object with validate=True checks the sequence characters
    sequence = str(skbio.DNA(sequence=seq, validate=True))
    seq_list.append([sequence, quality, np.sum(quality)])

# Build the dataframe of sampled reads
names = ["sequence", "quality", "total_quality"]
df_seq = pd.DataFrame.from_records(seq_list, columns=names)

# Add sequence length to dataframe
df_seq["seq_len"] = df_seq.sequence.apply(len)
df_seq.head(10)

Unnamed: 0,sequence,quality,total_quality,seq_len
0,TTTTGTTAAGTTACTCATTTGAGCTCCCGCATTGACTAGTTATAGT...,"[15, 16, 33, 32, 15, 33, 33, 17, 17, 16, 17, 3...",2757,126
1,TCTCGTTTTCTGTTTTTACTTGTTTCTTTTTTTTTTTTTTTTGTTT...,"[18, 18, 18, 18, 15, 15, 15, 32, 15, 18, 33, 1...",2393,130
2,TCTATACATAGTAATTTGACGAGCTCCCGCATTGACTAGTTGGAAC...,"[39, 19, 37, 20, 38, 20, 20, 20, 35, 20, 20, 2...",3263,126
3,TTTTTTTTGCCTTTTTTTGTTGTTTCTTTTTTTTTTTTTTCACTCT...,"[18, 18, 16, 16, 32, 15, 15, 32, 15, 16, 16, 1...",2386,130
4,AATACATGTTTAATCGGAAAGAGCTCCCGCATTGACTAGTTATAGT...,"[15, 15, 33, 16, 16, 16, 35, 17, 35, 37, 38, 1...",5063,220
5,GATAGCCATCGATCGTGCTAGGTCGACTTCAGGGATAACATGGCAC...,"[15, 15, 33, 15, 16, 16, 32, 16, 33, 16, 15, 1...",2412,126
6,TCGATTTTATTTTTTTCTATGAGCTCCCGCATTGACTAGTTGGAAC...,"[15, 16, 15, 15, 33, 17, 33, 16, 17, 16, 17, 1...",3028,126
7,TTTTTTTTCTTATTTTTTTGTTTTTTCTTTTTTTTTTTTCTTTTTT...,"[18, 16, 16, 15, 32, 15, 32, 14, 15, 16, 16, 1...",2470,130
8,TCCCATTTGTACCTTGTTTTGAGCTCCCGCATTGACTAGTTGGAAC...,"[38, 16, 16, 15, 16, 16, 16, 17, 17, 35, 17, 1...",2838,126
9,ATCCAGTCATGATAACGAATGAGCTCCCGCATTGACTAGTTGGAAC...,"[15, 33, 32, 32, 16, 16, 37, 32, 16, 35, 16, 1...",3190,126


Take a look at the distribution of read lengths.

In [9]:
# Empirical CDF of the read lengths
ecdf_seq_len = iqplot.ecdf(data=df_seq[['seq_len']], q='seq_len')
bokeh.io.show(ecdf_seq_len)

There are clear peaks at 126 bp (read 1 after trimming) and 130 bp (read 2 after trimming), which correspond to reads that could not be merged, as well as a peak at 220 bp for the merged reads.

In [10]:
# Empirical CDF of the summed per-base quality scores
ecdf_total_quality = iqplot.ecdf(data=df_seq[['total_quality']], q='total_quality')
bokeh.io.show(ecdf_total_quality)

Average total quality score as a function of sequence length.

In [11]:
# Mean total quality for each read length. The built-in groupby mean is used
# instead of agg(np.mean), which recent pandas versions flag as deprecated usage.
df_qual_len = df_seq.groupby('seq_len')['total_quality'].mean().reset_index()

# Scatter plot of average total quality against read length
p = bokeh.plotting.figure(
    x_axis_label="Sequence Length",
    y_axis_label="Average Total Quality",
    frame_width=500,
    frame_height=300,
)
p.scatter(df_qual_len.seq_len, df_qual_len.total_quality)
bokeh.io.show(p)

In [12]:
# Number of sampled reads with the read 1 length (126 bp)
(df_seq["seq_len"] == 126).sum()

2212

In [13]:
# Number of sampled reads with the read 2 length (130 bp)
(df_seq["seq_len"] == 130).sum()

2137

In [14]:
# Number of sampled reads with the merged length (220 bp)
(df_seq["seq_len"] == 220).sum()

4958

In [32]:
# Keep only the reads of length 220 and renumber the rows from zero
is_merged = df_seq["seq_len"] == 220
df_filt = df_seq.loc[is_merged].reset_index(drop=True)

# Barcode: the first 20 bases of each read
barcodes = [read[:20] for read in df_filt["sequence"]]

# Promoter: reverse complement of the last 160 bases of each read
promoters = []
for read in df_filt["sequence"]:
    tail = skbio.DNA(sequence=read[-160:])
    promoters.append(str(tail.complement(reverse=True)))

# Both columns go in at position 4, so "promoter" ends up before "barcode"
df_filt.insert(4, "barcode", barcodes)
df_filt.insert(4, "promoter", promoters)
df_filt.head()

Unnamed: 0,sequence,quality,total_quality,seq_len,promoter,barcode
0,AATACATGTTTAATCGGAAAGAGCTCCCGCATTGACTAGTTATAGT...,"[15, 15, 33, 16, 16, 16, 35, 17, 35, 37, 38, 1...",5063,220,TTGGTCTTGCCTTTTGCGGTTGTTCGTTTTTCATGATTCTCACTCT...,AATACATGTTTAATCGGAAA
1,TTTCTATCCTTTTTTTTAACGAGCTCCCGCATTGACTAGTTATAGT...,"[19, 33, 38, 33, 35, 20, 35, 37, 18, 20, 33, 3...",5537,220,GGCGTTTTCATCTCGTTGTTTCCCTTTGTCTGTTTGTTATTGCGCT...,TTTCTATCCTTTTTTTTAAC
2,ATATTGTATACTCTACCATCGAGCTCCCGCATTGACTAGTTATAGT...,"[17, 32, 19, 35, 35, 37, 38, 20, 37, 20, 32, 3...",5361,220,TTGTTTTTCCTCCCTTGTGATCGGTGGTTTATTCTGTGTATTGTCG...,ATATTGTATACTCTACCATC
3,GCATATCGTACTTGTACCCAGAGCTCCCGCATTGACTAGTTATAGT...,"[17, 17, 17, 37, 33, 37, 32, 17, 38, 32, 37, 3...",6342,220,TTTGACTTAGCAATGGCTGCTCCTGGCACAAACTTAGATACCATCA...,GCATATCGTACTTGTACCCA
4,ATTTGACATTCGTAGTAAGAGAGCTCCCGCATTGACTAGTTATAGT...,"[17, 37, 38, 37, 33, 35, 33, 36, 39, 39, 37, 3...",6716,220,GTTTTCATATCGCGTAAATCCACAGGGACAGCCCTCGATGTTGTCG...,ATTTGACATTCGTAGTAAGA


In [33]:
# Load both twist order tables and stack them into a single dataframe
twist_paths = [
    f"{homedir}/data/twist_order/twist_sys_scrambles_10.csv",
    f"{homedir}/data/twist_order/twist_sys_scrambles_2_16.csv",
]
df_list1, df_list2 = (pd.read_csv(path, index_col=0) for path in twist_paths)
df_list = pd.concat([df_list1, df_list2])

# Upper-case copy of each ordered sequence, stored as the "promoter" column
upper_seqs = list(map(str.upper, df_list["seq"]))
df_list.insert(3, "promoter", upper_seqs)
df_list.head()

Unnamed: 0,name,seq,primers_added,promoter
0,scr10__znuCp_0_10_0,accgacgcgaTCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...,False,ACCGACGCGATCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...
1,scr10__znuCp_0_10_1,tcaccctcaaTCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...,False,TCACCCTCAATCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...
2,scr10__znuCp_0_10_2,agtagagcttTCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...,False,AGTAGAGCTTTCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...
3,scr10__znuCp_0_10_3,cgcgccattaTCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...,False,CGCGCCATTATCCCCAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...
4,scr10__znuCp_5_15_4,GTGTTagtagagaatAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...,False,GTGTTAGTAGAGAATAGAGAGCGGCGGATAATGCTGCGAAAAGAAG...


In [34]:
name_list = df_list.name.values
list_promoters = df_list.promoter.values
sequenced_promoters = df_filt.promoter.values

# Build a lookup from ordered promoter sequence to its name once, instead of
# scanning the whole ordered array for every sequenced read. setdefault keeps
# the first name when a sequence occurs more than once, which matches taking
# the first hit of a linear search.
promoter_to_name = {}
for name, promoter in zip(name_list, list_promoters):
    promoter_to_name.setdefault(promoter, name)

# Reads with no exact match get the string "None", which later cells filter on
ident_list = [
    promoter_to_name.get(promoter, "None") for promoter in sequenced_promoters
]

# DataFrame.insert refuses to add a column that already exists; drop a column
# left over from a previous run so this cell can be re-executed.
df_filt = df_filt.drop(columns="identified_promoter", errors="ignore")
df_filt.insert(5, "identified_promoter", ident_list)

In [35]:
# Fraction of length-220 reads whose promoter matched an ordered sequence
(df_filt["identified_promoter"] != 'None').mean()

0.4497781363453005

In [37]:
# Keep only the mapping columns, then display the rows with an identified
# promoter (the original row labels are kept in the "index" column)
df_return = df_filt[["identified_promoter", "promoter", "barcode"]]
has_match = df_return["identified_promoter"] != 'None'
df_return.loc[has_match].reset_index()

Unnamed: 0,index,identified_promoter,promoter,barcode
0,5,scr10__rspAp_35_45_1642,GACAAAAGGTATTCTATTTCATCTTTTGTCAACCAGGGCACTTATA...,TTGAGGATAATTTTACTTCC
1,7,scr10__rspAp_100_110_5659,GACAAAAGGTATTCTATTTCATCTTTTGTCAACCATTCACAGCGCA...,CACTATCGGTTTGCAATCCA
2,12,scr10__marRp_49_59_4551,CTCTTTAGCTAGCCTTGCATCGCATTGAACAAAACTTGAACCGATT...,GTTATTATTTTATTCTCGTG
3,13,scr10__ftsKp1_5_15_4809,ACACGAGACACGTGCGTTATGGAAGAAGTCAAGCAGAGTAATCGTC...,CTCTGAAGTTTATATAAACC
4,18,scr10__ftsKp1_36_46_4840,ACACGGACATACGTTGTTATGGAAGAAGTCAAGCAGGCGTTGACGA...,GTATCAGGATTAAAAATGAT
...,...,...,...,...
2225,4949,scr216_rspAp_124_128_2771,GACAAAAGGTATTCTATTTCATCTTTTGTCAACCATTCACAGCGCA...,TATAGACCCACTTGGTATAT
2226,4951,scr10__rpsBp_33_43_5894,ATTTCGCCAAACGTGCCACTGAAGGTTTTCTATTCCCATTCCCTCG...,CAAAGACATTGTCTTCAATG
2227,4952,scr10__relBp_104_114_4304,ATGGACTTAGCAATGGCTGCTCCTGGCACAAAGCGGACAGTGATCA...,GGATCTGTTCCATCTGTATT
2228,4953,scr216_serCp_136_144_1638,TAATCCGAGAGATTCTTTTGTGTGATGCAAGCCACATTTTTGCCCT...,CTTGTCCAATATATATCATC


Results for all reads, loaded from the saved barcode mapping file.

In [52]:
df_all = pd.read_csv(f"{homedir}/data/seq/barcodes/20211201_grouped_mapping/barcode_mapping.csv")

# Depending on the pandas version, read_csv's default NA markers can turn the
# literal text "None" into NaN. NaN != 'None' evaluates to True, so test for
# both forms of "no promoter identified".
is_identified = df_all.identified_promoter.notna() & (df_all.identified_promoter != 'None')
df_all = df_all[is_identified]

In [53]:
# Count the rows for each (identified_promoter, promoter, barcode) combination.
# size() counts rows per group regardless of which other columns are present,
# and reset_index() on its unnamed result stores the counts in a column
# labelled 0.
df_collapse = df_all.groupby(['identified_promoter', 'promoter', 'barcode']).size().reset_index()

In [60]:
# Give the count column (labelled 0) a readable name
df_collapse = df_collapse.rename(columns={0: "counts"})

# Number of (promoter, barcode) combinations with a count above 2
(df_collapse["counts"] > 2).sum()

750