In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark.sql.types as T
import os

In [3]:
# Create the Spark session for this notebook, or attach to one that is
# already running; the application is registered under the name "add_TE".
spark = (
    SparkSession.builder
    .appName("add_TE")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/14 16:23:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Chromosome to process.
# Batch runs select it through the `which_chr` environment variable; when the
# variable is absent (e.g. interactive work in the notebook) we fall back to
# DEFAULT_CHROMOSOME. Previously this fallback was a second assignment that
# silently overwrote the "NONE" sentinel, which made the intent hard to see.
DEFAULT_CHROMOSOME = "chr22"
chromosome = os.environ.get("which_chr", DEFAULT_CHROMOSOME).strip()

# An empty value or the "NONE" sentinel means the caller did not actually tell
# us what to crunch. Stop here rather than build a bogus input path below.
if chromosome in ("", "NONE"):
    print("error : did not find which chromosome we are supposed to crunch!")
    # `exit` is a helper intended for interactive sessions (and is replaced by
    # IPython inside a kernel); raising SystemExit aborts reliably both in a
    # notebook and in a script, and keeps the same exit status for scripts.
    raise SystemExit(-1)

print("only crunching chromosome "+chromosome)

only crunching chromosome chr22


In [5]:
# All gzipped, tab-separated variant shards produced by the previous step
# (2.2.add_roulette) for the selected chromosome.
variant_path = f"/home/mcn26/varef/scripts/noon_data/2.2.add_roulette/{chromosome}/*.csv.gz"

# The first row of each shard is a header; column types are inferred from
# the data (this costs an extra pass over the input).
variants = (
    spark.read
    .option("delimiter", "\t")
    .option("header", True)
    .option("inferSchema", True)
    .csv(variant_path)
)

24/05/14 16:24:05 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [6]:
# Location of the gzipped repeat/transposon annotation table that is loaded
# in the next code cell.
transposon_path="/home/mcn26/varef/data/repmask/hg38.fa.out.fixed.gz"

This file is a modified version of the output of RepeatMasker. The raw output is in a very strange format, which I have munged slightly into a tab-separated table to make it more tractable. Take a look at `helper/prechew_transposons` for more information.

In [7]:
# Load the repeat annotation as a tab-separated table with a header row.
# No schema is supplied and none is inferred, so every column is a string.
transposon = spark.read.csv(
    transposon_path,
    sep="\t",
    header="true",
)

In [8]:
# Peek at the first three rows to sanity-check the parse.
# NOTE: in the recorded output the header names look shifted by one from
# `matching_repeat` onward (it holds the strand, '+' / 'C'); the leading
# columns, including the query coordinates, line up with their names.
transposon.head(3)

[Row(SW_score='463', perc_div='1.3', perc_del='0.6', perc_ins='1.7', query_sequence='chr1', pos_in_query_begin='10001', pos_in_query_end='10468', (left)='(248945954)', matching_repeat='+', repeat_class_slash_family='(TAACCC)n', position_in_repeat_begin='Simple_repeat', position_in_repeat_end='1', position_in_repeat_left='471', id='(0)'),
 Row(SW_score='3612', perc_div='11.4', perc_del='21.5', perc_ins='1.3', query_sequence='chr1', pos_in_query_begin='10469', pos_in_query_end='11447', (left)='(248944975)', matching_repeat='C', repeat_class_slash_family='TAR1', position_in_repeat_begin='Satellite/telo', position_in_repeat_end='(399)', position_in_repeat_left='1712', id='483'),
 Row(SW_score='484', perc_div='25.1', perc_del='13.2', perc_ins='0.0', query_sequence='chr1', pos_in_query_begin='11505', pos_in_query_end='11675', (left)='(248944747)', matching_repeat='C', repeat_class_slash_family='L1MC5a', position_in_repeat_begin='LINE/L1', position_in_repeat_end='(2382)', position_in_repeat_l

In [24]:
# Keep only the repeats annotated on the chromosome being processed.
# Scanning the whole-genome table for every chromosome is not terribly
# efficient (it could be pre-processed per chromosome), but this step is
# quick, so it should not matter much.
transposon_chr = transposon.where(transposon["query_sequence"] == chromosome)

In [26]:
# Only the location of each repeat is needed from here on: the sequence name
# and the begin/end coordinates on that sequence.
columns_to_keep = [
    "query_sequence",
    "pos_in_query_begin",
    "pos_in_query_end",
]
transposon_chr = transposon_chr.select(*columns_to_keep)

In [38]:
# Flag every variant with `in_rep`: True when it lies inside at least one
# repeat/transposon interval on this chromosome, False otherwise.
#
# Coordinate systems:
# the variants are 1-based, as they come from a VCF.
# The transposon table is 1-based too,
# as we discovered in /helper/prechew_transposons/1.0.determine_base
# RepeatMasker reports closed intervals, so both the begin and the end
# position belong to the repeat and the test is simply begin <= POS <= end.
# (The previous `POS > begin` dropped variants sitting on the first base.)

#Chr1        T   A   C   G   T
#          | | | | | | | | | |
#1 based   | 1 | 2 | 3 | 4 | 5
#0 based   0   1   2   3   4

# The transposon table was loaded without a schema, so its coordinates are
# strings. Cast them explicitly so the range test is a plain integer
# comparison instead of relying on Spark's implicit string/int coercion.
transposon_ranges = (
    transposon_chr
    .withColumn("pos_in_query_begin", F.col("pos_in_query_begin").cast("int"))
    .withColumn("pos_in_query_end", F.col("pos_in_query_end").cast("int"))
)

# broadcast the (small, single-chromosome) transposon table for performance
transposon_broadcast = F.broadcast(transposon_ranges)

# left join: every variant is kept, matched or not.
# note that this duplicates a variant once per transposon it falls into;
# the duplicates are collapsed again below.
joined = variants.join(
    transposon_broadcast,
    (variants["POS"] >= transposon_broadcast["pos_in_query_begin"])
    & (variants["POS"] <= transposon_broadcast["pos_in_query_end"]),
    how="left",
)

# a variant without a match has NULL transposon columns after the left join,
# so "begin is not NULL" is exactly "the variant is inside a transposon"
result = joined.withColumn("in_rep", F.col("pos_in_query_begin").isNotNull())

# drop the transposon columns we don't care about anymore;
# important for the dedup, which groups on every remaining column
result = result.drop(*transposon_broadcast.columns)

# deduplicate variants by grouping on all columns except the newly added
# in_rep and taking its max: max over booleans means "in any rep element"
group_columns = [col for col in result.columns if col != "in_rep"]
result = result.groupBy(*group_columns).agg(F.max("in_rep").alias("in_rep"))



In [39]:
# Write the annotated variants as gzip-compressed, tab-separated CSV with a
# header row, replacing any output left over from a previous run.
# Spark treats the target as a directory of part files, despite its name.
output_path = f"/home/mcn26/varef/scripts/noon_data/2.3.add_transposons/{chromosome}.csv.gz"

(
    result.write
    .mode("overwrite")
    .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
    .option("delimiter", "\t")
    .option("header", True)
    .csv(output_path)
)

DataFrame[CHROM: string, POS: int, REF: string, ALT: string, ID: string, QUAL: string, FILTER: string, INFO: string, K562__ref: double, HepG2__ref: double, SKNSH__ref: double, K562__alt: double, HepG2__alt: double, SKNSH__alt: double, K562__skew: double, HepG2__skew: double, SKNSH__skew: double, AC: int, AN: int, AF: double, cadd_phred: double, is_in_dELS: boolean, is_in_CA: boolean, is_in_pELS: boolean, is_in_CA-H3K4me3: boolean, is_in_CA-CTCF: boolean, is_in_PLS: boolean, is_in_TF: boolean, is_in_CA-TF: boolean, P_ANNO: double, mean_ref: double, mean_skew: double, MAF: double, category: string, roulette_PN: string, roulette_MR: double, roulette_MG: double, in_rep: boolean]