# Script used to reformat and export Hail covariate and phenotype tables for use with Regenie

In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Configure Bokeh to render its output inline in this notebook.
output_notebook()

In [None]:
# Google Cloud Storage prefixes used throughout the notebook. Each prefix
# below the root ends with a trailing slash so callers can append a relative
# path directly.
UKBbucket_root = 'gs://rbif120data'
UKBbucket = f'{UKBbucket_root}/ukb/'
UKBbucket_pheno = f'{UKBbucket}pheno/'
import hail as hl

# Hail

In [None]:
# Workaround for a Spark UI problem: remove any "spark.ui.proxyBase" entry
# inherited from the process environment. pop() with a default does nothing
# when the variable is not set.
# NOTE(review): confirm that this setting actually reaches Spark through an
# environment variable of this name.
import os
os.environ.pop("spark.ui.proxyBase", None)
# JVM-side alternative kept for reference (not Python syntax):
#sys.props.update("spark.ui.proxyBase", "")

In [None]:
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Drop any inherited PYSPARK_SUBMIT_ARGS; a fresh value is assigned below.
os.environ.pop("PYSPARK_SUBMIT_ARGS", None)

# Absolute path of the Hail JAR inside the conda environment.
hail_jar_path = "/opt/conda/lib/python3.10/site-packages/hail/backend/hail-all-spark.jar"

# Submit arguments are set before the SparkContext is created further down.
# The pieces are joined by implicit string concatenation into one
# space-separated argument string.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    f'--jars {hail_jar_path} '
    f'--driver-class-path {hail_jar_path} '
    '--conf spark.executor.extraClassPath=./hail-all-spark.jar '
    'pyspark-shell'
)

# Spark configuration: Kryo serialization with Hail's registrator, driver
# memory and result-size limits, an empty UI proxy base, and the Hail JAR on
# the driver and executor class paths.
conf = SparkConf().setAppName("Hail").setAll([
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator"),
    ("spark.driver.memory", "32g"),
    ("spark.ui.proxyBase", ""),
    ("spark.driver.maxResultSize", "8g"),
    ("spark.kryoserializer.buffer.max", "2047m"),
    ("spark.jars", hail_jar_path),
    ("spark.driver.extraClassPath", hail_jar_path),
    ("spark.executor.extraClassPath", "./hail-all-spark.jar"),
])

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# Initialise Hail on top of the SparkContext created above.
# NOTE(review): confirm whether hl.init() returns anything useful; the name
# `hail_context` is not used by the cells visible here.
hail_context = hl.init(sc)

## Export Hail covariates table to text file for Regenie input

In [None]:
import pandas as pd
# Load the NPX processing table from the phenotype prefix.
td_pro = hl.read_table(f"{UKBbucket_pheno}npx_processing")
# NOTE(review): the count is computed but its result is not used in this cell.
td_pro.count()

# Load the phenotype table from the same prefix.
tablePhe = hl.read_table(f"{UKBbucket_pheno}AllPhenosCat2")

# Add derived age terms: age squared, plus the products of age and of age
# squared with (sex + 1).
tablePhe = tablePhe.annotate(
    age_squared=tablePhe.age ** 2,
    age_sex=tablePhe.age * (tablePhe.sex + 1),
    age_squared_sex=(tablePhe.age ** 2) * (tablePhe.sex + 1),
)

# Attach the NPX processing fields. 'inner' is the join's default mode and is
# spelled out here for clarity.
tablePhe = tablePhe.join(td_pro, how='inner')

# Fields selected into the covariate table: batch and timing terms, age/sex
# terms (including the derived ones), and principal components PC1..PC10.
covariates = [
    'Batch', 'days_to_olink',
    'age', 'sex', 'age_squared', 'age_sex', 'age_squared_sex',
    *(f'PC{i}' for i in range(1, 11)),
]

# Field names kept in a separate list; not referenced by the cells visible
# here.
donttest = ['gSex', 'gMissing', 'Caucasian', 'Aneupleoidy', 'hetOutlier']

# tablePhe.head(3).show()

In [None]:
# Select the desired covariates from the phenotype table. The field names
# come from the `covariates` list and are unpacked as positional arguments.
# NOTE(review): presumably the table's key field(s) are retained by select();
# confirm against the exported header.
tableCov = tablePhe.select(*covariates)
# tableCov.head(3).show()

In [None]:
# Write the covariate table as a text file under the regenie/ prefix.
# NOTE(review): unlike the phenotype export later in this notebook, no FID/IID
# columns are added here -- confirm the exported header matches what Regenie
# expects for a covariate file.
tableCov.export(UKBbucket + "regenie/covariates.txt")

In [None]:
# Shut down Hail; the remaining cells use pandas only.
hl.stop()

In [None]:
# Stop the SparkContext created during setup.
# NOTE(review): hl.stop() may already have stopped it -- confirm whether this
# call is still needed.
sc.stop()

### Convert Olink protein data (.csv) to Regenie input format (tab-separated .txt with FID and IID as the first two columns)

In [None]:
import pandas as pd
# Load the NPX CSV from the bucket into a DataFrame.
# NOTE(review): the following cells treat the LAST column as the sample ID --
# confirm against the file's header.
df = pd.read_csv(UKBbucket + "npx/npx_rint.csv")

In [None]:
# Column labels as a plain Python list, in file order.
cols = df.columns.tolist()

In [None]:
# Move the last column to the front and list it twice on purpose: the two
# leading copies are renamed to FID and IID in a later cell.
cols = cols[-1:] * 2 + cols[:-1]
#cols[0:5]

In [None]:
# Reorder the DataFrame to match `cols`. Because the last label appears twice
# in `cols`, the result holds two copies of that column at the front.
df = df[cols]
#df.columns.tolist()[0:5]

In [None]:
# Rename the two leading (duplicated) columns to FID and IID; every other
# column keeps its original label.
df.columns = ['FID', 'IID', *cols[2:]]
#df.columns.tolist()[0:5]

In [None]:
# Write the phenotype table as tab-separated text without the pandas index;
# missing values are written as the literal string NA.
# NOTE(review): the file name ends in "_bin" although the input file is named
# npx_rint.csv -- confirm the name reflects the intended trait type.
df.to_csv(UKBbucket + "regenie/phenotype_bin.txt", sep='\t', index=False, na_rep='NA')