In [36]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F
import pyspark.sql.types as T

In [37]:
# importlib is used only for debugging: reloading graphing_config makes edits to that script
# take effect here. It can be removed if you are not changing the scripts being imported.

import importlib ##can remove
import graphing_config as gc
importlib.reload(gc) ##can remove


<module 'graphing_config' from '/gpfs/gibbs/pi/reilly/VariantEffects/scripts/noon_scripts/5.graphs/logistic_regressions/graphing_config.py'>

In [38]:
# Spark configuration for this notebook; only the application name is set.
conf = SparkConf().setAppName("logreg_malin_vs_rare/common")

# When this cell is re-run, stop the session left over from the previous run
# before a new context is created below.
if globals().get("spark") is not None:
    spark.stop()

# Create a SparkContext with the specified configuration
sc = SparkContext(conf=conf)

# Create a SparkSession from the SparkContext
spark = SparkSession(sc)

In [54]:
# Read all files matching chr*/*.csv.gz into one DataFrame. The first row of each
# file is used as the header, "," is the field delimiter and "#" is the comment character.
df = (
    spark.read
    .options(comment="#", delimiter=",")
    .csv(
        "/gpfs/gibbs/pi/reilly/VariantEffects/scripts/noon_data/3.pleio_and_filter/chr*/*.csv.gz",
        header=True,
    )
)

                                                                                

In [55]:
# Columns grouped by the Spark type they are cast to.
int_columns = ["POS", "AC", "AN", "pleio"]
float_columns = [
    "AF",
    "K562__ref", "HepG2__ref", "SKNSH__ref",
    "K562__alt", "HepG2__alt", "SKNSH__alt",
    "K562__skew", "HepG2__skew", "SKNSH__skew",
    "cadd_phred", "P_ANNO", "mean_ref", "mean_skew", "MAF",
]
# Every column whose name starts with "is_in" is cast to boolean.
cre_bool_columns = [name for name in df.columns if name.startswith("is_in")]
emvar_bool_columns = ["emVar_K562", "emVar_SKNSH", "emVar_HepG2"]

# Apply the casts group by group, in the order: integers, floats, booleans.
cast_plan = [
    (int_columns, T.IntegerType()),
    (float_columns, T.FloatType()),
    (cre_bool_columns + emvar_bool_columns, T.BooleanType()),
]
for column_group, spark_type in cast_plan:
    for name in column_group:
        df = df.withColumn(name, F.col(name).cast(spark_type))

In [56]:
# Label column: whether "category" is one of gc.rare_classes, stored as an integer.
is_rare = F.col("category").isin(gc.rare_classes)
df = df.withColumn("rare_bool", is_rare.cast("integer"))

In [57]:
# Keep only rows flagged is_in_PLS, then drop rows with a null in either
# feature column or in the label.
df = (
    df.filter(F.col("is_in_PLS") == F.lit(True))
    .dropna(how="any", subset=["mean_skew", "mean_ref", "rare_bool"])
)

In [58]:
# Assemble the two predictor columns into a single vector column named "features".
feature_columns = ["mean_skew", "mean_ref"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [59]:
# Apply the assembler to the filtered data.
prepared_df = assembler.transform(dataset=df)

In [60]:
# Random split with weights 0.7 (train) / 0.3 (test), using a fixed seed.
train_data, test_data = prepared_df.randomSplit(weights=[0.7, 0.3], seed=42)

In [61]:
# Logistic regression that predicts rare_bool from the assembled feature vector;
# all other hyperparameters are left at the library defaults.
lr = LogisticRegression(
    labelCol="rare_bool",
    featuresCol="features",
)

In [62]:
# Single-stage pipeline: only the classifier is included. The VectorAssembler is
# applied to the data beforehand, outside the pipeline.
pipeline = Pipeline().setStages([lr])

In [63]:
# Fit the pipeline on the training split.
model = pipeline.fit(dataset=train_data)

24/02/26 18:48:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/02/26 18:48:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

In [64]:
# Apply the fitted model to the held-out test split.
predictions = model.transform(dataset=test_data)

In [65]:
# Binary-classification evaluator on the rare_bool label. The metric is spelled out
# explicitly; "areaUnderROC" is the library default, so the result is unchanged.
evaluator = BinaryClassificationEvaluator(
    labelCol="rare_bool",
    metricName="areaUnderROC",
)

In [66]:
# The evaluator is built with its default metric, which is area under the ROC curve,
# so the value computed here is an AUC and not a classification accuracy.
test_auc_roc = evaluator.evaluate(predictions)
# Backward-compatible alias: a later cell reads this value under the name `accuracy`.
accuracy = test_auc_roc

                                                                                

In [67]:
# Label the value with the evaluator's actual metric name (areaUnderROC by default);
# `accuracy` holds that metric, not classification accuracy.
print(f"Test {evaluator.getMetricName()}: {accuracy}")

Test Accuracy: 0.5215079085730067


In [68]:
# Persist the fitted pipeline model. overwrite() lets this cell be re-run:
# a plain save() raises when the target path already exists.
model.write().overwrite().save("./model")

In [70]:
# Show how many rows of the filtered data fall into each rare_bool class.
label_counts = df.groupBy("rare_bool").count()
label_counts.show()



+---------+-------+
|rare_bool|  count|
+---------+-------+
|        1|1777086|
|        0|  73588|
+---------+-------+



                                                                                