In [2]:
import pandas as pd 
import subprocess
import os

In [3]:

# Directory layout, resolved from the directory the notebook is run from.
current_dir = os.path.abspath("")
scripts_path = os.path.join(current_dir, "scripts")

# Data inputs and processed tables live one level above the notebook directory.
data_path = os.path.join(current_dir, "..", "data")
original_data_path = os.path.join(data_path, "original_data")
model_data_path = os.path.join(data_path, "processed_data", "replicated_models")

# Model results go to a sibling folder; this path is kept relative, not
# anchored on current_dir.
model_output_path = os.path.join("..", "model_outputs")

In [4]:
# Formatted expression tables read from the original-data folder, one path per
# library; the order of the names matches the order of the file names below.
tss_path, scramble_path, peak_path, flp3_path, rlp6_path = (
    os.path.join(original_data_path, expression_file)
    for expression_file in (
        "rLP5_Endo2_lb_expression_formatted.txt",
        "endo_scramble_expression_formatted.txt",
        "peak_tile_expression_formatted.txt",
        "fLP3_Endo2_lb_expression_formatted.txt",
        "rLP6_Endo2_lb_expression_formatted.txt",
    )
)

In [18]:
def generate_processed_std_files():
    """Run ``standardize_expression.R`` on the formatted expression files.

    Invokes ``Rscript`` with the script found in ``scripts_path`` followed by
    ``tss_path``, ``scramble_path``, ``peak_path``, ``flp3_path`` and
    ``rlp6_path``, then prints the captured stdout/stderr and a one-line
    status derived from the exit code.

    Raises:
        FileNotFoundError: if the ``Rscript`` executable cannot be found.
    """
    r_script_path = os.path.join(scripts_path, "standardize_expression.R")
    # The argument list is passed without shell=True: combined with a list,
    # shell=True on POSIX executes only the first item ("Rscript") and hands
    # the remaining items to the shell itself, so the script never receives
    # its path or inputs.
    result = subprocess.run(
        ["Rscript", r_script_path, tss_path, scramble_path, peak_path, flp3_path, rlp6_path],
        capture_output=True,
        text=True,
    )

    print("STDOUT:\n", result.stdout)
    print("STDERR:\n", result.stderr)

    if result.returncode == 0:
        print("R script ran successfully.")
    else:
        print(f"R script failed with return code {result.returncode}.")

generate_processed_std_files()

STDOUT:
 
STDERR:
 
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Adding missing grouping variables: `tss_name`

R script ran successfully.


In [19]:
# Run define_genome_splits.py on the model-format expression table.
python_path = os.path.join(scripts_path, "define_genome_splits.py")
model_format_path = os.path.join(model_data_path, "tss_expression_model_format.txt")
# NOTE(review): 0.75 and 4639675 are presumably the training fraction and the
# genome length, and the table path is deliberately passed twice -- confirm
# against the script's argument order.
# The argument list is passed without shell=True: combined with a list,
# shell=True on POSIX executes only the first item ("python") and drops the rest.
result = subprocess.run(
    ["python", python_path, str(0.75), str(4639675), model_format_path, model_format_path],
    capture_output=True,
    text=True,
)

print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)

# The child process is a Python script, so the status message says so.
if result.returncode == 0:
    print("Python script ran successfully.")
else:
    print(f"Python script failed with return code {result.returncode}.")

STDOUT:
 
STDERR:
 
R script ran successfully.


In [None]:
# Train/test genome-split tables handed to calculate_pwm.R.
train_name, test_name = (
    os.path.join(model_data_path, split_file)
    for split_file in (
        "tss_expression_model_format_train_genome_split.txt",
        "tss_expression_model_format_test_genome_split.txt",
    )
)

# Third argument given to calculate_pwm.R.
output_name = os.path.join(
    model_data_path,
    "tss_expression_pwm_info.txt",
)

In [26]:
def get_PWM_model_results():
    """Run ``calculate_pwm.R`` on the current train/test split files.

    Invokes ``Rscript`` with the script found in ``scripts_path`` and the
    module-level ``train_name``, ``test_name`` and ``output_name`` paths, then
    prints the captured stdout/stderr and a one-line status derived from the
    exit code.

    Raises:
        FileNotFoundError: if the ``Rscript`` executable cannot be found.
    """
    r_script_path = os.path.join(scripts_path, "calculate_pwm.R")
    # The argument list is passed without shell=True: combined with a list,
    # shell=True on POSIX executes only the first item ("Rscript") and hands
    # the remaining items to the shell itself, so the script never receives
    # its path or inputs.
    result = subprocess.run(
        ["Rscript", r_script_path, train_name, test_name, output_name],
        capture_output=True,
        text=True,
    )

    print("STDOUT:\n", result.stdout)
    print("STDERR:\n", result.stderr)

    if result.returncode == 0:
        print("R script ran successfully.")
    else:
        print(f"R script failed with return code {result.returncode}.")

get_PWM_model_results()
    

STDOUT:
 
STDERR:
 
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading required package: BiocGenerics
Loading required package: generics

Attaching package: 'generics'

The following object is masked from 'package:dplyr':

    explain

The following objects are masked from 'package:base':

    as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
    setequal, union


Attaching package: 'BiocGenerics'

The following object is masked from 'package:dplyr':

    combine

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
    order, p

In [31]:
def run_linear_model():
    """Run ``linear-model.R`` on the PWM feature table.

    Invokes ``Rscript`` with the script found in ``scripts_path``, the
    ``tss_expression_pwm_info.txt`` table under ``model_data_path`` and the
    ``pwm_linear_model_results.txt`` path under ``model_output_path``, then
    prints the captured stdout/stderr and a one-line status derived from the
    exit code.

    Raises:
        FileNotFoundError: if the ``Rscript`` executable cannot be found.
    """
    r_script_path = os.path.join(scripts_path, "linear-model.R")
    output_path = os.path.join(model_output_path, "pwm_linear_model_results.txt")
    model_input = os.path.join(model_data_path, "tss_expression_pwm_info.txt")
    # The argument list is passed without shell=True: combined with a list,
    # shell=True on POSIX executes only the first item ("Rscript") and hands
    # the remaining items to the shell itself, so the script never receives
    # its path or inputs.
    result = subprocess.run(
        ["Rscript", r_script_path, model_input, output_path],
        capture_output=True,
        text=True,
    )

    print("STDOUT:\n", result.stdout)
    print("STDERR:\n", result.stderr)

    # Check exit status
    if result.returncode == 0:
        print("R script ran successfully.")
    else:
        print(f"R script failed with return code {result.returncode}.")

run_linear_model()

STDOUT:
 
Call:
lm(formula = expn_med_fitted_scaled ~ minus35_max_score + minus10_max_score + 
    pwm_paired_max + gc_content, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
 -2.146  -0.676  -0.436  -0.149 122.136 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.87124    0.86012   1.013  0.31111    
minus35_max_score  0.19171    0.07373   2.600  0.00933 ** 
minus10_max_score  0.09531    0.03209   2.970  0.00298 ** 
pwm_paired_max     0.11934    0.11199   1.066  0.28663    
gc_content        -4.08736    0.68706  -5.949 2.77e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.706 on 13113 degrees of freedom
Multiple R-squared:  0.007904,	Adjusted R-squared:  0.007601 
F-statistic: 26.12 on 4 and 13113 DF,  p-value: < 2.2e-16

$mae
[1] 0.8806889

$mse
[1] 23.8038

$rmse
[1] 4.878914

$mape
[1] 2.176719

$nmse
[1] 122.2779

$rstd
[1] 9.502867


Call:
lm(formula = logged ~ 

In [9]:
# Run define_genome_splits.py again, this time with the --classification flag.
python_path = os.path.join(scripts_path, "define_genome_splits.py")
model_format_path = os.path.join(model_data_path, "tss_expression_model_format.txt")
# NOTE(review): 0.75 and 4639675 are presumably the training fraction and the
# genome length, and the table path is deliberately passed twice -- confirm
# against the script's argument order.
# The argument list is passed without shell=True: combined with a list,
# shell=True on POSIX executes only the first item ("python") and drops the rest.
result = subprocess.run(
    [
        "python", python_path, str(0.75), str(4639675), "--classification",
        model_format_path, model_format_path,
    ],
    capture_output=True,
    text=True,
)

print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)

# The child process is a Python script, so the status message says so.
if result.returncode == 0:
    print("Python script ran successfully.")
else:
    print(f"Python script failed with return code {result.returncode}.")

STDOUT:
 
STDERR:
 
R script ran successfully.


In [None]:
# Classification variants of the genome-split tables; these are the inputs
# given to kmer_feature_generator.py.
train_name, test_name = (
    os.path.join(model_data_path, split_file)
    for split_file in (
        "tss_expression_model_format_train_genome_split_classification.txt",
        "tss_expression_model_format_test_genome_split_classification.txt",
    )
)

# Matching output paths given to kmer_feature_generator.py.
train_output_name, test_output_name = (
    os.path.join(model_data_path, feature_file)
    for feature_file in (
        "tss_scramble_peak_expression_model_format_train_genome_split_classification_3to6mer.txt",
        "tss_scramble_peak_expression_model_format_test_genome_split_classification_3to6mer.txt",
    )
)


In [10]:
# Run kmer_feature_generator.py on the classification train/test splits.
python_path = os.path.join(scripts_path, "kmer_feature_generator.py")
# Not used by this cell's command; retained so the name stays defined as before.
model_format_path = os.path.join(model_data_path, "tss_expression_model_format.txt")
# The trailing 3 and 6 are passed as strings; the output file names carry the
# "3to6mer" suffix.
# The argument list is passed without shell=True: combined with a list,
# shell=True on POSIX executes only the first item ("python") and drops the rest.
result = subprocess.run(
    ["python", python_path, train_name, test_name, train_output_name, test_output_name, str(3), str(6)],
    capture_output=True,
    text=True,
)

print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)

# The child process is a Python script, so the status message says so.
if result.returncode == 0:
    print("Python script ran successfully.")
else:
    print(f"Python script failed with return code {result.returncode}.")

STDOUT:
 loaded training data
loaded test data
Train:
Counting k-mers of length 3 in all sequences...
Elapsed time: 0.3802261999999246
Creating features...
Test:
Counting k-mers of length 3 in all sequences...
Elapsed time: 0.12358260000019072
Creating features...
Train:
Counting k-mers of length 4 in all sequences...
Elapsed time: 0.3921602999998868
Creating features...
Test:
Counting k-mers of length 4 in all sequences...
Elapsed time: 0.14178950000041368
Creating features...
Train:
Counting k-mers of length 5 in all sequences...
Elapsed time: 0.4466474000000744
Creating features...
Test:
Counting k-mers of length 5 in all sequences...
Elapsed time: 0.14060950000020966
Creating features...
Train:
Counting k-mers of length 6 in all sequences...
Elapsed time: 0.4751916999994137
Creating features...
Test:
Counting k-mers of length 6 in all sequences...
Elapsed time: 0.14325790000020788
Creating features...
Writing training set...
Writing test set...

STDERR:
 
R script ran successfully.

In [11]:
# Predictions file path passed to kmer_regression_models.py.
log_output_path = os.path.join(
    model_data_path,
    "kmer_logistic_predictions.txt",
)

In [None]:
# Run kmer_regression_models.py on the k-mer feature tables.
python_path = os.path.join(scripts_path, "kmer_regression_models.py")
# NOTE(review): the flag was previously spelled "--classifcation"; corrected to
# "--classification" to match the spelling used elsewhere in this notebook --
# confirm against the options kmer_regression_models.py actually defines.
# NOTE(review): the model argument is "linear" while the output file is named
# "kmer_logistic_predictions.txt" -- confirm the intended model type.
# The argument list is passed without shell=True: combined with a list,
# shell=True on POSIX executes only the first item ("python") and drops the rest.
result = subprocess.run(
    ["python", python_path, train_output_name, test_output_name, log_output_path, "linear", "--classification"],
    capture_output=True,
    text=True,
)

print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)

# The child process is a Python script, so the status message says so.
if result.returncode == 0:
    print("Python script ran successfully.")
else:
    print(f"Python script failed with return code {result.returncode}.")