In [1]:
import pandas as pd
import sbol2
import os
import requests
from dotenv import load_dotenv
import zipfile

pd.set_option('display.max_columns', None)

current_dir = os.path.abspath('')
data_path = os.path.join(current_dir, '..', 'data')
attachments_path = os.path.join(current_dir, '..', 'attachments')
pulled_attachments_path = os.path.join(current_dir, '..', 'pulled_attachments')
sbol_path = os.path.join(current_dir, '..', 'sbol_data')
downloaded_sbol_path = os.path.join(current_dir, '..', 'downloaded_sbol')

load_dotenv()

True

In [None]:
df = pd.read_csv(os.path.join(data_path, "frag-rLP5_LB_expression.txt"), delimiter=" ")
# Expression levels from genomic fragment MPRA (random 200–300 bp sheared fragments) in LB media	
df2 = pd.read_csv(os.path.join(data_path, "frag-rLP5-M9_expression.txt"), delimiter=" ")
# Expression levels from genomic fragment MPRA (random 200–300 bp sheared fragments) in M9 media at rLP5

In [None]:
df.head(10)

In [None]:
df_new

In [None]:
def create_sbol_doc(df, file_path, media="LB"):
    doc = sbol2.Document()
    sbol2.setHomespace('http://github.com/cywlol/promoters')
    doc.displayId = "E_coli_promoters"
    # save labels for later when we do the attachments through api
    exp_data_labels = []
    attachment_file_names = []

    media_label_MD = media
    chassis_label_MD = "E_coli_chassis"

    # define the media once, then have it as a reference for each promoter row
    media_md = sbol2.ModuleDefinition(media_label_MD)
    doc.addModuleDefinition(media_md)
    media_md.addRole("http://identifiers.org/ncit/NCIT:C48164") 
    
    # define the chassis once, then have it as a reference for each promoter row
    chassis_md = sbol2.ModuleDefinition(chassis_label_MD) 
    doc.addModuleDefinition(chassis_md)
    chassis_md.addRole("http://identifiers.org/ncit/NCIT:C14419")
    
    #attach genome 
    chassis_md.wasDerivedFrom = ["https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=511145"]

    for i, row in df.iterrows():
        fragment_seq = row["fragment"]

        # Establish the identities of each component, ensuring that each one is unique
        promoter_label_CD = f"promoter_{i}"
        promoter_seq_label = f"promoter_seq_{i}"
        sample_design_label_CD = f"sample_design_{i}"
        strain_label_MD = f"strain_{i}"
        data_label = f"exp_data_{i}"
        exp_label = f'sample_{i}_expression_data'
        attachment_file_name = f'frag_{media}_exp_sample_{i}.csv'

        #engr_region_label_CD = f"engr_region_{i}"
        #location_promoter_annotation = f"location_promoter_annotation_{i}"
        #location_promoter_label = f"location_promoter_label_{i}"
        
        # Promoter and Sequence 
        promoter_cd = sbol2.ComponentDefinition(promoter_label_CD, sbol2.BIOPAX_DNA)
        promoter_cd.roles = [sbol2.SO_PROMOTER]
        seq = sbol2.Sequence(promoter_seq_label, fragment_seq, sbol2.SBOL_ENCODING_IUPAC)
        doc.addSequence(seq)
        promoter_cd.sequences = [seq.persistentIdentity]
        doc.addComponentDefinition(promoter_cd)

        
        # Add strain module definition
        strain_md = sbol2.ModuleDefinition(strain_label_MD)
        strain_md.addRole("http://identifiers.org/ncit/NCIT:C14419")
        doc.addModuleDefinition(strain_md)
        strain_c1 = strain_md.modules.create('chassis')
        strain_c1.definition = chassis_md.persistentIdentity
        strain_c2 = strain_md.functionalComponents.create('promoter')
        strain_c2.definition = promoter_cd.persistentIdentity
    
        #annotation = sbol2.SequenceAnnotation("promoter_location")
        #range = sbol2.Range("prange", start, end)
    
        #if (strand == "-"):
        #    range.orientation = sbol2.SBOL_ORIENTATION_REVERSE_COMPLEMENT
                
        #annotation.locations.add(range)
        #promoter_cd.sequenceAnnotations.add(annotation)
        '''
        # Engineered Region 
        engineered_cd = sbol2.ComponentDefinition(engr_region_label_CD, sbol2.BIOPAX_DNA)
        engineered_cd.roles = ["https://identifiers.org/so/SO:0000804"]
        sub = engineered_cd.components.create('promoter')
        sub.definition = promoter_cd.persistentIdentity
        doc.addComponentDefinition(engineered_cd)
        # No sequence?
        '''
        
        #strain_c2 = strain_md.functionalComponents.create('engineered_region')
        #strain_c2.definition = strain_md.persistentIdentity
        
        # Sample Design  
        sample_md = sbol2.ModuleDefinition(sample_design_label_CD)
        doc.addModuleDefinition(sample_md)
        sample_md.addRole("http://identifiers.org/obo/OBI:0000073")

        m_strain = sample_md.modules.create('strain')
        m_strain.definition = strain_md.persistentIdentity
        
        m_media = sample_md.modules.create('media')
        m_media.definition = media_md.persistentIdentity

        # Create the attachment data as a csv
        rna_series = row[["RNA_exp_1", "RNA_exp_2", "RNA_exp_ave"]]
        rna_df = pd.DataFrame([rna_series])  
        rna_df.to_csv(os.path.join(attachments_path, attachment_file_name), index=False)
        
        # exp_attachment = sbol2.Attachment(exp_label)
        # exp_attachment.name = f'fragmentation expression at rLP5 for sample {i}'
        # exp_attachment.description = 'CSV including the gene expression of the sequence: RNA_exp1, RNA_exp2, and its average.'
        # exp_attachment.source = 'CSV_LINK_HERE' # update when added attachment to SBOL collection
        # exp_attachment.format = 'https://identifiers.org/edam/format_3752'
        # doc.addAttachment(exp_attachment)
        
        # Experiment and Measurement Data
        exp = sbol2.ExperimentalData(exp_label)
        exp.wasDerivedFrom = sample_md.persistentIdentity
        doc.add(exp)

        exp_data_labels.append(exp_label)
        attachment_file_names.append(attachment_file_name)

        if (i == 5):
            break
            
    report = doc.validate()
    if (report == 'Valid.'):
        doc.write('promoters.xml')
    else:
        print(report)
    return exp_data_labels, attachment_file_names, doc, chassis_label_MD
        
def partshop_attach_exp_data(synbio_username, collection_name, file_names, email, password, exp_data_labels, version=1):
    shop = sbol2.PartShop("https://synbiohub.org")
    print(shop.login(email, password))
    exp_labels = ["ExperimentalData_" + label for label in exp_data_labels]  
    for label, file_name in zip(exp_labels, file_names):
        path = os.path.join(attachments_path, file_name)
        attachment_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{label}/{version}"
        shop.attachFile(attachment_uri, path)

def partshop_attach_genome_to_md(synbio_username, collection_name, file_path, email, password, chassis_label, version=1):
    shop = sbol2.PartShop("https://synbiohub.org")
    print(shop.login(email, password))

    label = "ModuleDefinition_" + chassis_label
    attachment_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{label}/{version}"

    shop.attachFile(attachment_uri, file_path)

def create_synbio_collection(email, password, file_path, id, name, description, version='1'):
    response = requests.post(
        "https://synbiohub.org/login",
        headers={"Accept": "text/plain"},
        data={"email": email, "password": password}
    )
        
    if response.ok:
        token = response.text.strip() # theres a whitespace before the token for some reason
        response = requests.post(
        'https://synbiohub.org/submit',
        headers={
            'X-authorization': token,
            'Accept': 'text/plain'
        },
        files={
        'files': open(file_path,'rb'),
        },
        data={
            'id': id,
            'version' : version,
            'name' :  name,
            'description' : description,
            'citations' : '',
            'overwrite_merge' : '0'
        },
    
    )
    else:
        print("Login failed:", response.status_code)
        print(response.text)

def partshop_pull(email, password, synbio_username, collection_name, file_path, version=1):
    shop = sbol2.PartShop("https://synbiohub.org")
    doc = sbol2.Document()
    shop.login(email, password)

    collection_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{collection_name}_collection/{version}"
    s = shop.pull(collection_uri, doc)
    
    for obj in doc:
        print(obj)   
    
    doc.write(file_path)

def download_all_attachments(email, password, doc, file_path):
    shop = sbol2.PartShop("https://synbiohub.org")
    shop.login(email, password)
    for attachment in doc.attachments:
        shop.downloadAttachment(attachment.identity, filepath=file_path)

In [2]:
ecoli_genome_file_name = "E. coli.fasta"
env_email = os.getenv("SYNBIO_EMAIL")
env_password = os.getenv("SYNBIO_PASSWORD")

username = "cywong"
output_name = 'ecolipromoters.xml'
id = "Ecolipromoterexpdata"
name = "E coli promoter data exploration"
description = "A collection containing the extracted E coli data from paper"
sbol_file_name = output_name
imported_sbol_file_name = "promoters_import.xml"

In [3]:
exp_labels, attachment_file_names, doc, chassis_label = create_sbol_doc(df, os.path.join(sbol_path, output_name))       
#create_synbio_collection(env_email, env_password, os.path.join(sbol_path, sbol_file_name), id, name, description)
#partshop_attach_exp_data(username, id, attachment_file_names, env_email, env_password, exp_labels)
#partshop_attach_genome_to_md(username, id, os.path.join(attachments_path, ecoli_genome_file_name), env_email, env_password, chassis_label)

NameError: name 'create_sbol_doc' is not defined

In [None]:
partshop_pull(env_email, env_password, username, id, os.path.join(downloaded_sbol_path, imported_sbol_file_name), version=1)

In [None]:
# download_all_attachments(env_email, env_password, doc, pulled_attachments_path)

In [None]:
doc = sbol2.Document()
doc.read(os.path.join(downloaded_sbol_path, imported_sbol_file_name))

df_new = pd.DataFrame()

for exp in doc.experimentalData:
    attachment = doc.get(exp.attachments[0])
    sample_design = doc.get(exp.wasDerivedFrom[0])

    # may need to be changed
    strain = doc.get(sample_design).modules[0] if 'strain' in doc.get(sample_design).modules[0].identity else doc.get(sample_design).modules[1]

    promoter =  doc.get(doc.get(strain.definition).functionalComponents[0].definition)
    promoter_seq = doc.get(promoter.sequences[0]).elements
    
    df1 = pd.read_csv(os.path.join(pulled_attachments_path, attachment.name))
    df1['Sequence'] = promoter_seq
    df_new = pd.concat([df_new, df1], ignore_index=True)


# for mod in doc.moduleDefinitions:
#     print("Mod: ", mod.identity)
# for attachment in doc.attachments:
#     print("Attachment: ", attachment.source) 
    
# for seq in doc.sequences:
#     print("Seq:", seq.elements) 



In [4]:
import pandas as pd
import os
current_dir = os.path.abspath('')

data_path = os.path.join(current_dir, '..', 'data')

df3 = pd.read_csv(os.path.join(data_path, "endo_scramble_expression_formatted_std.txt"), delimiter= "\t")
df3

# Positive promoter controls are synthetic thus have no actual location data. This refers to the 2,000 out of the 17,000 TSS they found, scrambled them to find the functional sites, including controls, scrambled variants, and unscrambled
    # name: ID represented as {[TSS name][genomic position][strand + the scrambled region]}
    # tss_name: Name of the original (unscrambled) transcription start site (TSS) this variant is derived from.
    # tss_position: Start genome coordinate of the TSS.
    # strand: Strand direction
    # scramble_start: Start of the 10 bp scrambled window relative to var_left and var_right
    # scramble_end: End of the scrambled window relative to var_left and var_right
    # var_left: Genomic coordinate of the start (5' end) of the full 150 bp promoter variant.
    # var_right: Genomic coordinate of the end (3' end) of the full 150 bp promoter variant.
    # scramble_start_pos: Genomic coordinate where the scrambled 10 bp region begins.
    # scramble_end_pos: Genomic coordinate where the scrambled 10 bp region ends.
    # scramble_pos_rel_tss: Position of the scrambled region relative to the TSS.
    # variant: The sequence
    # RNA_exp_sum_1_1: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 1 (normalized by DNA?)
    # RNA_exp_sum_1_2: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 2.
    # RNA_exp_sum_2_1: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 2.
    # RNA_exp_sum_1: Sum of RNA expression across both technical replicates for biological replicate 1.
    # RNA_exp_sum_2: Sum of RNA expression across both technical replicates for biological replicate 2.
    # RNA_exp_sum_ave: (RNA_exp_sum_1 + RNA_exp_sum_2)/2)
    # DNA_1: DNA-seq count in biological replicate 1 (how much of this fragment was integrated).
    # DNA_2: DNA-seq count in biological replicate 2
    # DNA_ave: (DNA_1 + DNA_2) / 2
    # expn_med: Median RNA expression across all barcodes, normalized by DNA abundance (RNA/DNA).
    # num_barcodes_integrated: Number of barcodes actually integrated and measured in DNA/RNA-seq.
    # category: Label indicating whether this is scrambled, unscrambled, negative, positive control
    # unscrambled_exp: Measured promoter activity of the wild-type (unscrambled) version of this promoter.
    # relative_exp: (expn_med / unscrambled_exp)

df4 = pd.read_csv(os.path.join(data_path, "fLP3_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
df4
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the fLP3 site
    # name: Unique identifier for the promoter variant in the format [TSS_name,TSS_position,strand]
    # tss_name: The named identifier for the transcription start site (TSS), such as from RegulonDB or Storz 
    # tss_position: Genomic coordinate of the TSS — the position where transcription starts.
    # strand: Strand direction
    # start: Absolute genomic coordinate where the 150 bp promoter fragment starts.
    # end: Absolute genomic coordinate where the 150 bp promoter fragment ends.
    # variant: The promoter sequence
    # RNA_exp_sum_1_1: Sum of RNA-seq counts for this variant in biological replicate 1, technical replicate 1.
    # RNA_exp_sum_1_2: Sum of RNA-seq counts in biological replicate 1, technical replicate 2.
    # RNA_exp_sum_2_1: Sum of RNA-seq counts in biological replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Sum of RNA-seq counts in biological replicate 2, technical replicate 2.
    # RNA_exp_sum_1: Total RNA expression for biological replicate 1 
    # RNA_exp_sum_2: Total RNA expression for biological replicate 2 
    # RNA_exp_sum_ave: Average of RNA expression across the two biological replicates.
    # DNA_1: DNA-seq count in biological replicate 1 
    # DNA_2: DNA-seq count in biological replicate 2.
    # DNA_ave: Average of DNA_1 and DNA_2
    # expn_med: Median RNA/DNA expression across barcodes 
    # num_barcodes_integrated: Number of barcodes that were actually integrated and sequenced.
    # category: Type of promoter: ['tss', 'neg_control', 'pos_control']
    # active: Binary classification — "active" or "inactive" promoter based on expn_med


df5 = pd.read_csv(os.path.join(data_path,"peak_tile_expression_formatted_std.txt"), delimiter="\t")
df5
# The authors first used sheared genomic fragments (~200–300 bp) to find candidate promoter regions, and then designed a tiling oligo library: ~150 bp sequences overlapping by 10 bp Spanning all ~3,500 candidate promoter region, located at LB and nth-ydgR intergenic locus 
    # variant: The 150 bp DNA sequence (oligo) used in the MPRA tile
    # name: Identifier for the tile, usually formatted as “peak_start_peak_end_strand_posStart-posEnd”.
    # peak_start: Genomic start coordinate of the candidate promoter region (the whole peak region).
    # peak_end: Genomic end coordinate of the candidate promoter region.
    # strand: Strand direction
    # tile_start: Start of this 150 bp oligo relative to the peak
    # tile_end: End of the tile 
    # RNA_exp_sum_1_1: Sum of RNA reads for this tile in biological replicate 1, technical replicate 1
    # RNA_exp_sum_1_2: Same as above, but technical replicate 2
    # RNA_exp_sum_2_1: Sum of RNA reads in biological replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Same as above, but technical replicate 2.
    # RNA_exp_sum_1: Total RNA expression in biological replicate 1
    # RNA_exp_sum_2: Total RNA expression in biological replicate 2 
    # RNA_exp_sum_ave: Average RNA expression across the two biological replicates.
    # DNA_1: DNA read count in biological replicate 1
    # DNA_2: DNA read count in biological replicate 2.
    # DNA_ave: Average of DNA_1 and DNA_2
    # expn_med: Median RNA/DNA expression across all barcodes for this tile 
    # num_barcodes_integrated: Number of barcodes that were successfully integrated and sequenced.
    # category: ['tile', 'neg_control', 'random', 'pos_control']
    # peak_length: Length (in bp) of the peak region from which this tile was derived (e.g., 362 bp).
    # tile_start_relative: Position of the tile’s start within the peak (tile_start / peak_length)
    # start: Genomic start coordinate of the tile (based on tile_start + peak_start).
    # end: Genomic end coordinate of the tile.
    # active: Binary label indicating if the tile is active or inactive

df6 = pd.read_csv(os.path.join(data_path,"rLP5_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
df6
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP5 site

df7 = pd.read_csv(os.path.join(data_path, "rLP6_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
df7
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP6 site

        
def post_all_attachments(synbio_username, collection_name, file_names, email, password, exp_data_labels, version=1):
    sbh_url = "https://synbiohub.org/login" 
    sbh_email = email
    sbh_password = password
    
    response = requests.post(
        sbh_url,
        headers={"Accept": "text/plain"},
        data={"email": sbh_email, "password": sbh_password}
    )

    if response.ok:
        token = response.text.strip() # theres a whitespace before the token for some reason
        print(token)
        exp_labels = ["ExperimentalData_" + label for label in exp_data_labels]  
        
        for label, file_name in zip(exp_labels, file_names):
            attachment_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{label}/{version}/attach"
            print(attachment_uri)
            with open(file_name, 'rb') as f:
                response = requests.post(
                    attachment_uri,
                    headers={
                        'X-authorization': token,
                        'Accept': 'text/plain'
                    },
                    files={'file': f}
                )
    
            
            print("Status Code:", response.status_code)
            print("Response Body:", response.text)
    else:
        print("Login failed:", response.status_code)
        print(response.text)

def post_attachment(attachment_uri, file_path, email, password):
    sbh_url = "https://synbiohub.org/login" 
    sbh_email = email
    sbh_password = password
    
    response = requests.post(
        sbh_url,
        headers={"Accept": "text/plain"},
        data={"email": sbh_email, "password": sbh_password}
    )
    
    if response.ok:
        token = response.text.strip() # theres a whitespace before the token for some reason
        print(token)

        with open(file_path, 'rb') as f:
            response = requests.post(
                attachment_uri,
                headers={
                    'X-authorization': token,
                    'Accept': 'text/plain'
                },
                files={'file': f}
            )
        
        print("Status Code:", response.status_code)
        print("Response Body:", response.text)
    
    else:
        print("Login failed:", response.status_code)
        print(response.text)

#df1 and df2

# fragment: 150–300 bp genomic DNA sequence that were randomly sheared and barcoded
# RNA_exp_1: RNA expression level (replicate 1) – normalized measurement of transcript abundance for this fragment in the first RNA-Seq replicate. (normalized by DNA)
# RNA_exp_2: RNA expression level (replicate 2) – same as above, but from a second biological replicate.
# RNA_exp_ave: Average RNA expression – mean of RNA_exp_1 and RNA_exp_2
# DNA_sum_1: DNA integration abundance (replicate 1) – quantifies how much of this DNA fragment (or barcode) was actually integrated in the first DNA-Seq sample. 
# DNA_sum_2: DNA integration abundance (replicate 2) – same as above, from the second replicate.
# DNA_ave: Average DNA integration level – mean of DNA_sum_1 and DNA_sum_2
# num_integrated_barcodes: Number of barcodes that were integrated into the genome for this fragment. Typically should match num_mapped_barcodes unless there are sequencing/integration issues.
# start: Start coordinate (in bp) in the E. coli MG1655 reference genome (U00096.2). 
# end: End coordinate in the E. coli MG1655 reference genome (U00096.2). 
# strand: DNA strand orientation (+ or -) 
# variation: Standard deviation or variability in RNA measurement across barcode replicates 

In [9]:
import numpy as np, random
np.random.seed(1)
random.seed(1)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split  # sklearn >= 0.18
import sys
import argparse
num_epochs = 100


# if __name__ == '__main__':

# 	parser = argparse.ArgumentParser()
# 	parser.add_argument('train', help='''pre-defined training set, 
# 		one column sequence, one column expression. Tab-separated''')
# 	parser.add_argument('test', help='pre-defined test set')
# 	parser.add_argument('seq_length', type=int, help='length of input sequences')
# 	parser.add_argument('num_layers', type=int, help='number of convolutional layers')
# 	parser.add_argument('min_filter', type=int, help='minimum number of filters')
# 	parser.add_argument('max_filter', type=int, help='maximum number of filters')
# 	parser.add_argument('validation_fraction', type=float)
# 	parser.add_argument('num_trials', type=int, 
# 		help='number of hyperparameter trials')
# 	parser.add_argument('prefix', help='output prefix for saved model files')
# 	parser.add_argument('--validation', help='Optional pre-defined validation set')
	
# 	args = parser.parse_args()

# 	# load in pre-defined splits
# 	seq_length = args.seq_length
# 	print("loading training set...")
# 	X_train, y_train = process_seqs(args.train, seq_length)
# 	print("loading test set...")
# 	X_test, y_test = process_seqs(args.test, seq_length)

	
# 	num_layers = args.num_layers
# 	min_filter = args.min_filter
# 	max_filter = args.max_filter
# 	validation_fraction = args.validation_fraction
# 	num_hyperparameter_trials = args.num_trials
# 	prefix = args.prefix

# 	if args.validation:
# 		X_valid, y_valid = process_seqs(args.validation, seq_length)
# 	else:
# 		# split training into validation set
# 		X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, 
# 			test_size=validation_fraction)


# 	print('Starting hyperparameter search...')
# 	min_layer = 1
# 	max_layer = 4
# 	min_conv_width = 15
# 	max_conv_width = 20
# 	min_dropout = 0.1
# 	max_dropout = 0.9

# 	fixed_hyperparameters = {'seq_length': seq_length, 'num_epochs': num_epochs}
# 	grid = {'num_filters': ((min_filter, max_filter),), 'pool_width': (5, 40),
# 	        'conv_width': ((min_conv_width, max_conv_width),), 
# 	        'dropout': (min_dropout, max_dropout)}

# 	# number of convolutional layers        
# 	print("Number of convolutional layers: ", num_layers)
# 	filters = tuple([(min_filter, max_filter)] * num_layers)
# 	conv_widths = tuple([(min_conv_width, max_conv_width)] * num_layers)
# 	grid.update({'num_filters': filters, 'conv_width': conv_widths})

# 	# Backend is RandomSearch; if using Python 2, can also specify MOESearch
# 	# (requires separate installation)
# 	searcher = HyperparameterSearcher(SequenceDNN, fixed_hyperparameters, grid, 
# 		X_train, y_train, validation_data=(X_valid, y_valid), backend=RandomSearch)
# 	searcher.search(num_hyperparameter_trials)
# 	print('Best hyperparameters: {}'.format(searcher.best_hyperparameters))
# 	model = searcher.best_model
	
# 	# print test results
# 	print('Test results: {}'.format(model.score(X_test, y_test)))
	
# 	# save model
# 	model.save(prefix)
	
# 	# print predictions
# 	predictions = model.predict(X_test)
# 	test_sequences = [line.split('\t')[0] for line in open(args.test)]
	
# 	with open(prefix + '_predictions.txt', 'w') as outfile:
# 		for i in range(len(predictions)):
# 			outfile.write(
# 				test_sequences[i] + '\t' + 
# 				str(float(predictions[i])) + '\t' + 
# 				str(float(y_test[i])) + '\n')




In [16]:
def one_hot_encode_modern(sequences):
    sequences_array = np.array([[char for char in seq] for seq in sequences])
    sequence_length = sequences_array.shape[1]
    num_samples = sequences_array.shape[0]

    defined_categories = ['A', 'C', 'G', 'T', 'N']
    ohe = OneHotEncoder(sparse=False,
                        categories=[defined_categories] * sequence_length,
                        dtype=np.float32) # Still keep this as float32

    one_hot_encoding_flat = ohe.fit_transform(sequences_array)

    # Reshape to (num_samples, sequence_length, num_bases)
    one_hot_encoding_reshaped = one_hot_encoding_flat.reshape(
        num_samples, sequence_length, len(defined_categories))

    # Add the 'height' dimension (which will be 1 for a 1D sequence interpreted as 2D)
    # and ensure channels are last: (N, H, W, C) -> (num_samples, 1, sequence_length, 4)
    # This becomes (num_samples, height=1, width=sequence_length, channels=4)
    # Keras Conv2D with channels_last expects (batch, rows, cols, channels)
    # For sequence, if you treat rows=1, cols=seq_length, channels=4
    # then it should be (N, 1, seq_length, 4)
    final_one_hot_encoding = one_hot_encoding_reshaped[:, np.newaxis, :, :] # (N, 1, L, 4)
    # If your Conv2D is truly 2D over a sequence, this shape is fine.
    # If it's more like 1D conv, you might need (N, L, C) and then a Reshape for Conv2D.
    # Given your current Conv2D setup and input_shape:
    # Conv2D(..., input_shape=(1, 4, seq_length), data_format='channels_first')
    # You want Conv2D(..., input_shape=(1, seq_length, 4), data_format='channels_last')

    # Let's target: (num_samples, height, width, channels)
    # For your 4-channel DNA, it's (num_samples, 1, seq_length, 4) or just (num_samples, seq_length, 4) if it's a "1D" Conv2D.
    # Given your `conv_height=4` for the first layer, it seems you want the 4 bases as height.
    # This implies: (N, 4, seq_length, 1) or (N, seq_length, 4, 1) as input, if you want to use channels_last.
    # The original code's input_shape=(1, 4, seq_length) and conv_height=4 suggests NCHW (N, C, H, W) where C=1.
    # If C=1, then it's (N, 1, 4, seq_length) where 4 is the 'height' and seq_length is 'width'.
    # For channels_last, that would be (N, 4, seq_length, 1).

    # So, from (num_samples, sequence_length, len(defined_categories))
    # you want (num_samples, Height=4, Width=sequence_length, Channels=1)

    # Transpose to get channels last if your original was (N, C, H, W)
    # From (num_samples, sequence_length, 4) where 4 are the one-hot channels
    # We need (num_samples, H, W, C) where H=4, W=seq_length, C=1
    # This is a bit tricky given your initial Conv2D setup.

    # Let's simplify the one-hot encoding output:
    # Output directly (num_samples, seq_length, 4) as (batch, steps, features)
    # Then we'll reshape for Conv2D inside SequenceDNN if needed.
    # (N, L, 4) where L is sequence_length, 4 is features (A,C,G,T).
    return one_hot_encoding_reshaped # (N, L, 4)
    return final_one_hot_encoding

def pad_sequence(seq, max_length):
	if len(seq) > max_length:
		diff = len(seq) - max_length
		trim_length = int(diff / 2)
		seq = seq[trim_length : -(trim_length + diff%2)]
	else:
		seq = seq.center(max_length, 'N')
	return seq

def process_seqs(df, seq_length):
	padded_seqs = [pad_sequence(x, seq_length) for x in df['variant']]
	X = one_hot_encode_modern(np.array(padded_seqs))
	y = np.array(df['expn_med_fitted_scaled'], dtype=np.float32)
	return X, y


def split_data_by_peak(data_df):
    np.random.seed(123)

    # 1. Create 'peak_name' column (if not already present)
    # Assuming peak_start and peak_end are numeric, convert to string for concatenation
    data_df['peak_name'] = data_df['peak_start'].astype(str) + 'to' + data_df['peak_end'].astype(str)

    # 2. Get unique peak names
    peak_names = data_df['peak_name'].unique()

    # 3. Randomly sample train peak names (90%)
    train_size = int(0.90 * len(peak_names))
    train_peak_names = np.random.choice(peak_names, size=train_size, replace=False)

    # 4. Determine test peak names
    test_peak_names = np.array([p for p in peak_names if p not in train_peak_names])


    data_train = data_df[data_df['peak_name'].isin(train_peak_names)].copy()
    data_test = data_df[data_df['peak_name'].isin(test_peak_names)].copy()

    return data_train, data_test

Exception ignored in: <function BaseSession._Callable.__del__ at 0x000001B11311CAE8>
Traceback (most recent call last):
  File "C:\Users\Sai\anaconda3\envs\tf1.0\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\Sai\anaconda3\envs\tf1.0\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: No such callable handle: 0


In [18]:
X_train

array([[[0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        ...,
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        ...,
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.]],

       [[0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        ...,
        [0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.]],

       ...,

       [[0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        ...,
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]],

       [[0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        ...,
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.]],

       [[0.,

In [19]:
from __future__ import absolute_import, division, print_function
import numpy as np
import os
import subprocess
import sys
import tempfile
from abc import abstractmethod, ABCMeta

class Model(object):
    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(self, **hyperparameters):
        pass

    @abstractmethod
    def train(self, X, y, validation_data):
        pass

    @abstractmethod
    def predict(self, X):
        pass

    def test(self, X, y):
        return self.evaluate(X, y)
        # return ClassificationResult(y, self.predict(X))

    def score(self, X, y):
        pass



class SequenceDNN(Model):
    """
    Sequence DNN models, regression. No activation layer

    Parameters
    ----------
    seq_length : int, optional
        length of input sequence.
    keras_model : instance of keras.models.Sequential, optional
        seq_length or keras_model must be specified.
    num_tasks : int, optional
        number of tasks. Default: 1.
    num_filters : list[int] | tuple[int]
        number of convolutional filters in each layer. Default: (15,).
    conv_width : list[int] | tuple[int]
        width of each layer's convolutional filters. Default: (15,).
    pool_width : int
        width of max pooling after the last layer. Default: 35.
    L1 : float
        strength of L1 penalty.
    dropout : float
        dropout probability in every convolutional layer. Default: 0.
    verbose: int
        Verbosity level during training. Valida values: 0, 1, 2.

    Returns
    -------
    Compiled DNN model.
    """

    def __init__(self, seq_length=None, keras_model=None,
                 use_RNN=False, num_tasks=1,
                 num_filters=(15, 15, 15), conv_width=(15, 15, 15),
                 pool_width=35, GRU_size=35, TDD_size=15,
                 L1=0, dropout=0.0, num_epochs=100, verbose=1):
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import (
            Activation, Dense, Dropout, Flatten,
            Permute, Reshape)
        from tensorflow.keras.layers import TimeDistributed
        from tensorflow.keras.layers import Conv2D, MaxPooling2D
        from tensorflow.keras.layers import GRU
        from tensorflow.keras.regularizers import l1
        self.num_tasks = num_tasks
        self.num_epochs = num_epochs
        self.verbose = verbose
        self.train_metrics = []
        self.valid_metrics = []
        if keras_model is not None and seq_length is None:
            self.model = keras_model
            self.name = 'Sequential'
            self.num_tasks = keras_model.layers[-1].output_shape[-1]
        elif seq_length is not None and keras_model is None:
            self.model = Sequential()
            assert len(num_filters) == len(conv_width)
            for i, (nb_filter, nb_col) in enumerate(zip(num_filters, conv_width)):
                conv_height = 4 if i == 0 else 1
                # self.model.add(Convolution2D(
                #     nb_filter=nb_filter, nb_row=conv_height,
                #     nb_col=nb_col, activation='linear',
                #     init='he_normal', input_shape=(1, 4, seq_length),
                #     W_regularizer=l1(L1), b_regularizer=l1(L1)))
                
                self.model.add(Conv2D(filters=nb_filter, 
                        kernel_size=(conv_height, nb_col),
                        activation='linear', kernel_initializer='he_normal',
                        input_shape=(1, 4, seq_length),
                        kernel_regularizer=l1(L1), bias_regularizer=l1(L1),
                        data_format='channels_last'))
                self.model.add(Activation('relu'))
                self.model.add(Dropout(dropout))
            self.model.add(MaxPooling2D(pool_size=(1, pool_width),
                data_format='channels_first'))
            if use_RNN:
                num_max_pool_outputs = self.model.layers[-1].output_shape[-1]
                self.model.add(Reshape((num_filters[-1], num_max_pool_outputs)))
                self.model.add(Permute((2, 1)))
                self.model.add(GRU(GRU_size, return_sequences=True))
                self.model.add(TimeDistributed(TDD_size, activation='relu'))
            self.model.add(Flatten())
            self.model.add(Dense(units=self.num_tasks))
            # no activation layer, MSE loss
            self.model.compile(optimizer='adam', loss='mean_squared_error')
        else:
            raise ValueError("Exactly one of seq_length or keras_model must be specified!")

    def train(self, X, y, validation_data, early_stopping_metric='Loss',
              early_stopping_patience=5, save_best_model_to_prefix=None):

        if self.verbose >= 1:
            print('Training model (* indicates new best result)...')
        X_valid, y_valid = validation_data
        early_stopping_wait = 0
        best_metric = np.inf if early_stopping_metric == 'Loss' else -np.inf
        # self.model.fit(X, y, epochs=self.num_epochs+1, verbose=self.verbose >= 2)
        # score = self.test(X_valid, y_valid)
        # print('Test loss:', score[0])
        # print('Test accuracy:', score[1])
        for epoch in range(1, self.num_epochs + 1):
            self.model.fit(X, y, batch_size=128, epochs=1, verbose=0)
            epoch_train_metrics = self.model.evaluate(X, y, verbose=0)
            epoch_valid_metrics = self.model.evaluate(X_valid, y_valid, verbose=0)
            self.train_metrics.append(epoch_train_metrics)
            self.valid_metrics.append(epoch_valid_metrics)
            if self.verbose >= 1:
                print('Epoch {}:'.format(epoch))
                print('Train: ', epoch_train_metrics)
                print('Valid: ', epoch_valid_metrics)
            current_metric = epoch_valid_metrics
            if current_metric <= best_metric:
                if self.verbose >= 1:
                    print(' *')
                best_metric = current_metric
                best_epoch = epoch
                early_stopping_wait = 0
                if save_best_model_to_prefix is not None:
                    self.save(save_best_model_to_prefix)
            else:
                if early_stopping_wait >= early_stopping_patience:
                    break
                early_stopping_wait += 1

        if self.verbose >= 1:
            print('Finished training after {} epochs.'.format(epoch))
            if save_best_model_to_prefix is not None:
                print("The best model's architecture and weights (from epoch {0}) "
                      'were saved to {1}.arch.json and {1}.weights.h5'.format(
                    best_epoch, save_best_model_to_prefix))


    def predict(self, X):
        return self.model.predict(X)


    def score(self, X, y):
        predictions = np.squeeze(self.model.predict(X))
        return np.corrcoef(predictions, y)[0,1]

    def save(self, save_best_model_to_prefix):
        arch_fname = save_best_model_to_prefix + '.arch.json'
        weights_fname = save_best_model_to_prefix + '.weights.h5'
        open(arch_fname, 'w').write(self.model.to_json())
        self.model.save_weights(weights_fname, overwrite=True)

    @staticmethod
    def load(arch_fname, weights_fname=None):
        from keras.models import model_from_json
        model_json_string = open(arch_fname).read()
        sequence_dnn = SequenceDNN(keras_model=model_from_json(model_json_string))
        if weights_fname is not None:
            sequence_dnn.model.load_weights(weights_fname)
        return sequence_dnn

In [20]:
from __future__ import absolute_import, division, print_function
import numpy as np, sys
from abc import abstractmethod, ABCMeta


class HyperparameterBackend(object):
    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(self, grid):
        """
        Parameters
        ----------
        grid: dict
            Keys are hyperparameter names and values are either
            a single (min, max) tuple for single value parameters
            or a tuple of (min, max) tuples for tuple-valued parameters.
        """
        pass

    @abstractmethod
    def get_next_hyperparameters(self):
        pass

    @abstractmethod
    def record_result(self, hyperparam_dict, score):
        """
        Parameters
        ----------
        hyperparam_dict: dict
            hyperparameter names as keys and values as values.
        score: int or float
            The result, or metric value, of using the hyparameters.
        """
        pass


class RandomSearch(HyperparameterBackend):
    def __init__(self, grid):
        self.grid = grid

    def get_next_hyperparameters(self):
        return [np.random.uniform(start, end) for start, end in self.grid]

    def record_result(self, hyperparam_dict, score):
        pass  # Random search doesn't base its decisions on the results of previous trials




class HyperparameterSearcher(object):
    def __init__(self, model_class, fixed_hyperparameters, grid, X_train, y_train, validation_data,
                 maximize=True, backend=RandomSearch):
        self.model_class = model_class
        self.fixed_hyperparameters = fixed_hyperparameters
        self.grid = grid
        self.X_train = X_train
        self.y_train = y_train
        self.validation_data = validation_data
        self.maximize = maximize
        self.best_score = 0
        self.best_model = self.best_hyperparameters = None
        # Some hyperparameters have multiple elements, and we need backend to treat each of them
        # as a separate dimension, so unpack them here.
        backend_grid = [bounds for value in grid.values()
                        for bounds in (value if isinstance(value[0], (list, tuple, np.ndarray))
                                       else (value,))]
        self.backend = backend(backend_grid)

    def search(self, num_hyperparameter_trials):
        for trial in range(num_hyperparameter_trials):
     
            print("Trial ", trial)
            # Select next hyperparameters with MOE, rounding hyperparameters that are integers
            # and re-packing multi-element hyperparameters
            raw_hyperparameters = self.backend.get_next_hyperparameters()
            hyperparameters = {}
            i = 0
            for name, bounds in self.grid.items():
                if isinstance(bounds[0], (list, tuple, np.ndarray)):
                    # Multi-element hyperparameter
                    hyperparameters[name] = raw_hyperparameters[i : i + len(bounds)]
                    if isinstance(bounds[0][0], int):
                        hyperparameters[name] = np.rint(hyperparameters[name]).astype(int)
                    i += len(bounds)
                else:
                    hyperparameters[name] = raw_hyperparameters[i]
                    if isinstance(bounds[0], int):
                        hyperparameters[name] = int(round(hyperparameters[name]))
                    i += 1
            assert i == len(raw_hyperparameters)
            print(hyperparameters)
            # Try these hyperparameters
            model = self.model_class(**{key: value
                                        for dictionary in (hyperparameters, self.fixed_hyperparameters)
                                        for key, value in dictionary.items()})
            print(y_train.dtype)
            model.train(self.X_train, self.y_train, validation_data=self.validation_data)
            
            # task_scores = model.score(self.validation_data[0], self.validation_data[1], self.metric)
            # score = task_scores.mean()  # mean across tasks
            # score = np.corrcoef(np.squeeze(model.predict(X_valid)), y_valid)[0,1]
            score = model.score(self.validation_data[0], self.validation_data[1])
            print("Valid: correlation between predicted and observed:", score)

            # Record hyperparameters and validation loss
            self.backend.record_result(hyperparameters, score)
            # If these hyperparameters were the best so far, store this model
            if self.maximize == (score > self.best_score):
                self.best_score = score
                self.best_model = model
                self.best_hyperparameters = hyperparameters



In [21]:
def train_CNN(X_train, y_train, X_test, y_test, seq_length, num_layers, min_filter, max_filter, validation_fraction, num_hyperparameter_trials, prefix):
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=validation_fraction)
    print('Starting hyperparameter searchererereer...')
    print(X_test.dtype)
    min_layer = 1
    max_layer = 4
    min_conv_width = 15
    max_conv_width = 20
    min_dropout = 0.1
    max_dropout = 0.9
    
    fixed_hyperparameters = {'seq_length': seq_length, 'num_epochs': num_epochs}
    grid = {'num_filters': ((min_filter, max_filter),), 'pool_width': (5, 40),
            'conv_width': ((min_conv_width, max_conv_width),), 
            'dropout': (min_dropout, max_dropout)}
    
    # number of convolutional layers        
    print("Number of convolutional layers: ", num_layers)
    filters = tuple([(min_filter, max_filter)] * num_layers)
    conv_widths = tuple([(min_conv_width, max_conv_width)] * num_layers)
    grid.update({'num_filters': filters, 'conv_width': conv_widths})
    
    # Backend is RandomSearch; if using Python 2, can also specify MOESearch
    # (requires separate installation)
    searcher = HyperparameterSearcher(SequenceDNN, fixed_hyperparameters, grid, 
        X_train, y_train, validation_data=(X_valid, y_valid), backend=RandomSearch)
    searcher.search(num_hyperparameter_trials)
    print('Best hyperparameters: {}'.format(searcher.best_hyperparameters))
    model = searcher.best_model
    
    # print test results
    print('Test results: {}'.format(model.score(X_test, y_test)))
    
    # save model
    model.save(prefix)
    
    # print predictions
    predictions = model.predict(X_test)
    test_sequences = [line.split('\t')[0] for line in open(args.test)]
    
    with open(prefix + '_predictions.txt', 'w') as outfile:
        for i in range(len(predictions)):
            outfile.write(
                test_sequences[i] + '\t' + 
                str(float(predictions[i])) + '\t' + 
                str(float(y_test[i])) + '\n')

In [22]:
train_CNN(X_train, y_train, X_test, y_test, seq_length=150, num_layers=4, min_filter=5, max_filter=100, validation_fraction=0.2, num_hyperparameter_trials=100, prefix="results")

Starting hyperparameter searchererereer...
float32
Number of convolutional layers:  4
Trial  0
{'num_filters': array([94, 77, 77, 60]), 'pool_width': 14, 'conv_width': array([17, 19, 16, 19]), 'dropout': 0.1883397207778515}


ValueError: Negative dimension size caused by subtracting 4 from 1 for 'conv2d_4/Conv2D' (op: 'Conv2D') with input shapes: [?,1,4,150], [4,17,150,94].

In [20]:
from __future__ import absolute_import, division, print_function
import numpy as np
import os
import subprocess
import sys
import tempfile
# matplotlib.use('pdf')
# import matplotlib.pyplot as plt
from abc import abstractmethod, ABCMeta
from sklearn.tree import DecisionTreeClassifier as scikit_DecisionTree
from sklearn.ensemble import RandomForestRegressor

class Model(object):
    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(self, **hyperparameters):
        pass

    @abstractmethod
    def train(self, X, y, validation_data):
        pass

    @abstractmethod
    def predict(self, X):
        pass

    def test(self, X, y):
        return self.evaluate(X, y)
        # return ClassificationResult(y, self.predict(X))

    def score(self, X, y):
        pass



class DecisionTree(Model):

    def __init__(self):
        self.classifier = scikit_DecisionTree()

    def train(self, X, y, validation_data=None):
        self.classifier.fit(X, y)

    def predict(self, X):
        predictions = np.asarray(self.classifier.predict_proba(X))[..., 1]
        if len(predictions.shape) == 2:  # multitask
            predictions = predictions.T
        else:  # single-task
            predictions = np.expand_dims(predictions, 1)
        return predictions
class RandomForestRegression(DecisionTree):

    def __init__(self):
        self.regressor = RandomForestRegressor(n_estimators=100)

    def train(self, X, y, validation_data=None):
        # X shape: n_samples, n_features
        # y shape: n_samples
        self.regressor.fit(X, y)

    def predict(self, X):
        return self.regressor.predict(X)

    def score(self, X, y):
        predictions = np.squeeze(self.regressor.predict(X))
        return np.corrcoef(predictions, y)[0,1]

In [21]:
def one_hot_encode_modern(sequences):
    sequences_array = np.array([[char for char in seq] for seq in sequences])
    sequence_length = sequences_array.shape[1]
    num_samples = sequences_array.shape[0]

    defined_categories = ['A', 'C', 'G', 'T', 'N']
    ohe = OneHotEncoder(sparse_output=False,
                        categories=[defined_categories] * sequence_length,
                        dtype=np.float32) # Still keep this as float32

    return ohe.fit_transform(sequences_array)

def pad_sequence(seq, max_length):
	if len(seq) > max_length:
		diff = len(seq) - max_length
		trim_length = int(diff / 2)
		seq = seq[trim_length : -(trim_length + diff%2)]
	else:
		seq = seq.center(max_length, 'N')
	return seq

def process_seqs(df, seq_length):
	padded_seqs = [pad_sequence(x, seq_length) for x in df['variant']]
	X = one_hot_encode_modern(np.array(padded_seqs))
	y = np.array(df['expn_med_fitted_scaled'], dtype=np.float32)
	return X, y


def split_data_by_peak(data_df):
    np.random.seed(123)

    # 1. Create 'peak_name' column (if not already present)
    # Assuming peak_start and peak_end are numeric, convert to string for concatenation
    data_df['peak_name'] = data_df['peak_start'].astype(str) + 'to' + data_df['peak_end'].astype(str)

    # 2. Get unique peak names
    peak_names = data_df['peak_name'].unique()

    # 3. Randomly sample train peak names (90%)
    train_size = int(0.90 * len(peak_names))
    train_peak_names = np.random.choice(peak_names, size=train_size, replace=False)

    # 4. Determine test peak names
    test_peak_names = np.array([p for p in peak_names if p not in train_peak_names])


    data_train = data_df[data_df['peak_name'].isin(train_peak_names)].copy()
    data_test = data_df[data_df['peak_name'].isin(test_peak_names)].copy()

    return data_train, data_test

In [22]:

data_df = df5
train_df, test_df = split_data_by_peak(data_df.copy())

X_train, y_train = process_seqs(train_df, 150)
X_test, y_test = process_seqs(test_df, 150)



In [36]:
import pandas as pd
import numpy as np, random
np.random.seed(1)
random.seed(1)
from sklearn.model_selection import train_test_split  # sklearn >= 0.18
import sys
import argparse
import time

def train_RandomForest(X_train, X_test, y_train, y_test):
    print("Running random forest regression...")
    model = RandomForestRegression()
    model.train(X_train, y_train)
    predictions = model.predict(X_test)
    
    with open("outputs.txt", 'w') as outfile:
        for i in range(len(predictions)):
            outfile.write(str(float(predictions[i])) + '\t' +
                      str(float(y_test[i])) + '\n')
    
    score = model.score(X_test, y_test)
    print("Score:", score)
    return model, predictions

In [38]:
model, predictions = train_RandomForest(X_train, X_test, y_train, y_test)

Running random forest regression...
Score: 0.07955371559314629


In [41]:
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, predictions)
r_squared = r2_score(y_test, predictions)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Square Error (RMSE): {rmse}")
print(f"R^2: {r_squared}")

Mean Squared Error (MSE): 5.548276973597522
Root Mean Square Error (RMSE): 2.3554780775030624
R^2: -0.10019475191929983
