# RandomForestClassifierCustom

In [22]:
from sklearn.datasets import make_classification
from custom_random_forest import RandomForestClassifierCustom
from bio_files_processor import MeasureTime


# Seed the synthetic dataset so timings and the prediction-equality check are
# reproducible after a kernel restart (the original call was unseeded).
X, y = make_classification(n_samples=100000, random_state=42)

# Both forests share one hyperparameter set, so any difference observed later
# comes only from how they are fitted.
forest_params = dict(max_depth=30, n_estimators=10, max_features=2, random_state=42)
random_forest_1_job = RandomForestClassifierCustom(**forest_params)
random_forest_2_job = RandomForestClassifierCustom(**forest_params)

In [23]:
# Fit each forest inside MeasureTime; only the n_jobs argument differs
# between the two runs.
for banner, forest, jobs in (
    ("RF fitting time with one job:", random_forest_1_job, 1),
    ("\nRF fitting time with two jobs:", random_forest_2_job, 2),
):
    print(banner)
    with MeasureTime():
        forest.fit(X, y, n_jobs=jobs)

RF fitting time with one job:
The program block took 8.314451694488525 seconds to complete

RF fitting time with two jobs:
The program block took 6.0656726360321045 seconds to complete


In [24]:
# Time predict() on the training matrix for each fitted forest.
# NOTE(review): predict() is called without an n_jobs argument here, so the
# "one job" / "two jobs" banners presumably rely on a job count remembered
# from fit() - confirm against RandomForestClassifierCustom.
predictions = []
for banner, forest in (
    ("RF predict time with one job:", random_forest_1_job),
    ("\nRF predict time with two jobs:", random_forest_2_job),
):
    print(banner)
    with MeasureTime():
        predictions.append(forest.predict(X))
y_pred_1_job, y_pred_2_job = predictions

RF predict time with one job:
The program block took 2.8008508682250977 seconds to complete

RF predict time with two jobs:
The program block took 0.5011236667633057 seconds to complete


In [25]:
# Both forests should yield identical predictions. Reduce the element-wise
# comparison with the array's own .all() instead of the builtin all(), which
# walks the 100000 elements one by one in Python.
# Assumes predict() returns NumPy arrays (or another type exposing .all()) -
# TODO confirm. bool() keeps the displayed result a plain True/False.
bool((y_pred_1_job == y_pred_2_job).all())

True

# OpenFasta

In [26]:
from bio_files_processor import OpenFasta

In [27]:
# Pull records one call at a time: five consecutive read_record() calls,
# each result printed as returned.
with OpenFasta("data/example_fasta.fasta") as fasta_file:
    for _attempt in range(5):
        print(fasta_file.read_record())

ID: >GTD323452,
 Description: 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+),
 Sequence:ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG

ID: >GTD678345,
 Description: 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+),
 Sequence:TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT

ID: >GTD174893,
 Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+),
 Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT

ID: >GTD906783,
 Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-),
 Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAA

In [28]:
# The opened OpenFasta object is iterated directly; every item it yields
# is printed in order.
with OpenFasta("data/example_fasta.fasta") as fasta_file:
    for fasta_record in fasta_file:
        print(fasta_record)

ID: >GTD323452,
 Description: 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+),
 Sequence:ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG

ID: >GTD678345,
 Description: 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+),
 Sequence:TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT

ID: >GTD174893,
 Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+),
 Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT

ID: >GTD906783,
 Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-),
 Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAA

In [29]:
# Collect everything read_records() returns in one call; the bare name on
# the last line makes the notebook display the result.
with OpenFasta("data/example_fasta.fasta") as fasta_file:
    total_fasta = fasta_file.read_records()
total_fasta

[ID: >GTD323452,
  Description: 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+),
  Sequence:ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG,
 ID: >GTD678345,
  Description: 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+),
  Sequence:TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT,
 ID: >GTD174893,
  Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+),
  Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT,
 ID: >GTD906783,
  Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-),
  Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAAC

In [30]:
# Show which class the first element returned by read_records() belongs to.
first_record = total_fasta[0]
type(first_record)

bio_files_processor.FastaRecord

# run_genscan

In [31]:
from bioinf_tools import run_genscan
import pandas as pd

In [32]:
file = "data/example_gene_for_genscan.fna"

# Call the run_genscan helper on the example sequence file.
output = run_genscan(sequence_file=file)

print("Status:", output.status, "\n")
print("Predicted CDS List:")
# Guard the first-CDS lookup: with an empty cds_list, indexing [0] would raise
# IndexError and abort the cell before the exon/intron tables are printed.
if output.cds_list:
    first_cds = output.cds_list[0]
    print(first_cds["peptide"], ":", first_cds["sequence"])
else:
    print("No CDS predicted")

# exon_list / intron_list are handed straight to the DataFrame constructor
# so they print as tables.
exon_df = pd.DataFrame(output.exon_list)
print("\nExon List:")
print(exon_df)

intron_df = pd.DataFrame(output.intron_list)
print("\nIntron List:")
print(intron_df)

Status: 200 

Predicted CDS List:
['GENSCAN_predicted_peptide_1', '681_aa'] : MRFVKQYCYGGAAEKGRGFEREQKKMPAVRNPCRIHETLPIAAGGHFALELTTPEQGRDSPDAGRLFCPFGDTSPPLPGPASLKGSPCSCLDAGFFSGSGKPALTPLPRPPSGVLAPEMRRNCEERGSGRFQNSCYPWWGGSGGGIAAGSLAQLHLRIECEGRWRKTLCLGFWQIVFLTATSRGFLRAPGPISIPLPLRGRLPGFALRAPGGAGARRAPSRWFTKCVSEIAGDCPKGQPPATMPLNVSFTNRNYDLDYDSVQPYFYCDEEENFYQQQQQSELQPPAPSEDIWKKFELLPTPPLSPSRRSGLCSPSYVAVTPFSLRGDNDGGGGSFSTADQLEMVTELLGGDMVNQSFICDPDDETFIKNIIIQDCMWSGFSAAAKLVSEKLASYQAARKDSGSPNPARGHSVCSTSSLYLQDLSAAASECIDPSVVFPYPLNDSSSPKSCASQDSSAFSPSSDSLLSSTESSPQGSPEPLVLHEETPPTTSSDSEEEQEDEEEIDVVSVEKRQAPGKRSESGSPSAGGHSKPPHSPLVLKRCHVSTHQHNYAAPPSTRKDYPAAKRVKLDSVRVLRQISNNRKCTSPRSSDTEENVKRRTHNVLERQRRNELKRSFFALRDQIPELENNEKAPKVVILKKATAYILSVQAEEQKLISEEDLLRKRREQLKHKLEQLRNSCA

Exon List:
  number  type  start   end
0   1.01  Init    366   443
1   1.02  Intr   1002  1230
2   1.03  Intr   1976  2156
3   1.04  Intr   2461  2683
4   1.05  Intr   2855  3626
5   1.06  Term   5003  5565
6   1.07  PlyA   5860  5865

Int

# AminoAcidSequence / DNASequence / RNASequence

In [33]:
from bioinf_tools import AminoAcidSequence, DNASequence, RNASequence

In [34]:
# Build a short RNA sequence and display what complement() returns for it.
rna_seq = RNASequence("AUGC")
rna_complement = rna_seq.complement()
rna_complement

NucleicAcid(sequence=UACG)

In [35]:
# Check the sequence against the class's alphabet test, then display the
# result of calculate_aa_freq() as the cell output.
aa_seq = AminoAcidSequence("CASSQDTEVFF")
alphabet_ok = aa_seq.is_valid_alphabet()
print(alphabet_ok)
aa_seq.calculate_aa_freq()

True


{'C': 1, 'A': 1, 'S': 2, 'Q': 1, 'D': 1, 'T': 1, 'E': 1, 'V': 1, 'F': 2}

In [36]:
# Transcribe once and reuse the result for both the printed sequence and the
# type check; the original called transcribe() twice for the same value.
# Assumes transcribe() has no side effects - TODO confirm.
dna_seq = DNASequence("ATGCCGTA")
transcribed_seq = dna_seq.transcribe()
print(transcribed_seq)

type(transcribed_seq)

AUGCCGUA


bioinf_tools.RNASequence

In [37]:
# NOTE(review): this cell repeats the DNA -> RNA transcription demo from the
# preceding cell; consider deleting one of the two.
# Transcribe once and reuse the result for both the printed sequence and the
# type check, instead of calling transcribe() twice.
# Assumes transcribe() has no side effects - TODO confirm.
dna_seq = DNASequence("ATGCCGTA")
transcribed_seq = dna_seq.transcribe()
print(transcribed_seq)

type(transcribed_seq)

AUGCCGUA


bioinf_tools.RNASequence