In [1]:
import time
from pybioseq_utils import DNASequence, RNASequence
from bio_files_processor import OpenFasta
from custom_random_forest import RandomForestClassifierCustom
from sklearn.datasets import make_classification

# biopyseq-utils 
Library for processing protein and nucleic acid sequences and for filtering FASTQ files.

### Usage
This section contains usage examples for some of the biopyseq-utils functions.

### `DNASequence` - a class that represents a DNA sequence

`complement` - returns the complementary DNA or RNA sequence.

In [2]:
# Build a DNA sequence, take its complement, and display the complement's `seq`.
dna = DNASequence("AGATACACA")
dna_complement = dna.complement()
dna_complement.seq

'TCTATGTGT'

`transcribe` - transcribes the given DNA sequence into RNA.

In [3]:
# Build a DNA sequence, transcribe it, and display the transcript's `seq`.
dna = DNASequence("AGATACACA")
rna_transcript = dna.transcribe()
rna_transcript.seq

'AGAUACACA'

### `RNASequence` - a class that represents an RNA sequence

In [4]:
# Same `complement` call, this time on an RNA sequence. The variable is named
# `rna` because it holds an RNASequence, not a DNASequence.
rna = RNASequence("AGAUACACA")
rna.complement().seq

'UCUAUGUGU'

# bio_files_processor
Library for processing FASTA files.


### Usage
This section contains usage examples for some of the bio_files_processor features.

### `OpenFasta` - context manager for reading FASTA files

In [5]:
# Open the example FASTA file through the context manager, read one record
# with `read_record()`, and print that record's `seq` attribute.
fasta_path = "example_data/example_fasta.fasta"
with OpenFasta(fasta_path) as fasta:
    record = fasta.read_record()
    print(record.seq)

ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG


# custom_random_forest
Script with a custom random forest classifier.


### Usage
This section shows how to use the custom random forest classifier with 1 and 2 threads and reports the time each run takes.

In [6]:
# Synthetic classification dataset. A fixed `random_state` makes the generated
# data identical on every Restart & Run All, so results are reproducible.
X, y = make_classification(n_samples=100000, random_state=42)

# Explicitly configured forest.
# NOTE(review): `random_forest` is not referenced by the timing cells visible
# below, which build their own default-configured instances — confirm whether
# this object is still needed or should be the one being timed.
random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10,
                                             max_features=2, random_state=42)

In [7]:
# Fit and predict with 1 thread (n_jobs=1).
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
# NOTE(review): this forest is built with default arguments, not the
# hyperparameters given to `random_forest` in the previous cell, and no
# random_state is passed here — confirm that is intended.
rf_single_thread = RandomForestClassifierCustom()
start_time_single_fit = time.perf_counter()
rf_single_thread.fit(X, y, n_jobs=1)
end_time_single_fit = time.perf_counter()

start_time_single_predict = time.perf_counter()
predictions_single_thread = rf_single_thread.predict(X, n_jobs=1)
end_time_single_predict = time.perf_counter()

In [8]:
# Fit and predict with 2 threads (n_jobs=2).
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
# NOTE(review): this forest is built with default arguments and no random_state
# is passed here — confirm that is intended when comparing its predictions
# against another run.
rf_multi_thread = RandomForestClassifierCustom()
start_time_multi_fit = time.perf_counter()
rf_multi_thread.fit(X, y, n_jobs=2)
end_time_multi_fit = time.perf_counter()

start_time_multi_predict = time.perf_counter()
predictions_multi_thread = rf_multi_thread.predict(X, n_jobs=2)
end_time_multi_predict = time.perf_counter()

In [9]:
# Report the elapsed time of each fit/predict run as (label, seconds) pairs,
# printed in the order: fit 1 thread, fit 2 threads, predict 1 thread, predict 2 threads.
timings = (
    ("Time for fit with 1 thread:", end_time_single_fit - start_time_single_fit),
    ("Time for fit with 2 threads:", end_time_multi_fit - start_time_multi_fit),
    ("Time for predict with 1 thread:", end_time_single_predict - start_time_single_predict),
    ("Time for predict with 2 threads:", end_time_multi_predict - start_time_multi_predict),
)
for label, elapsed in timings:
    print(label, elapsed)

Time for fit with 1 thread: 6.486485719680786
Time for fit with 2 threads: 3.9471328258514404
Time for predict with 1 thread: 0.5635766983032227
Time for predict with 2 threads: 0.301513671875


In [10]:
# Compare the two prediction vectors element by element and report whether
# every position agrees.
# NOTE(review): the saved output below is False. Both forests are constructed
# without an explicit random_state in the visible cells, so the mismatch may
# come from differing seeds rather than from n_jobs — confirm.
elementwise_equal = predictions_single_thread == predictions_multi_thread
predictions_match = elementwise_equal.all()
print("Predictions match:", predictions_match)

Predictions match: False
