## Import libraries and functions

In [None]:
# Standard-library and third-party dependencies
import sys
import numpy as np
import pandas as pd

# Add the project's script directory to the module search path so the
# modules imported below can be resolved (presumably train.py, predict.py,
# features.py and processing.py live there — TODO confirm).
# The path is relative, so it is resolved against the kernel's working directory.
sys.path.append('../Scripts')

# Wildcard imports: every public name of these modules is placed in the
# notebook namespace. The helpers called in later cells (extract_haeussler,
# rescale, train_model, predict_sequence, predict_file) presumably come from
# here — NOTE(review): confirm which module defines each one.
# If two modules export the same name, the later import wins, so the order
# of these lines is significant.
from train import *
from predict import *
from features import *
from processing import *

## Create Moreno-Mateos training dataset

In [2]:
# Read the original Haeussler spreadsheet
ds = pd.read_excel('../Data/1. Original datasets/Haeussler.xlsx')

# Extract the two sequence lists and the efficiency values for 'morenoMateos'
seq_23, seq_30, eff = extract_haeussler(ds, 'morenoMateos')

# Assemble the training table, then overwrite 'modFreq' with its rescaled values
moreno = pd.DataFrame(
    {
        '23-nt sequence': seq_23,
        '30-nt sequence': seq_30,
        'modFreq': eff,
    }
)
moreno['modFreq'] = rescale(moreno, 3)

# Write the training table to the working directory, leaving out the index column
moreno.to_csv('Moreno-Mateos.csv', index=False)

## Train

In [3]:
# Train a U6 Regressor (classifier=False) and keep the returned model in reg_model.
# save=True is forwarded to train_model.
# NOTE(review): what is saved, and where, is decided inside train_model — not visible here.
reg_model = train_model(promoter='u6', classifier=False, save=True)

# Train a T7 Classifier (classifier=True) with the same save setting.
# NOTE(review): the optimizer convergence message in this cell's output presumably
# comes from one of these two fits — confirm which.
cls_model = train_model(promoter='t7', classifier=True, save=True)

Optimization terminated successfully.
         Current function value: 0.323759
         Iterations 8


## Predict

### Predict the efficiency of a sequence

In [4]:
# Example input: a 30-character nucleotide sequence
seq = 'TTCACAGTCATACCTATAACAGCGAGGTTC'

# Score the same sequence with the U6 regression and the T7 classification setting
reg_pred = predict_sequence(seq, promoter='u6', classifier=False)
cls_pred = predict_sequence(seq, promoter='t7', classifier=True)

# Per the recorded output, the regression call yields a single float and the
# classification call yields a 2-tuple — presumably (probability, predicted class);
# TODO confirm against predict_sequence.
print(f'Prediction of U6 Regression model: {reg_pred}')
print(f'Prediction of T7 Classification model: {cls_pred}')

Prediction of U6 Regression model: 0.6896745020642117
Prediction of T7 Classification model: (0.0321939694071364, 0)


### Predict the efficiencies of multiple sequences from a file

In [5]:
# CSV file holding the sequences to score.
# NOTE(review): the table displayed below contains an 'Unnamed: 0' column, which
# usually means the CSV was written with its index included. The file name also
# ends in "predictions" while save=True is passed below — check whether
# predict_file writes back to this same path.
koike_path = '../Data/3. Testing datasets/1. Regression (U6)/3. mESC (Koike-Yusa) predictions.csv'

# Score every sequence in the file using the U6 regression setting
predictions = predict_file(koike_path, promoter='u6', classifier=False, save=True)

# Bare last expression so the notebook renders the returned table
predictions

Unnamed: 0,Sequence ID,30-nt sequence,Prediction
0,0,TTCACAGTCATACCTATAACAGCGAGGTTC,0.689675
1,1,TTGCAGGTATGTAAGTAATCATAGTGGAGA,0.596946
2,2,AGCCAGTCTACATAACACGCCCATGGGCGC,0.595669
3,3,CGTACGCAACCTGCTCAGCGCCTACGGCGA,0.291837
4,4,GCTGAGCAGGTTGCGTACGTGAAGGGGCCG,0.418130
...,...,...,...
1059,1059,ATTGTAGCATCAAAATTTGGGACCTGGCTC,0.343832
1060,1060,GCCGCCTGCTACGTCGTGCGCGGCGGGACC,0.315489
1061,1061,ACTGCTTTGACACTGCTTACGATTTGGAAG,0.089571
1062,1062,TACCCCGAGACCTCGACGGAGGCCTGGCCG,0.263017
