In [None]:
import pyrokebabs as pb
import pandas as pd
import numpy as np
from Bio.Seq import Seq
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, LeaveOneOut, KFold
#!pip install Bio strkernel
from sklearn.svm import SVC
import time
from sklearn.model_selection import GroupKFold
from sklearn.svm import LinearSVC
import random

In [None]:
# Creates random DNA sequences

# SUBROUTINE
# Description: creates a random string from the given characters
# @Dependencies:
# random            -> import random
# @Functionparams:
# length            -> defines the final length of the sequence
# letters           -> the characters to draw from (defaults to the DNA bases 'CGTA')
# @Return           -> returns a random sequence of the chosen length
def createDNA(length, letters='CGTA'):
    """Return a random string of `length` characters taken from `letters`.

    Each position is picked independently with ``random.choice``, so the
    result is reproducible after ``random.seed(...)``. A `length` of zero
    (or less) produces an empty string because ``range(length)`` is empty.
    """
    # `letters` is a parameter rather than an unused local so that callers can
    # reuse this helper for other alphabets without touching the default.
    return ''.join(random.choice(letters) for _ in range(length))


#ROUTINE
# Description: creates `amount` random sequences with createDNA plus a matching label vector
# @Dependencies:
# numpy         -> import numpy as np
# createDNA     -> subroutine that builds a single random sequence
# @Functionparams:
# amount        -> the number of sequences that should be created
# seqlength     -> the length of every sequence
# @Routineparams:
# seq           -> holds the created sequences
# target        -> holds the corresponding target vector alternating between 1 and -1 every created sequence
# @Return       -> returns the created sequences and their corresponding target vector
def createDNAvec(amount,seqlength):
    """Return ``(seq, target)`` for `amount` random sequences of length `seqlength`.

    ``seq`` is a 1-D object array of strings; ``target`` is a float array of
    the same length holding 1, -1, 1, -1, ... (index 0 gets 1).
    """
    seq = np.zeros(amount, dtype=object)
    for i in range(amount):
        seq[i] = createDNA(seqlength)
    # Parity must be tested with `% 2`. The previous `i & 2` inspected bit 1 of
    # the index and therefore produced the pattern 1, 1, -1, -1, ... instead of
    # switching label on every sequence.
    target = np.where(np.arange(amount) % 2 == 0, 1.0, -1.0)
    return seq,target

In [None]:
# Reads a given CSV File and splits its content into a dictionary, in this case a table for a SVM problem

# ROUTINE
# Description: reads a CSV file and repackages selected columns as a dictionary
# @Dependencies:
# pandas            -> import pandas as pd
# numpy             -> import numpy as np
# @Functionparams:
# filepath          -> path (or file-like object) of the CSV that should be read
# sep               -> the separator used in the CSV
# @Routineparams:
# frame             -> the unprocessed table exactly as read from the CSV
# @Return           -> dictionary with the keys 'data', 'target', 'feature_names', 'target_names'
def readInput(filepath,sep):
    """Load a CSV and return a dict describing a two-class problem.

    'data' is the last column, 'target' is -1 where the first column equals
    '-' and 1 everywhere else, 'feature_names' is the label of the column at
    index 2, and 'target_names' is the fixed list ['pos', 'neg'].
    """
    frame = pd.read_csv(filepath, sep=sep)
    sequences = frame.iloc[:, -1]
    is_negative = frame.iloc[:, 0] == '-'
    labels = np.where(is_negative, -1, 1)
    return {
        'data': sequences,
        'target': labels,
        'feature_names': frame.columns[2],
        'target_names': ['pos', 'neg'],
    }

In [None]:
# ROUTINE
# Description: wraps every element of an iterable in a Bio.Seq.Seq object
# @Dependencies:
# Bio.Seq           -> from Bio.Seq import Seq
# @Functionparams:
# data              -> iterable of strings that should be converted to sequences
# @Return           -> returns a list with one Seq per input element, in input order
def transformToSeq(data):
    """Return ``[Seq(x) for x in data]`` as a new list; `data` itself is not modified."""
    return list(map(Seq, data))

In [None]:
# Data preprocessing
# DNAS is the (sequences, targets) tuple returned by createDNAvec for
# 10 random sequences of length 50.
# NOTE(review): DNAS is not used by any later cell visible here, and `random`
# is never seeded, so its contents change on every run.
DNAS = createDNAvec(10,50)
# Load the comma-separated promoter CSV into the dictionary built by readInput.
data = readInput('testdata/promoters.csv',",")
# Replace the raw sequence column with the list of Seq objects from transformToSeq.
data['data'] = transformToSeq(data['data'])


In [None]:
# Create the pyrokebabs `Pybabs` object from the preprocessed data dictionary.
# Every following cell calls methods on this `flo` instance.
# NOTE(review): 'dna' presumably selects the sequence alphabet — confirm against pyrokebabs.
flo = pb.Pybabs(data,'dna')

In [None]:
# Test unbiasedCV with a spectrum kernel (k=2, g=0).
# NOTE(review): judging by the cell titles, g=0 selects the spectrum kernel and
# g>0 the gappy kernel — confirm against pyrokebabs.
# spec_model is reused further down by the getWeights and getPredProfile cells.
spec_model = flo.unbiasedCV(k=2,g=0)
# Bare last expression so the notebook displays the returned object.
spec_model

In [None]:
# Test unbiasedCV with a gappy kernel (k=4, g=2).
# NOTE(review): g presumably is the number of allowed gaps — confirm against pyrokebabs.
gappy_model = flo.unbiasedCV(k=4,g=2)
# Display the returned object.
gappy_model

In [None]:
# Test the outercv, innercv and verbose arguments of unbiasedCV.
# NOTE(review): outercv/innercv are presumably the fold counts of the outer and
# inner cross-validation loops — confirm against pyrokebabs.
cv_model = flo.unbiasedCV(k=4,g=0,outercv=5,innercv=3,verbose=3)
# Display the returned object.
cv_model

In [None]:
# Test passing an explicit list of C values to unbiasedCV.
# NOTE(review): C=[1,2] is presumably the grid of SVM cost values to search — confirm.
c_model = flo.unbiasedCV(k=4,g=0,verbose=5, C=[1,2])
# Display the returned object.
c_model

In [None]:
# Test the svm argument of unbiasedCV.
# NOTE(review): the meaning of svm='Lib' is not visible here (possibly a
# LIBSVM/LIBLINEAR backend switch) — confirm against pyrokebabs.
svm_model = flo.unbiasedCV(k=4,g=0,verbose=2, C=[1,2],svm='Lib')
# Display the returned object.
svm_model

In [None]:
# Test kernopt=True together with the spectrum-kernel settings (k=4, g=0).
# NOTE(review): kernopt presumably enables a search over kernel parameters — confirm.
kernoptSpec_model = flo.unbiasedCV(k=4,g=0,verbose=2,kernopt=True)
# Display the returned object.
kernoptSpec_model

In [None]:
# Test kernopt=True together with the gappy-kernel settings (k=2, g=2) and an explicit C list.
kernoptGappy_model = flo.unbiasedCV(k=2,g=2,verbose=2, C=[1,2],kernopt=True)
# Display the returned object.
kernoptGappy_model

In [None]:
# Test the norm, sparse and gram flags of unbiasedCV with spectrum-kernel settings (k=2, g=0).
# NOTE(review): the exact effect of each flag is defined by pyrokebabs and is not visible here.
spec_modelparams = flo.unbiasedCV(k=2,g=0,norm=True,sparse=True,gram=True)
# Display the returned object.
spec_modelparams

In [None]:
# Test grouped cross-validation.
# `groups` is a hand-written list of group labels with values 0-5.
# NOTE(review): presumably one label per sample in data['data'] is required —
# verify the length against the dataset.
groups = [0,1,2,3,4,5,2,3,0,1,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1,2,3,4,5,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1,2,3,0,1,0,1,2,3,0,1]
# NOTE(review): the positional 4 and 0 are presumably k and g, as in unbiasedCV — confirm.
flo.groupedCV(4,0,groups,verbose=3)

In [None]:
# Test kernel creation with spectrum settings.
# NOTE(review): the positional 6 and 0 are presumably k and g — confirm.
# The result is bound to `kernel`, which the next cell overwrites.
kernel = flo.createKernel(6,0,sparse=False,gram=False,norm=True)

In [None]:
# Test kernel creation with gappy settings.
# NOTE(review): the positional 4 and 2 are presumably k and g — confirm.
# This rebinds `kernel`, discarding the value created by the previous cell.
kernel = flo.createKernel(4,2,sparse=True,gram=False,norm=True)

In [None]:
# Test direct SVM model creation; `model` is used by the getWeights and getPredProfile cells below.
# NOTE(review): the positional arguments are presumably k=4, g=0 and C=0.0125 — confirm against pyrokebabs.
model = flo.pybabsSVMtrain(4,0,0.0125)

In [None]:
# Test getWeights on the directly trained model and on the cross-validated spectrum model.
# getWeights returns two values. The second one is bound to `weight_dict`
# rather than `dict`, because assigning to `dict` would shadow the builtin for
# every cell executed afterwards in this kernel.
# NOTE(review): the first value is presumably a DataFrame and the second a dictionary — confirm.
df,weight_dict = flo.getWeights(model)
df1,weight_dict1 = flo.getWeights(spec_model)
# Display the first return value for `model`.
df

In [None]:
# Test getPredProfile on the directly trained model.
# NOTE(review): the meaning of the second argument (80) is not visible here —
# possibly a sample index; confirm against pyrokebabs.
flo.getPredProfile(model,80)

In [None]:
# Test getPredProfile on the cross-validated spectrum model, with the same second argument (80) as the previous cell.
flo.getPredProfile(spec_model,80)