## Import libraries and functions

In [None]:
import sys
import json
import glob
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression

sys.path.append('../Scripts')

In [None]:
from analysis import *
from features import *
from processing import *
from feature_selection import *

# Scoring (Spearman correlation): wrap the `Spearman` function in a
# scikit-learn scorer object so it can be passed as `scoring=` to the
# selection helpers and to `cross_val_score` further down.
# NOTE(review): `Spearman` is not defined in this notebook; presumably it is
# provided by one of the star-imported project modules above -- confirm which.
spearman = make_scorer(Spearman)

## Load data

In [None]:
# Available training sets; the entry at index 0 is the one loaded below.
datasets = [
    '../Data/2. Training datasets/DeepSpCas9 (Library).csv',
    '../Data/2. Training datasets/Moreno-Mateos.csv',
]

# Dataset used for training / feature extraction
ds = pd.read_csv(datasets[0])

# Inputs (X) and targets (Y_train) are taken directly from the CSV columns
X, Y_train = ds['30-nt sequence'], ds['modFreq']

# Initial features, followed by the final features built for the chosen
# promoter (promoter='u6'), which are kept for comparison
X_train = feature_train(X)
X_seq = seq_train(X, promoter='u6')

print(X_train.shape, X_seq.shape)

## Feature selection

### LASSO coefficients

Keep the features with non-zero coefficients.

In [None]:
# Seeded LassoCV estimator handed to the feature-importance helper
model = LassoCV(random_state=42)
# Value forwarded to the helper as its `threshold` argument
thresh = 1e-06

X_lasso = feature_importance(
    model,
    X_train,
    Y_train,
    threshold=thresh,
    save=True,
)

### RFECV

Find the optimal number of features using recursive feature elimination with cross-validation (RFECV), starting from the LASSO-selected features.

In [None]:
# Seeded, shuffled 5-fold split and the LassoCV estimator used for RFECV
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
model = LassoCV(random_state=42)

X_lasso_red = rfecv_selection(
    model,
    X_lasso,
    Y_train,
    cv=k_fold,
    scoring=spearman,
    figure=True,
    save=True,
)

### Genetic Algorithm search

In [None]:
# Seeded, shuffled 5-fold split and the RidgeCV estimator used by the GA search
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
model = RidgeCV()

X_ga = ga_selection(
    model,
    X_lasso,
    Y_train,
    cv=k_fold,
    scoring=spearman,
    max_features=10,
    save=True,
)

## Evaluate with 10-fold Cross Validation

In [None]:
# CV strategy shared by every candidate: seeded, shuffled 10-fold split
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# (label, estimator) pairs; two-letter labels carry a leading space so that
# all labels have the same printed width
models = [
    ('XGB', XGBRegressor(objective='reg:squarederror', random_state=42)),
    (' LR', LinearRegression()),
    (' DT', DecisionTreeRegressor(random_state=42)),
    ('SVR', SVR(gamma='auto')),
    ('RFR', RandomForestRegressor(n_estimators=100, random_state=42)),
]

# Cross-validate each candidate on X_seq with the Spearman scorer and print
# the mean score followed by the standard deviation across folds
for name, model in models:
    cv_results = cross_val_score(
        model, X_seq, Y_train, cv=kfold, scoring=spearman
    )
    print("%s: %.3f (%f)" % (name, cv_results.mean(), cv_results.std()))