In [287]:
import numpy as np
import pandas as pd
import sklearn
import os
import itertools

In [288]:
source_file = "./data/combined_data.csv"

df = pd.read_csv(source_file)
df.head()

Unnamed: 0,seq,enrichment,class
0,GGGTGCAA,0.47122,0
1,CGGGTGCA,0.468569,0
2,ATTGCACC,0.434169,0
3,CCGGGTGC,0.433263,0
4,CGCACCCG,0.406588,0


In [289]:
# Hard labels for sequence to class

In [290]:
class_truth = df.groupby('seq')['class'].agg(lambda x: x.mode()[0]).to_dict()

## k-mer Nucleotide Frequency

In [291]:
def generate_kmers(k):
    base_pairs = list("ACGT")
    kmers = itertools.product(base_pairs, repeat=k)
    return kmers

# N = 4
for k in range(1, 5):
    kmers = generate_kmers(k)
    
    new = {}
    for kmer in kmers:
        kmer = "".join(kmer)
        new[f"count_{kmer}"] = df["seq"].apply(lambda x : x.count(kmer))
    
    df = pd.concat([df, pd.DataFrame(new)], axis=1)

## k-Spaced Nucleotide Pair Frequency

In [292]:
pairs = list(generate_kmers(2))

def count_kspaced(seq, pair, k):
    count = 0
    for i in range(len(seq) - 2 - k):
        if seq[i] == pair[0] and seq[i+k+1] == pair[1]:
            count += 1
    return count

#paper does k=0-4, but k=1-4 works without redudent feature
for k in range(1, 5):
    for p in pairs:
        df[f"count_{p[0] + k * '.' + p[1]}"] = df["seq"].apply(lambda x : count_kspaced(x, p, k))
        
#defragment
df = df.copy()

## Nucleotide Physicochemical Property

In [293]:
properties = ["h-bond", "func-group", "ring-struct"]
bps = ["AT", "AC", "AG"]

for p, bp in zip(properties, bps):
    for i in range(8):
        df[f"{p}_{i}"] = df["seq"].apply(lambda x : x[i] in bp)        

In [294]:
#defragment frame
df = df.copy()

## Pseudo k-Tuple Nucleotide Composition

In [295]:
def seq2int(seq):
    i = 0
    for c in list(seq):
        i *= 4
        i += list("ACGT").index(c)
    return i

def int2seq(num):
    seq = ""
    for i in range(8):
        c = num % 4
        num /= 4
        seq = list("ACGT")[int(c)] + seq
    return seq

In [296]:
#Write genes to fafsa file
file_path = "./data/8mers.fasta"
bp = list("ACGT")
with open(file_path, 'w') as file:
    for seq in df['seq']:
        file.write(f">{seq2int(seq)}\n")
        file.write(f"{seq}\n")

### Downloading mathfeatures

$ docker pull bio21061993/mathfeature:latest

$ docker run -it --name mathfeature-terminal bio21061993/mathfeature bash

$ git clone https://github.com/Bonidia/MathFeature.git MathFeature-Terminal

$ cd MathFeature-Terminal

$ conda activate mathfeature-terminal

### Creating PseKNC from fafsa file


$ python3.7 methods/PseKNC.py -i 8mers.fasta -o output5.csv -l 1 -x files/propNames-DNA-k2.txt -xp files/propValues-DNA-k2.txt -seq 1 -t 2 -k 2 -j 5 -w 1.0 -s 2

In [297]:
print(len(df))

14681


In [298]:
#Read and map files:
#PseKNC as created by MathFeatures with lambda = 5, weight = 1
PseKNC = pd.read_csv("./data/output5.csv")
PseKNC = PseKNC.drop(columns='label')
PseKNC["seq"] = PseKNC["nameseq"].apply(lambda x : int2seq(int(x)))
PseKNC = PseKNC.drop(columns='nameseq')
PseKNC = PseKNC.rename(columns = (lambda x : "pseknc-5-" + x[7:] if "pseknc-" in x else x))

df = pd.merge(df, PseKNC, on='seq', how='inner')

In [299]:
#Read and map files:
#PseKNC as created by MathFeatures with lambda = 4, weight = 1
PseKNC = pd.read_csv("./data/output4.csv")
PseKNC = PseKNC.drop(columns='label')
PseKNC["seq"] = PseKNC["nameseq"].apply(lambda x : int2seq(int(x)))
PseKNC = PseKNC.drop(columns='nameseq')
PseKNC = PseKNC.rename(columns = (lambda x : "pseknc-4-" + x[7:] if "pseknc-" in x else x))

df = pd.merge(df, PseKNC, on='seq', how='inner')

In [300]:
print(len(df))

14681


## Electron−Ion Interaction Pseudopotentials of Trinucleotide.

## Train Test split

In [301]:
from sklearn.model_selection import train_test_split


X = df.copy()
y = X["class"]
X = X.drop(columns=["class", "enrichment"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [302]:
X_train.columns

Index(['seq', 'count_A', 'count_C', 'count_G', 'count_T', 'count_AA',
       'count_AC', 'count_AG', 'count_AT', 'count_CA',
       ...
       'pseknc-4-10', 'pseknc-4-11', 'pseknc-4-12', 'pseknc-4-13',
       'pseknc-4-14', 'pseknc-4-15', 'pseknc-4-16', 'pseknc-4-17',
       'pseknc-4-18', 'pseknc-4-19'],
      dtype='object', length=470)

## Feature selection
Don't forget to do train test split first

In [303]:
#Paper uses XGBoost, ANOVA, Chi2, and LASSO
# from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance

In [304]:
X_train_numeric = X_train.drop(columns="seq")

In [305]:
#Feature selection from gradient boosting
clf = HistGradientBoostingClassifier().fit(X_train_numeric, y_train)

In [306]:
#Permutation-based feature selection, direct impurity based selection is misleading when many columns
result = permutation_importance(clf, X_train_numeric, y_train, n_repeats=20, random_state=42)

In [307]:
support = result.importances_mean > np.mean(result.importances_mean)
print(support)

[False False False False False False False False False False  True False
  True False False False False False False False False False False  True
 False False False False False False False False False  True False  True
 False False False False False False False False False False False False
 False False False False  True False False  True False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False  True False False
 False False False False  True False False  True False False False False
 False False False False  True False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False  True False False False False False
 False False False False False  True False False  True False False False
 False  True False False False False False  True False False  True False
 False False False False False False False  True  T

In [312]:
print(X_train_numeric.columns[support])

Index(['count_CG', 'count_GA', 'count_AAT', 'count_ATC', 'count_ATT',
       'count_GAA', 'count_GAT', 'count_TCC', 'count_TTC', 'count_AACA',
       'count_AACT', 'count_ACAA', 'count_ACGC', 'count_AGGG', 'count_ATCC',
       'count_ATGA', 'count_ATTC', 'count_CAAT', 'count_CACG', 'count_CATT',
       'count_CCAA', 'count_CCAC', 'count_CCCC', 'count_CCCT', 'count_CCTA',
       'count_CGCA', 'count_CGCG', 'count_CGGA', 'count_CGTG', 'count_GAAC',
       'count_GAAT', 'count_GATA', 'count_GCGC', 'count_GGCC', 'count_GGGC',
       'count_GTAA', 'count_GTCA', 'count_GTTC', 'count_TCCG', 'count_TCGA',
       'count_TGAC', 'count_TGCA', 'count_TGCG', 'count_TTCC', 'count_TTGT',
       'count_TTTA', 'count_T.A', 'count_A..G', 'count_C..G', 'pseknc-5-5',
       'pseknc-5-7', 'pseknc-5-10', 'pseknc-5-13', 'pseknc-5-16',
       'pseknc-5-17', 'pseknc-4-5', 'pseknc-4-7', 'pseknc-4-8', 'pseknc-4-13',
       'pseknc-4-16', 'pseknc-4-17', 'pseknc-4-18'],
      dtype='object')


In [308]:
print("Number features before:", len(support))
print("Number features before:", sum(support))

Number features before: 469
Number features before: 62


In [309]:
print(X_train_numeric.columns[support])

Index(['count_CG', 'count_GA', 'count_AAT', 'count_ATC', 'count_ATT',
       'count_GAA', 'count_GAT', 'count_TCC', 'count_TTC', 'count_AACA',
       'count_AACT', 'count_ACAA', 'count_ACGC', 'count_AGGG', 'count_ATCC',
       'count_ATGA', 'count_ATTC', 'count_CAAT', 'count_CACG', 'count_CATT',
       'count_CCAA', 'count_CCAC', 'count_CCCC', 'count_CCCT', 'count_CCTA',
       'count_CGCA', 'count_CGCG', 'count_CGGA', 'count_CGTG', 'count_GAAC',
       'count_GAAT', 'count_GATA', 'count_GCGC', 'count_GGCC', 'count_GGGC',
       'count_GTAA', 'count_GTCA', 'count_GTTC', 'count_TCCG', 'count_TCGA',
       'count_TGAC', 'count_TGCA', 'count_TGCG', 'count_TTCC', 'count_TTGT',
       'count_TTTA', 'count_T.A', 'count_A..G', 'count_C..G', 'pseknc-5-5',
       'pseknc-5-7', 'pseknc-5-10', 'pseknc-5-13', 'pseknc-5-16',
       'pseknc-5-17', 'pseknc-4-5', 'pseknc-4-7', 'pseknc-4-8', 'pseknc-4-13',
       'pseknc-4-16', 'pseknc-4-17', 'pseknc-4-18'],
      dtype='object')


In [310]:
X_train = X_train[["seq"] + list(X_train_numeric.columns[support])]
X_test = X_test[["seq"] + list(X_train_numeric.columns[support])]

### Saving to file

In [311]:
#Write genes to fafsa file
file_path = "./features/"
X_train.to_csv(f"{file_path}X_train.csv")
X_test.to_csv(f"{file_path}X_test.csv")
y_train.to_csv(f"{file_path}y_train.csv")
y_test.to_csv(f"{file_path}y_test.csv")