In [1]:
import sys
import sklearn
import numpy as np
import pandas as pd
from sklearn import model_selection

# Location of the UCI "promoter gene sequences" data file that get_data() reads
# with pandas.read_csv.
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'


def get_data(test_size=0.5, random_state=None):
    """Load the promoter sequences and return a one-hot encoded train/test split.

    The CSV at ``URL`` is read with the columns ``Class``, ``id`` and
    ``Sequence``. Every sequence is stripped of tab characters and split into
    single nucleotides; each position is then one-hot encoded. The label is 1
    for class ``'+'`` and 0 otherwise.

    Parameters
    ----------
    test_size : float or int, default 0.5
        Forwarded to ``model_selection.train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``model_selection.train_test_split``. Pass an int to get
        the same split on every run; ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    X_train, X_test : list of (ndarray, ndarray) tuples
        Each example is ``(features, edges)`` where ``features`` is a
        ``(1, n_features)`` uint8 array and ``edges`` is an empty ``(0, 2)``
        integer array.
    y_train, y_test : ndarray of shape (n_samples, 1)
        uint8 labels, one per example.

    Raises
    ------
    ValueError
        If the sequences do not all contain the same number of nucleotides.
    """
    names = ['Class', 'id', 'Sequence']
    data = pd.read_csv(URL, names=names)

    # The raw sequences contain tab characters; drop them and split what is
    # left into one list entry per nucleotide.
    nucleotide_rows = [list(seq.replace('\t', '')) for seq in data['Sequence']]

    # Ragged rows would be silently padded with missing values by DataFrame(),
    # so reject them explicitly.
    if len({len(row) for row in nucleotide_rows}) > 1:
        raise ValueError('all sequences must have the same number of nucleotides')

    # One row per sequence, one column per nucleotide position.
    nucleotides = pd.DataFrame(nucleotide_rows)

    # One-hot encode every position. The explicit uint8 cast keeps the dtype
    # stable across pandas versions (get_dummies returns bool on pandas >= 2.0).
    X = np.asarray(pd.get_dummies(nucleotides), dtype=np.uint8)

    # Binary target: 1 for '+', 0 for everything else. Building it directly
    # from the Class column keeps the label out of the encoded features.
    y = np.asarray(data['Class'] == '+', dtype=np.uint8)

    # Make each example into a tuple of a single feature vector and an empty
    # edge list. The builtin int replaces np.int, which NumPy 1.24 removed.
    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    Y = y.reshape(-1, 1)

    # Split data into training and testing datasets.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X_, Y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [2]:
# Load the promoter data and split it into train/test sets using get_data's
# default test_size of 0.5.
X_train, X_test, y_train, y_test = get_data()

In [33]:
# Preview only the first two examples instead of dumping the whole list;
# each example is a (feature vector, empty edge list) tuple.
X_train[:2]

[(array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
          0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
          0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
          0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
          1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
          0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
          0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
          1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
          0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
          0, 0, 1, 0, 1, 0, 0, 0]], dtype=uint8),
  array([], shape=(0L, 2L), dtype=int32)),
 (array([[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
          0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1

### Train and test an SVC model


In [4]:
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report, accuracy_score

# classifier = SVC()

# classifier.fit(X_train, y_train)
# predictions = classifier.predict(X_test)

# print("Test accuracy: {}".format(accuracy_score(y_test, predictions)))
# print('\n\n')
# print(classification_report(y_test, predictions))


### Use the latent structured SVM from PyStruct

In [48]:
from pystruct.models import LatentGraphCRF
from pystruct.learners import OneSlackSSVM, LatentSSVM

# Latent CRF over the examples built by get_data(): each example is a single
# feature vector with an empty edge list, so 'unary' inference is used.
latent_pbl = LatentGraphCRF(
    inference_method='unary',
    n_states_per_label=5,
)

# One-slack structural SVM that is handed to LatentSSVM as its base learner.
base_ssvm = OneSlackSSVM(
    latent_pbl,
    tol=.01,
    C=1,
    inactive_threshold=1e-3,
)

# NOTE(review): latent_iter presumably bounds the number of latent-variable
# refinement rounds -- confirm against the PyStruct documentation.
latent_svm = LatentSSVM(latent_iter=2, base_ssvm=base_ssvm)

latent_svm.fit(X_train, y_train)

In [49]:
# Score the fitted latent SSVM on both splits, then report with two decimals.
train_score = latent_svm.score(X_train, y_train)
test_score = latent_svm.score(X_test, y_test)

print("Train: {:2.2f}".format(train_score))
print("Test: {:2.2f}".format(test_score))

Train: 1.00
Test: 0.85


### Example of LatentSSVM

In [7]:
# """
# ================================================
# Latent SVM for odd vs. even digit classification
# ================================================
# A Latent CRF with one node is the same as a latent multiclass SVM
# Using the latent variables, we can learn non-linear models. This is the
# same as a simple Latent SVM model. It would obviously be more efficient
# to implement a special case for Latent SVMs so we don't have to run an
# inference procedure.
# This example uses the scikit-learn digit classification dataset, but poses
# the problem as a binary one, discriminating between even and odd digits.
# """

# import numpy as np
# import matplotlib.pyplot as plt

# try:
#     from sklearn.model_selection import train_test_split
# except ImportError:
#     from sklearn.cross_validation import train_test_split
# from sklearn.datasets import load_digits

# from pystruct.models import GraphCRF, LatentGraphCRF
# from pystruct.learners import NSlackSSVM, LatentSSVM

# # Load the scikit-learn digits classification dataset.
# digits = load_digits()
# X, y_org = digits.data, digits.target
# X /= X.max()

# # Make binary task by doing odd vs even numbers.
# y = y_org % 2

# # Make each example into a tuple of a single feature vector and an empty edge
# # list
# X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=np.int)) for x in X]
# Y = y.reshape(-1, 1)

# X_train_, X_test_, y_train, y_test =train_test_split(X_, Y, test_size=.5)

# # First, perform the equivalent of the usual SVM.  This is represented as
# # a CRF problem with no edges.

# pbl = GraphCRF(inference_method='unary')
# # We use batch_size=-1 as a binary problem can be solved in one go.
# svm = NSlackSSVM(pbl, C=1, batch_size=-1)

# svm.fit(X_train_, y_train)

# # Now, use a latent-variable CRF model with SVM training.
# # 5 states per label is enough capacity to encode the 5 digit classes.

# latent_pbl = LatentGraphCRF(n_states_per_label=5,
#                             inference_method='unary')
# base_ssvm = NSlackSSVM(latent_pbl, C=1, tol=.01,
#                        inactive_threshold=1e-3, batch_size=10)
# latent_svm = LatentSSVM(base_ssvm=base_ssvm, latent_iter=2)
# latent_svm.fit(X_train_, y_train)

# print("Score with binary SVM:")
# print("Train: {:2.2f}".format(svm.score(X_train_, y_train)))
# print("Test: {:2.2f}".format(svm.score(X_test_, y_test)))

# print("Score with latent SVM:")
# print("Train: {:2.2f}".format(latent_svm.score(X_train_, y_train)))
# print("Test: {:2.2f}".format(latent_svm.score(X_test_, y_test)))

# h_pred = np.hstack(latent_svm.predict_latent(X_test_))
# print("Latent class counts: %s" % repr(np.bincount(h_pred)))

