In [24]:
from collections import defaultdict
import random

# Seed the global RNG so the random.random() draws in StochasticModel.predict are repeatable.
random.seed(42)

class StochasticModel:
    """Baseline that predicts labels by sampling from training label frequencies.

    ``fit`` measures how often each (upper-cased) label character occurs across
    all training label strings. ``predict`` then draws one label per residue
    from that distribution; the residue itself is never inspected.

    Sampling uses the module-level ``random`` generator, so results are only
    repeatable when the caller seeds it beforehand.
    """

    def __init__(self):
        # Maps label character -> relative frequency; None until fit() is called.
        self.probs = None

    def fit(self, y):
        """Estimate label frequencies from an iterable of label strings.

        Args:
            y: Iterable of strings (e.g. a pandas Series). Every character is
                one label and is upper-cased before being counted.
        """
        frequencies = defaultdict(int)
        total_labels = 0
        for label_string in y:
            total_labels += len(label_string)
            for label in label_string:
                frequencies[label.upper()] += 1
        # If no label was seen, ``frequencies`` is empty and nothing is divided.
        for label in frequencies:
            frequencies[label] /= total_labels
        # Assign only once the table is complete so a failure mid-way cannot
        # leave a half-normalised distribution on the instance.
        self.probs = frequencies

    def predict(self, X):
        """Sample one label per residue for every sequence in ``X``.

        Args:
            X: Iterable of sequences; only the length of each sequence matters.

        Returns:
            A list with one string per input sequence. Each string has the same
            length as its input (or is empty if the model was fit on no labels).

        Raises:
            ValueError: If the model has not been fit.
        """
        if self.probs is None:
            raise ValueError("Model must be fit before evaluation.")

        # Build the cumulative distribution once instead of once per residue.
        labels = list(self.probs)
        thresholds = []
        running = 0
        for label in labels:
            running += self.probs[label]
            thresholds.append(running)

        predictions = []
        for sequence in X:
            sampled = []
            for _ in sequence:
                draw = random.random()
                for label, threshold in zip(labels, thresholds):
                    if draw < threshold:
                        sampled.append(label)
                        break
                else:
                    # Float rounding can leave the last threshold slightly below
                    # 1.0; a draw above it would otherwise emit nothing and make
                    # the prediction shorter than the input. Use the last label.
                    if labels:
                        sampled.append(labels[-1])
            predictions.append("".join(sampled))
        return predictions

    def evaluate(self, X, y):
        """Print and return the per-residue accuracy of sampled predictions.

        Args:
            X: Iterable of input sequences, aligned by position with ``y``.
            y: True label strings. Either a mapping-like object exposing
                ``items()`` (pandas Series, dict), whose values are used in
                iteration order, or a plain iterable of strings.

        Returns:
            Fraction of label positions predicted correctly, or 0.0 when ``y``
            contains no labels at all.

        Raises:
            ValueError: If the model has not been fit.
            IndexError: If ``y`` holds more sequences than ``X``.
        """
        if self.probs is None:
            raise ValueError("Model must be fit before evaluation.")
        predictions = self.predict(X)

        if hasattr(y, "items"):
            truths = [value for _, value in y.items()]
        else:
            truths = list(y)

        correct = 0
        total = 0
        for position, truth in enumerate(truths):
            predicted = predictions[position]
            total += len(truth)
            # fit() upper-cases labels, so compare on the same footing. zip()
            # stops at the shorter string; unmatched truth positions still
            # count towards ``total`` and therefore as misses.
            correct += sum(
                1 for guess, label in zip(predicted, truth) if guess == label.upper()
            )

        # Avoid ZeroDivisionError when there is nothing to score.
        accuracy = correct / total if total else 0.0
        print(f"Accuracy of the model is {accuracy}")
        return accuracy


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/data.csv")


# Target labels come from the 'dssp8' column, model inputs from the 'input' column.
y = df['dssp8']
X_original = df['input']
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_original, y, test_size=0.2, random_state=42)

# Earlier experiment with a different model, left disabled:
# from models.simple_window_model import TreeWindowModel
# model = TreeWindowModel(window_length=17)
model = StochasticModel()
# model.fit(X_train, y_train)
# StochasticModel.fit takes only the label strings, so X_train is not passed.
model.fit(y_train)

print(y_test)
# evaluate() already prints the accuracy and also returns it, so the value shows up twice in the output.
print(model.evaluate(X_test, y_test))


17805    CEEEEEEEECHHHHTTSSCCCHHHHHHHHTTTBSEECCEEEEECSC...
8778     CEEEEECSSCCSCCCCCCSEEEEETHHHHHHHHTTCCCSEEEECCT...
2549     CCHHHHHHHHHHHHTCCHHHHHHHHHHHHHHHHHHHHTTCCEEETT...
3106              CCCCCCCCCHHHHHHHHHHHHHHHHTCCCCTHHHHHHHCC
251      CCCCCCHHHHHHHHHTCSCHHHHHHHHHHHHTCCEETTTTEECCTT...
                               ...                        
10277    CCSCHHHHHHHHHHHHTCEEEEEETTSCEEEEEEEEECSSEEEEEC...
16593    CEEECSSEEEEEEEEEEECSSEEEEEEEETTTCCEEEEEEECSGGG...
293      CEEEEEEECCSSCCSSSCHHHHHHHHHHHHHHHHCSCCEEETTEEE...
9395     CCCEEEEEEEEESSCHHHHHHHHHCGGGGGGTSTTEEEEEESSSSS...
9614     CBCCEEEESCHHHHHHHHHHHHHHHTTTSSCCCCCEEPPPEEEEEE...
Name: dssp8, Length: 3747, dtype: object
Accuracy of the model is 0.2104687140091265
0.2104687140091265


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from models.naive_bayes_model import NaiveBayesModel

# Reload the dataset and redo the 80/20 split with the same random_state as the StochasticModel cell.
# NOTE(review): this rebinds df, y, X_*/y_* and model, which the StochasticModel cell also assigns.
df = pd.read_csv("./data/data.csv")


y = df['dssp8']
X_original = df['input']
X_train, X_test, y_train, y_test = train_test_split(X_original, y, test_size=0.2, random_state=42)

model = NaiveBayesModel()

model.fit(X_train, y_train)

# NOTE(review): the recorded output of this cell is a KeyError whose key is a full
# sequence string; presumably it is raised inside NaiveBayesModel (fit or predict) —
# confirm against models/naive_bayes_model.py before relying on y_pred.
y_pred = model.predict(X_test)

# Evaluation variants, currently disabled:
# print(model.evaluate(X_test, y_test))
# print(model.evaluate_rand(X_test, y_test))
# print(model.evaluate_bayes(X_test, y_test))
# print(model.evaluate_bayes_rand(X_test, y_test))

KeyError: 'MERAEILGVGTELLYGETLDTNTAEIARSLKPYALKVERTLRVADEVAPLAREVEEAFARARLVVLSGGLGPTPDDVTREAVALALGEPLELDEAVLGEIEAFFRARGRAMPEANRKQAMRIPSATWLKNPRGTAPGWWVRKGGKDLVLLPGPPPEWRPMWQEVLPRLGLPRRPYAERVLKTWGIGESEIVERLGPLFVREEEVEVGTYPKVHGVEVVVRGREDRVAELAERIKKKLLKEVWGEGEMTLAEAVKRRMEREGATLSTMESLTGGLLGAEITRVPGASRFYLGGVVSYSVGAKARFGVPQDLLSRTVSAETARAMAEAARSLFGSTYALATTGVAGPDPLEGEPPGTVYVALAGPTGAEVRRYRFPGDRETVRLRSVYAALALLVT'