In [11]:
import math
import random
import re
from collections import defaultdict
from itertools import product
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [12]:
from random_dataset import create_random_dataset
from evolutionary_algorithm import EA
from greedy_algorithm import GreedyAlgorithm
from neg_sel import NegativeSelection, load_data

# Languages

In [7]:
# Alphabet used by the language experiments: the underscore followed by the
# lowercase letters a-z (27 symbols in total).
char_aminoacids = ['_'] + list('abcdefghijklmnopqrstuvwxyz')
# Same symbols as a set, for membership tests.
allowed = set(char_aminoacids)

In [8]:
# Build the English training file: lowercase the source text, map it onto the
# allowed alphabet and cut it into fixed-length, non-overlapping strings.
languages_data_dir = "./data/languages/"
english_data_dir = languages_data_dir + "english/"
input_path = english_data_dir + "moby_dick.txt"
output_path = english_data_dir + "english_6_train.txt"

with open(input_path, "r", encoding="utf-8") as f:
    text = f.read().lower()

# Every character outside the alphabet becomes '_'; runs of '_' are then
# collapsed into a single '_' so no unnecessary underscores remain.
processed = re.sub(r'_+', '_', ''.join(c if c in allowed else '_' for c in text))

length = 6
# Only start offsets that leave room for a full chunk are visited, so a
# shorter leftover at the end of the text is dropped and every line has
# exactly `length` characters.
lines = [
    processed[start:start + length]
    for start in range(0, len(processed) - length + 1, length)
]

# One chunk per line, no trailing newline after the last chunk.
with open(output_path, "w", encoding="utf-8") as f:
    f.write('\n'.join(lines))

In [10]:
# Create the test set: same preprocessing as the training file, applied to a
# different source text and reduced to a fixed-size random sample.
languages_data_dir = "./data/languages/"
english_data_dir = languages_data_dir + "english/"
input_path = english_data_dir + "bible.txt"
output_path = english_data_dir + "english_6_test.txt"

with open(input_path, "r", encoding="utf-8") as f:
    text = f.read().lower()

# Replace every character outside the allowed alphabet with '_'.
processed = ''.join([c if c in allowed else '_' for c in text])

# remove any unnecessary '_' characters
processed = re.sub(r'_+', '_', processed)

length = 6
lines = [processed[i:i+length] for i in range(0, len(processed), length)]

# lines must have character amount equal to length
lines = [line for line in lines if len(line) == length]

sample_size = 2000
if sample_size > len(lines):
    # Fail with the actual numbers instead of random.sample's generic message.
    raise ValueError(
        f"Cannot sample {sample_size} lines: only {len(lines)} available from {input_path}"
    )

# A dedicated generator with a fixed seed (the same value the notebook passes
# to create_random_dataset and GreedyAlgorithm) makes the written test file
# identical on every run without touching the global `random` state.
sampled_lines = random.Random(42).sample(lines, sample_size)

with open(output_path, "w", encoding="utf-8") as f:
    f.write('\n'.join(sampled_lines))

# Language Discrimination

In [18]:
# Read the training strings written earlier in the notebook; the file has no
# header row, so pandas labels its single column 0.
english_train_path = english_data_dir + "english_6_train.txt"
self_english_data = pd.read_csv(english_train_path, header=None)

# Ask the project helper for n=10000 rows with a fixed seed and keep column 0
# of its result as a plain Python list.
# NOTE(review): the helper's sampling strategy is not visible here - confirm
# in random_dataset.py.
sampled_english_data = create_random_dataset(
    df=self_english_data,
    n=10000,
    seed=42,
)[0].to_list()

# Displayed as the cell output.
len(sampled_english_data)

10000

In [19]:
# Alphabet for motif enumeration: '_' plus the lowercase letters a-z.
amino_acids = "_abcdefghijklmnopqrstuvwxyz"

# Every length-6 string over the alphabet, in the order itertools.product
# yields them.
# NOTE(review): this materialises 27**6 = 387,420,489 strings in one list,
# which is likely to need a very large amount of RAM - confirm the kernel can
# hold it before re-running.
motifs = list(map("".join, product(amino_acids, repeat=6)))

print(f"Total number of motifs: {len(motifs)}")

Total number of motifs: 387420489


In [20]:
# Keep 1% of the enumerated motifs, rounded up to a whole number.
num_sampled_motifs = math.ceil(len(motifs) * 0.01)
# Draw the subset with a dedicated, seeded generator so the motif sample (and
# everything derived from it) is the same on every run; 42 matches the seed
# the notebook passes to create_random_dataset and GreedyAlgorithm.
sampled_motifs = random.Random(42).sample(motifs, num_sampled_motifs)

In [22]:
# Run the project's greedy optimiser on the sampled English strings and the
# sampled motifs, then persist the resulting data set.
# NOTE(review): the meaning of t=3 is defined in greedy_algorithm.py and is
# not visible here - confirm there.
greedy_optimizer = GreedyAlgorithm(peptides=sampled_english_data, motifs=sampled_motifs, t=3, seed=42)
optimized_dataset = greedy_optimizer.run()

# Write next to the other English data files. The path is built from the
# directory, not from english_train_path: that variable is already a full file
# path, so appending a file name to it produced
# ".../english_6_train.txtgreedy_english_10000_1.txt".
greedy_output_path = english_data_dir + "greedy_english_10000_1.txt"
with open(greedy_output_path, "w", encoding="utf-8") as f:
    # One item per line, each terminated by a newline.
    for item in optimized_dataset:
        f.write(f"{item}\n")

print(f"Size of greedily optimized data set: {len(optimized_dataset)}")

test12
Size of greedily optimized data set: 3574
