In [264]:
import gensim
from gensim import corpora, models
import math
import csv
import random

In [274]:
tokenizer = RegexpTokenizer(' ')
TEST_DATA_SET = []
DATA_FILE = "../Data/data_full_simple.csv"
PROTEIN_SEQUENCE_INDEX = 1
TM_INDEX = 0
INTERVAL = 10
TOPICS_TO_GET = 20
WORDS_PER_TOPIC = 2

In [275]:
def set_dictionary(interval, min_temp, max_temp):
    """
    :param interval: 
    :param min_temp: 
    :param max_temp: 
    :return: an array of ranges [min, max, string representing the range]
    :return: a dictionary mapping the range to a (min, max) tuple
    """
    vals_and_strings = []
    range_dict = {}
    for i in [x for x in range(int(min_temp) - interval, int(max_temp) + interval) if x % interval == 0]:
        vals_and_strings.append([int(i), int(i + interval - 1), str(i) + "-" + str(i + interval - 1)])
        range_dict[str(i) + "-" + str(i + interval - 1)] = (int(i), int(i + interval - 1))
    return vals_and_strings, range_dict

def separate_learn_and_test_data(data):
    """
    :param data: 
    :return: 
    """
    random.shuffle(data)
    return(data[int(len(data) / 5):], data[:int(len(data) / 5)])
    
def set_data_based_on_dictionary(data, vals_and_strings):
    """
    :param data: 
    :param vals_and_strings:
    :return: 
    """
    topic_analysis_data = {}
    for item in vals_and_strings:
        for point in data:
            if point[0] >= item[0] and point[0] <= item[1]:
                if topic_analysis_data.get(item[2]) is not None:
                    topic_analysis_data[item[2]].append(point[1])
                else:
                    topic_analysis_data[item[2]] = [point[1]]
    return topic_analysis_data

def get_predicted_range(learned_topics, test_result_topics, range_dict):
    """
    :param learned_topics: a dictionary matching each learned topic to the amino acids in that topic
    :param test_result_topic: a 2d array of length TOPICS_TO_GET, returning topic from running the tests
    :param range_dict: a dictionary mapping the range to a (min, max) tuple
    :return: returns the range predicted, a (min, max) tuple
    """
    matching_topics = []
    for topic in learned_topics.keys():
        for result in test_result_topics:
            if result in learned_topics[topic]:
                matching_topics.append(topic)
    # Convert matching topics array of strings (e.g. ["320-329", "330-339",...]) to integer form
    min_val = None
    max_val = None
    for t in matching_topics:
        if min_val is None or range_dict[t][0] < min_val:
            min_val = range_dict[t][0]
        if max_val is None or range_dict[t][1]  > max_val:
            max_val = range_dict[t][0] 
    return min_val, max_val
    # Get entire range and return
    
def get_topic_data(data_array):
    """
    :param data_array: an array of strings, representing the a range of data
    :return: a 2D array of topics, where each topic is a few amino acids, of the form [['A', 'G', 'L'], ['G', 'F', 'L']...]
    """
    texts = []
    for item in data_array:
        amino_acid = item.upper()
        new_str = ""
        for ch in amino_acid:
            new_str += ch + " "
        new_str.strip()
        if new_str != "":
            to_append = new_str.split(" ")
            to_append.pop(len(to_append) - 1)
            texts.append(to_append)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # LDA - Latent Dirichlet Allocation
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=TOPICS_TO_GET, id2word=dictionary, passes=10) #alpha='auto', eval_every=5)  #
    # print(ldamodel.print_topics(num_topics=1, num_words=3))
    topics = []
    for i in range(TOPICS_TO_GET):
        topic_filetered = []
        topic = ldamodel.show_topic(i)
        for j in range(WORDS_PER_TOPIC):
            topic_filetered.append(topic[j][0])
        topics.append(topic_filetered)
#     print(topics)
    return topics


def get_data(file_name):
    """
    :param file_name: the file name to be read
    :return: the data, where the TM_INDEX gives the index of the TM and PROTEIN_SEQUENCE_INDEX gives the index of the protein
    """
    with open(file_name, encoding='utf-8-sig') as f:
        csv_reader = csv.reader(f)
        two_d_arr = []
        for row in csv_reader:
            row_in_row = []
            for v in row:
                v = v.strip()
                if len(row_in_row) == PROTEIN_SEQUENCE_INDEX:
                    row_in_row.append(str(v))
                elif len(row_in_row) == TM_INDEX:
                    row_in_row.append(float(v))
            two_d_arr.append(row_in_row)
        return two_d_arr

In [276]:
# Get data
print("Starting...")
data = get_data(DATA_FILE)
min_val = None
max_val = None
for i in data:
    if min_val is None or i[TM_INDEX] < min_val:
        min_val = i[TM_INDEX]
    if max_val is None or i[TM_INDEX] > max_val:
        max_val = i[TM_INDEX]
ranges, range_dict = set_dictionary(INTERVAL, min_val, max_val)
test_data, learning_data = separate_learn_and_test_data(data)

# Add to topics in dictionary
# Learn
dictionary_data = set_data_based_on_dictionary(learning_data, ranges)
learned_topics = {}
for key in dictionary_data.keys():
    learned_topics[key] = get_topic_data(dictionary_data[key])
    
# Test
dictionary_data = set_data_based_on_dictionary(test_data, ranges)
test_results = {}
for item in test_data:
    data = get_topic_data(item[PROTEIN_SEQUENCE_INDEX])
#     print(data)
    if data is not None:
        test_results[item[PROTEIN_SEQUENCE_INDEX]] = data, item[TM_INDEX]

# Get 3 ranges
win = 0
loss = 0
none_loss = 0
# print(test_results)
for result in test_results.keys():
    true_val = test_results[result][1]
    range_result = get_predicted_range(learned_topics, test_results[result][0], range_dict)
    if range_result[0] is not None and range_result[1] is not None:
        if int(true_val) >= range_result[0] and int(true_val) <= range_result[1]:
            print("win: " + str(range_result))
            win += 1
        else:
            print("loss:" + str(range_result))
            loss += 1
    else:
        none_loss += 1
print(win)
print(loss)
print(none_loss)
print("Final results = " + str(win/(loss + win + none_loss)))
        
# Then get topics for test
# If you get similar topics for test set, choose that

Starting...
win: (310, 360)
win: (330, 380)
loss:(310, 360)
loss:(340, 340)
win: (310, 360)
win: (310, 380)
loss:(330, 330)
win: (310, 340)
win: (320, 380)
win: (320, 380)
loss:(310, 340)
loss:(330, 360)
win: (330, 360)
win: (320, 380)
win: (310, 380)
win: (320, 380)
loss:(320, 320)
win: (320, 360)
loss:(320, 380)
loss:(330, 340)
loss:(320, 360)
win: (310, 360)
win: (310, 360)
loss:(310, 360)
loss:(320, 380)
win: (310, 360)
loss:(310, 340)
win: (310, 360)
win: (310, 380)
win: (320, 340)
win: (330, 380)
win: (310, 380)
loss:(310, 360)
win: (320, 360)
win: (320, 380)
win: (320, 360)
win: (310, 360)
win: (330, 360)
win: (310, 360)
win: (310, 360)
win: (320, 360)
loss:(320, 340)
win: (320, 360)
loss:(320, 360)
loss:(320, 320)
win: (320, 360)
win: (320, 340)
win: (310, 340)
win: (310, 360)
win: (320, 360)
win: (310, 380)
win: (310, 360)
win: (310, 360)
win: (310, 360)
loss:(310, 330)
loss:(340, 340)
win: (310, 340)
win: (310, 380)
loss:(310, 340)
win: (310, 340)
win: (330, 340)
win: (310, 3