# AI - Project 2 - Genetics

### Part 0 - Cleaning data

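Every character that is not a letter (digits, punctuation marks, whitespace, ...) is treated as a word separator. By dropping these non-alphabetical characters and converting the remaining runs of letters to uppercase, we obtain the list of words that serves as our dictionary.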

In [None]:
import os
import random
import string
import numpy as np
from functools import cmp_to_key
import time


def get_dict(text):
    """Return the alphabetic words of ``text``, converted to uppercase.

    Every maximal run of characters for which ``str.isalpha`` is true
    becomes one entry; all other characters only act as separators.
    Words are returned in order of appearance, duplicates included.
    """
    # Replace every separator with a blank so one split() yields the words.
    normalised = ''.join(ch.upper() if ch.isalpha() else ' ' for ch in text)
    return normalised.split()


def rand_chromosome(m):
    """Return a random chromosome: a string of ``m`` uppercase ASCII letters."""
    genes = (random.choice(string.ascii_letters).upper() for _ in range(m))
    return ''.join(genes)


def build_pool(n, m):
    """Create a population of ``n`` random chromosomes, each of length ``m``."""
    return [rand_chromosome(m) for _ in range(n)]

def decode_char(key, ch):
    """Decode one cipher letter by shifting it back by the key letter.

    Both letters are mapped to their position in the alphabet (A/a = 0 ...
    Z/z = 25), the key position is subtracted from the cipher position
    modulo 26, and the result is returned as an uppercase letter.

    Args:
        key: the single key letter for this position.
        ch: the single encoded letter.

    Returns:
        The decoded letter, always uppercase.
    """
    def alphabet_index(letter):
        # ASCII lowercase letters are folded onto the same 0-25 range as
        # uppercase ones; without this they would come out shifted by 6,
        # because ord('a') - ord('A') == 32 and 32 % 26 == 6.
        base = ord('a') if 'a' <= letter <= 'z' else ord('A')
        return ord(letter) - base

    offset = (alphabet_index(ch) - alphabet_index(key)) % 26
    return chr(offset + ord('A'))

def repeat(text, letters_no):
    """Repeat ``text`` cyclically until it is ``letters_no`` characters long.

    The result is as many whole copies of ``text`` as fit, followed by the
    prefix of ``text`` needed to fill the remaining characters.
    """
    whole_copies, leftover = divmod(letters_no, len(text))
    return text * whole_copies + text[:leftover]

def count_letters(text):
    """Return how many characters of ``text`` are alphabetic (``str.isalpha``)."""
    return sum(1 for ch in text if ch.isalpha())


def decode_text(key, encoded_text):
    """Decode ``encoded_text`` with the repeating key ``key``.

    The key is repeated (and uppercased) so that there is exactly one key
    letter per alphabetic character of the text. Alphabetic characters are
    replaced by ``decode_char(key_letter, character)``; every other
    character is copied unchanged and does not consume a key letter.

    Args:
        key: the key string; it is cycled over the letters of the text.
        encoded_text: the cipher text.

    Returns:
        The decoded text, with the same length and the same non-letter
        characters as ``encoded_text``.
    """
    key = repeat(key, count_letters(encoded_text)).upper()
    decoded = []
    k = 0  # index of the next unused key letter

    # Collect the characters and join once at the end; rebuilding the whole
    # output string for every letter is quadratic in the text length.
    for ch in encoded_text:
        if ch.isalpha():
            decoded.append(decode_char(key[k], ch))
            k += 1
        else:
            decoded.append(ch)

    return ''.join(decoded)


def decode_dict(key, encoded_text):
    """Decode ``encoded_text`` with ``key`` and return its list of words."""
    return get_dict(decode_text(key, encoded_text))




def fitness(chromosome, encoded_text, global_dict):
    """Score a candidate key by the number of unknown decoded words.

    The text is decoded with ``chromosome``; the distinct decoded words
    that do not occur in ``global_dict`` are counted and the score is
    ``1 - unknown_word_count / len(encoded_text)``. A score of 1 means that
    every decoded word occurs in ``global_dict``.

    Args:
        chromosome: the candidate key.
        encoded_text: the cipher text.
        global_dict: iterable of known (uppercase) words. Passing a ``set``
            or ``frozenset`` avoids scanning the whole vocabulary per call.

    Returns:
        The score as a float.
    """
    # difference() takes any iterable, so the vocabulary is not copied into
    # a new set on every call.
    unknown_words = set(decode_dict(chromosome, encoded_text)).difference(global_dict)
    return 1 - len(unknown_words) / len(encoded_text)


def fitness2(chromosome, encoded_text, global_dict):
    """Score a candidate key by the total length of unknown decoded words.

    The text is decoded with ``chromosome``; the lengths of the distinct
    decoded words that do not occur in ``global_dict`` are summed and the
    score is ``1 - unknown_letters / total_length``, where ``total_length``
    is the length of ``encoded_text`` joined into one string. A score of 1
    means that every decoded word occurs in ``global_dict``.

    Args:
        chromosome: the candidate key.
        encoded_text: the cipher text.
        global_dict: iterable of known (uppercase) words. Passing a ``set``
            or ``frozenset`` avoids scanning the whole vocabulary per call.

    Returns:
        The score as a float.
    """
    total_length = len(''.join(encoded_text))
    # difference() takes any iterable, so the vocabulary is not copied into
    # a new set on every call.
    unknown_words = set(decode_dict(chromosome, encoded_text)).difference(global_dict)
    unknown_letters = len(''.join(unknown_words))

    return 1 - unknown_letters / total_length


def crossover(chr1, chr2, n):
    """Perform an ``n``-point crossover between two chromosomes.

    ``n`` distinct cut positions are sampled from the indices of ``chr1``;
    at each cut the two chromosomes exchange their tails (everything from
    the cut position onwards). Returns the two resulting chromosomes.
    """
    cut_points = random.sample(range(len(chr1)), n)

    for cut in cut_points:
        # Swap the tails after this cut in one simultaneous assignment.
        chr1, chr2 = chr1[:cut] + chr2[cut:], chr2[:cut] + chr1[cut:]

    return chr1, chr2


def u_crossover(chr1, chr2, p):
    """Uniform crossover: swap each gene pair independently with probability ``p``.

    For every index of ``chr1`` one Bernoulli(``p``) draw decides whether
    the genes at that index are exchanged between the two chromosomes.
    Returns the two resulting chromosomes as strings.
    """
    first, second = list(chr1), list(chr2)

    for idx in range(len(first)):
        if np.random.binomial(1, p) == 1:
            first[idx], second[idx] = second[idx], first[idx]

    return ''.join(first), ''.join(second)


def crossover_or_pass(chr1, chr2, p, p2):
    """Apply uniform crossover with probability ``p``, else return the pair as is.

    When the crossover takes place, ``p2`` is the per-gene swap probability
    handed to ``u_crossover``.
    """
    should_cross = np.random.binomial(1, p) != 0
    if should_cross:
        return u_crossover(chr1, chr2, p2)
    return chr1, chr2


def mutate(chro, p):
    """Return a copy of ``chro`` with each gene re-drawn with probability ``p``.

    For every gene one Bernoulli(``p``) draw decides whether it is replaced
    by a random uppercase ASCII letter or kept unchanged.
    """
    return ''.join(
        random.choice(string.ascii_letters).upper()
        if np.random.binomial(1, p) == 1
        else gene
        for gene in chro
    )


def get_upsprings(pool, p_cross, p_ucross, p_mutate):
    """Breed the next generation from ``pool`` (modified in place).

    The pool is shuffled and split into disjoint neighbouring pairs. Each
    pair goes through ``crossover_or_pass`` and both results are mutated.
    If the pool has an odd size, the unpaired last chromosome is only
    mutated.

    Args:
        pool: list of parent chromosomes; shuffled and overwritten in place.
        p_cross: probability that a pair is crossed over at all.
        p_ucross: per-gene swap probability of the uniform crossover.
        p_mutate: per-gene mutation probability.

    Returns:
        The same list object, now holding the offspring.
    """
    random.shuffle(pool)

    # Step by two so that every chromosome belongs to exactly one pair;
    # overlapping pairs would cross and mutate interior chromosomes twice.
    for i in range(0, len(pool) - 1, 2):
        pool[i], pool[i + 1] = crossover_or_pass(pool[i], pool[i + 1], p_cross, p_ucross)
        pool[i] = mutate(pool[i], p_mutate)
        pool[i + 1] = mutate(pool[i + 1], p_mutate)

    # An odd-sized pool leaves one chromosome without a partner.
    if len(pool) % 2 == 1:
        pool[-1] = mutate(pool[-1], p_mutate)

    return pool


class Decoder:
    """Genetic-algorithm search for the key of a repeating-key letter cipher.

    Attributes:
        global_txt: reference text whose words form the known vocabulary.
        encoded_txt: the cipher text to decode.
        keyLen: number of letters of every candidate key (chromosome).
        encoded_dict: uppercase words of ``encoded_txt`` (via ``get_dict``).
        global_dict: uppercase words of ``global_txt`` (via ``get_dict``).
        best_is_repeated: number of consecutive generations whose best key
            equalled the best key of the generation before.
        last_best: best key of the most recently evaluated generation.
        pool_size: number of chromosomes in the population.
        pass_fittest: fraction of the population that is carried over to
            the next generation without selection, crossover or mutation.
    """

    def __init__(self, globalText, encodedText, keyLength):
        self.global_txt = globalText
        self.encoded_txt = encodedText
        self.keyLen = keyLength
        self.encoded_dict = get_dict(self.encoded_txt)
        self.global_dict = get_dict(self.global_txt)
        self.best_is_repeated = 0
        self.last_best = None
        self.pool_size = 100
        self.pass_fittest = 0.01

    def find_key(self, max_generations=None):
        """Evolve candidate keys until one decodes the text perfectly.

        Args:
            max_generations: optional cap on the number of generations that
                are bred. ``None`` (the default) keeps searching until a
                key with a ``fitness2`` score of 1 is found, which is not
                guaranteed to happen.

        Returns:
            The first key whose ``fitness2`` score is 1 or, if
            ``max_generations`` is reached first, the best key of the last
            evaluated generation.
        """
        # Stagnation tracking describes a single search run.
        self.best_is_repeated = 0
        self.last_best = None

        pool = build_pool(self.pool_size, self.keyLen)

        # Derive both counts from one integer so the number of selection
        # weights always equals the number of non-elite chromosomes;
        # rounding the two float products separately can make them differ.
        elite_count = int(len(pool) * self.pass_fittest)
        breeder_count = len(pool) - elite_count
        # Rank-based weights: the fittest non-elite chromosome gets the
        # largest weight, the least fit one gets weight 0.
        weights = list(range(breeder_count - 1, -1, -1))

        # Build the vocabulary set once instead of once per fitness call.
        known_words = frozenset(self.global_dict)
        generation = 0

        while True:
            # Shuffle first so that ties are ordered randomly by the
            # (stable) sort below.
            random.shuffle(pool)

            # The pool usually contains duplicates after selection, so each
            # distinct chromosome is scored only once per generation.
            scores = {
                chromosome: fitness2(chromosome, self.encoded_txt, known_words)
                for chromosome in set(pool)
            }
            pool.sort(key=scores.__getitem__, reverse=True)
            best = pool[0]

            if best == self.last_best:
                self.best_is_repeated += 1
            else:
                self.best_is_repeated = 0
            self.last_best = best

            if scores[best] == 1:
                return best
            if max_generations is not None and generation >= max_generations:
                return best

            # Stuck on the same best key for a while: perturb it heavily.
            if self.best_is_repeated >= 10:
                pool[0] = mutate(pool[0], 0.4)

            # The first ``elite_count`` chromosomes pass through untouched;
            # the rest is re-sampled by rank and then bred.
            parents = random.choices(pool[elite_count:], weights=weights, k=breeder_count)
            pool[elite_count:] = get_upsprings(parents, 0.95, 0.9, 0.02)

            generation += 1

    def decode(self):
        """Find the key, print it with the decoded text, and return the text."""
        key = self.find_key()
        decoded = decode_text(key, self.encoded_txt)
        print(key, decoded)
        return decoded
        
        
# Read both input files with context managers so the file handles are
# closed as soon as their contents have been read.
with open('encoded_text.txt') as encoded_file:
    encodedText = encoded_file.read()
with open('global_text.txt') as global_file:
    globalText = global_file.read()

d = Decoder(globalText, encodedText, keyLength = 14)
decodedText = d.decode()

### Part 1

Genes and chromosomes: Each chromosome is a candidate key, i.e. a 14-letter string, and its genes are the individual letters of that string.

**Note:** To keep the answers uniform, all letters are converted to uppercase.



### Part 3

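In this part we define the fitness function that helps us determine the better-fitting chromosomes in the pool. The score is based on the decoded words that are not available in the global_dict: the more of them there are, the lower the score, and a score of 1 means that every decoded word was found in the global_dict.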



#### Answering the questions

1. Too big or too small a mating pool: A very small mating pool covers only a very small number of candidate states per generation, which can make the time needed to find the answer extremely long. A large mating pool, on the other hand, lets us check many states more quickly, but it certainly increases the amount of computation needed. As a result, a medium-sized pool is the best option, and its size is best determined by trial and error.

2. Increasing the size of the mating pool will not only immensely magnify the computation time needed (the algorithm does not guarantee returning an answer and searches the space rather randomly) but will also help reproduce the least fit answers as well as the most fit ones. In other words, the competition aspect of the algorithm will be removed.

3. Mutation expands the searched space by making random changes that are sometimes unavailable in the parents. In this way, the effect of the randomly selected initial pool is decreased and the diversity improves. || By using mutation we ensure that offspring can have features that are not available in their parents. By using crossover, we try to build better-fitting solutions in a purposeful way.

4. Crossover has a higher impact on getting a better precision and usually has a rate between 0.8 and 0.9. On the other hand, the mutation rate is commonly between 0.05 and 0.1 and increases the chance that the algorithm will reach the optimal solution. Because of the randomness in mutation's nature, it has a higher influence on the outcome. In other words, adding 5 percent to the mutation rate can affect the algorithm far more than adding 5 percent to (or subtracting 5 percent from) the crossover rate. || Summary: An increase in the crossover rate can improve the precision much more than an increase in the mutation rate. But the effect of a given change in the mutation rate (on precision) is higher and faster.

5. By defining a repetition-detection function and increasing the crossover rate and the mutation rate (setting the mutation rate equal to 1), we can possibly solve this problem. But in case the problem persists, resetting the algorithm (randomly re-initializing the pool) will be a good option.

6. By using crossover in a loop and assigning a random mating pool at a certain limit, we can help with the diversity problem (which is usually solved by mutation). On the other hand, breeding the fittest choices and evolving our answer in every generation is almost impossible by using mutation alone.

7.  
    - toggling between various crossover functions
    - choosing optimal rates for passing current generation, mutation, crossover, and ...
    - choosing an optimal mating pool size
    - Writing a class for chromosomes (To sort, rank, and select faster)
    - ...