In [1]:
import os
import pandas as pd
# Move the process working directory one level up and then into "data".
# Both moves are relative to the current working directory, so re-running
# this cell moves again from wherever the kernel currently is.
os.chdir("../")
os.chdir("data")

In [2]:
import collections
from collections import OrderedDict
from matplotlib import pyplot as plt
from matplotlib import cm
import pylab
import math
import numpy as np
# Switch matplotlib's interactive mode off for the rest of the session.
plt.ioff()

In [3]:
from tqdm import tqdm

In [4]:
class CGR():
    """Chaos Game Representation (CGR) of a nucleotide sequence.

    Build the k-mer frequency matrix with ``read`` (sequence already in
    memory) or ``load_fasta`` (sequence stored in a file), then render it
    with ``show`` (on screen) or ``show_v2`` (PNG file).
    """
    # Class-level defaults; instances overwrite them as data is loaded.
    K = 0        # k-mer length used for the most recent matrix
    c = None     # most recent CGR matrix (2-D numpy array); None until built
    h = ""       # header / label of the most recently loaded sequence
    Data = ""    # the most recently loaded sequence
    i = 0        # identifier; show_v2 uses str(i) as the PNG file stem

    def __init__(self,a):
        """Store ``a`` as the identifier used for the output file name."""
        self.i=a

    def read_fasta(self,loc):
        """Read a single-record FASTA-style file.

        Returns ``(data, head)`` where ``head`` is the first line of the file
        and ``data`` is every following line concatenated without newlines.
        """
        # Context manager so the handle is closed even if read() raises.
        with open(loc) as f:
            s1 = f.read()
        lines = s1.split("\n")
        data = "".join(lines[1:])
        head = "".join(lines[0:1])
        return data,head

    def count_kmers(self,sequence, k):
        """Count every length-``k`` window of ``sequence``.

        Windows containing the unknown-base symbol ``"N"`` are not counted.
        Returns a ``defaultdict(int)`` mapping k-mer -> occurrences.
        """
        d = collections.defaultdict(int)
        # Bound the loop by the sequence that was passed in, not self.Data:
        # otherwise a longer self.Data yields truncated/empty "k-mers".
        for i in range(len(sequence)-(k-1)):
            kmer = sequence[i:i+k]
            # Popping the key "N" afterwards only works for k == 1, so
            # N-containing windows are filtered here for every k.
            if "N" in kmer:
                continue
            d[kmer] +=1
        return d

    def probabilities(self,kmer_count, k):
        """Convert k-mer counts into relative frequencies.

        Each count is divided by the number of length-``k`` windows in
        ``self.Data`` (``len(self.Data) - k + 1``).
        Returns a ``defaultdict(float)`` mapping k-mer -> frequency.
        """
        probabilities = collections.defaultdict(float)
        windows = len(self.Data) - k + 1
        for key, value in kmer_count.items():
            probabilities[key] = float(value) / windows
        return probabilities

    def chaos_game_representation(self,probabilities, k):
        """Place k-mer frequencies on a ``2**k x 2**k`` grid.

        Each base of a k-mer selects a quadrant of the current square
        (A: no offset, T: +x, C: +y, G: +x and +y) and the square is halved
        after every base. The resulting matrix is scaled so its maximum is 1.
        K-mers containing symbols other than A, C, G, T are skipped.
        Returns a float numpy array; all zeros if nothing could be placed.
        """
        array_size = int(math.sqrt(4**k))
        chaos = np.zeros((array_size, array_size), dtype=float)
        valid = set("ACGT")
        for key, value in probabilities.items():
            # An unrecognised symbol would apply no offset (i.e. behave like
            # "A") and overwrite the cell of a genuine k-mer, so skip it.
            if not valid.issuperset(key):
                continue
            maxx = array_size
            maxy = array_size
            posx = 1
            posy = 1
            for char in key:
                if char == "T":
                    posx +=  maxx/2
                elif char == "C":
                    posy += maxy/2
                elif char == "G":
                    posx += maxx/2
                    posy += maxy/2
                maxx /= 2
                maxy /= 2
            chaos[int(posy-1)][int(posx-1)] = value
        m = float(np.amax(chaos))
        if m == 0:
            # Nothing was placed: return zeros instead of a 0/0 NaN matrix.
            return chaos
        return chaos/m

    def load_fasta(self,loc,k):
        """Read the file at ``loc`` and build its CGR matrix for ``k``-mers.

        Updates ``Data``, ``K``, ``c`` and ``h``; returns the matrix.
        """
        data,head = self.read_fasta(loc)
        return self.read(data,head,k)

    def read(self,data,head,k):
        """Build the CGR matrix for an in-memory sequence.

        ``data`` is the sequence, ``head`` its label and ``k`` the k-mer
        length. Updates ``Data``, ``K``, ``c`` and ``h``; returns the matrix.
        """
        self.Data = data
        f4 = self.count_kmers(data, k)
        f4_prob = self.probabilities(f4, k)
        chaos_k4 = self.chaos_game_representation(f4_prob, k)
        self.c = chaos_k4
        self.h = head
        # Remember k so the title drawn by show() reports the real length.
        self.K = k
        return chaos_k4

    def show(self):
        """Display the current matrix with a descriptive title."""
        plt.figure(figsize=(12,12))
        # NOTE(review): h[2:] drops the first two characters of the label;
        # presumably meant to strip a header prefix - confirm.
        plt.title('CGR of '+str(self.K)+'-mers for '+self.h[2:])
        plt.pcolor(self.c,cmap=cm.gray_r)
        plt.show()

    def show_v2(self):
        """Save the current matrix as ``<i>.png`` (no axes, no padding)."""
        fig = plt.figure(figsize=(12,12))
        try:
            plt.axis('off')
            plt.imshow(self.c, cmap=cm.gray_r)
            plt.savefig(str(self.i)+".png",bbox_inches='tight',pad_inches=0)
        finally:
            # Always release the figure; otherwise a failed save leaves it
            # open and figures pile up across a long batch.
            plt.close(fig)

In [5]:
# Snapshot of the entries in the current working directory. os.listdir()
# returns them in arbitrary order, so sort to make later processing (and the
# order of the progress bars) reproducible between runs.
oslist = sorted(os.listdir())

In [7]:
# For every .xlsx workbook in the listing, write one CGR image per row into a
# directory named after the workbook. Workbooks are read without a header
# row: column 0 supplies the label / file name, column 1 the sequence.
failed = []  # (workbook, row index, error) for every row that was skipped
for i in oslist:
    if i.split('.')[-1] != 'xlsx':
        continue
    x = pd.read_excel(i,engine = 'openpyxl',header=None)
    acc = x[0].values.tolist()
    seq = x[1].values.tolist()
    di = i.split('.')[0]
    # exist_ok so re-running the cell does not die on an existing directory.
    os.makedirs(di, exist_ok=True)
    os.chdir(di)
    try:
        for b in tqdm(range(len(acc))):
            try:
                # Commas and dots are stripped from the label before it is
                # used as the image file stem.
                a = CGR(acc[b].replace(',',"").replace('.',""))
                a.read(seq[b],acc[b],7)
                a.show_v2()
            except Exception as err:
                # Best effort: keep going, but record what was skipped and
                # let KeyboardInterrupt / SystemExit propagate.
                failed.append((i, b, repr(err)))
                continue
    finally:
        # Return to the parent directory even if the batch is interrupted.
        os.chdir('..')
if failed:
    print("%d row(s) could not be rendered; details are in `failed`" % len(failed))

  fig = plt.figure(figsize=(12,12))
100%|██████████| 29/29 [00:03<00:00,  9.37it/s]
100%|██████████| 470/470 [01:00<00:00,  7.73it/s]
100%|██████████| 53/53 [00:08<00:00,  6.42it/s]
100%|██████████| 198/198 [00:34<00:00,  5.75it/s]
100%|██████████| 500/500 [01:04<00:00,  7.78it/s]
100%|██████████| 20/20 [00:03<00:00,  6.06it/s]
100%|██████████| 20/20 [00:02<00:00,  6.85it/s]
100%|██████████| 20/20 [00:02<00:00,  7.20it/s]
100%|██████████| 125/125 [00:17<00:00,  7.18it/s]
100%|██████████| 126/126 [00:19<00:00,  6.46it/s]
100%|██████████| 121/121 [00:18<00:00,  6.62it/s]
100%|██████████| 122/122 [00:17<00:00,  6.86it/s]
100%|██████████| 403/403 [00:36<00:00, 11.05it/s]
100%|██████████| 500/500 [01:25<00:00,  5.86it/s]
100%|██████████| 210/210 [00:32<00:00,  6.55it/s]
100%|██████████| 20/20 [00:02<00:00,  7.05it/s]
100%|██████████| 49/49 [00:07<00:00,  6.91it/s]
100%|██████████| 49/49 [00:06<00:00,  7.02it/s]
100%|██████████| 222/222 [00:31<00:00,  7.04it/s]
100%|██████████| 9/9 [00:01<00