In [1]:
import pandas as pd
import numpy as np
import os
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

import collections
init_notebook_mode(connected=True)

In [3]:
def recursively_default_dict():
    """Build an arbitrarily nested defaultdict.

    Every missing key yields another dictionary of the same kind, so
    values can be assigned at any depth without creating the
    intermediate levels first.
    """
    nested = collections.defaultdict(recursively_default_dict)
    return nested


### Directory prefix used for the summary tables written later in the notebook.
Home= 'Summary_CSVs/'

## Read the per-chromosome summary statistics (chromosomes 1 to 12),
## one tab-separated file per chromosome, from the working directory.
chrom_frames= []

for Chr in range(1,13):
    filename= 'Summary_assignments_CHR'+ str(Chr).zfill(2)+ '_Z4.0_bin5.txt'
    chr_stats= pd.read_csv(filename,sep= '\t')
    chrom_frames.append(chr_stats)

## Stack the per-chromosome tables, in chromosome order, into one frame.
## Row labels are not reset, so they repeat across chromosomes.
Summary= pd.concat(chrom_frames)

## Read accession data
order_core= pd.read_csv('Order_core.txt')


In [4]:
## Preview the first rows of the accession table.
order_core.head()

Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex
0,CX59,"MILAGROSA,_ZAWA_BANDAY",Philippines,As5,4,1,cB_(Bas),aro,296
1,CX65,DOMSIAH,Iran,As1,4,1,cB_(Bas),aro,301
2,CX67,BINAM,Iran,As1,4,1,cB_(Bas),aro,303
3,CX104,SADRI_RICE_1,Iran,As1,4,1,cB_(Bas),aro,338
4,CX143,KHASAR,Iran,As1,4,1,cB_(Bas),aro,372


In [5]:
## Preview the first rows of the concatenated per-chromosome summary.
Summary.head()

Unnamed: 0,ID,chrom,color,t_length,mean_size,N
0,IRIS_313-11293,1,silver,6398963,10337.581583,619
1,IRIS_313-11293,1,purple,4375921,8415.232692,520
2,IRIS_313-11293,1,black,1945008,4642.023866,419
3,IRIS_313-11293,1,blue,14079545,18848.119143,747
4,IRIS_313-11293,1,green,4471495,10070.934685,444


In [7]:
## Quick overview of the accession table, followed by the summary statistics.
n_countries= len(order_core.COUNTRY.unique())
print('Countries: {}'.format(n_countries))

## Distinct sNMF_K3 codes and Initial_subpop labels, in order of appearance.
code_labels= map(str, order_core.sNMF_K3.unique())
print('population codes: ' + ','.join(code_labels))
subpop_labels= map(str, order_core.Initial_subpop.unique())
print('populations: ' + ','.join(subpop_labels))

n_rows, n_cols= order_core.shape
print('Number of accessions in data.file: {}, number of columns: {}'.format(n_rows,n_cols))

## Every distinct (sNMF_K3, Initial_subpop) combination present in the table.
print('population / code:')
code_pairs= order_core[['sNMF_K3','Initial_subpop']].drop_duplicates()
print(code_pairs)

print(Summary.head())

Countries: 24
population codes: 4,5,0,1,2
populations: aro,admix,temp,aus,ind1A,ind2,japx,trop,indx,ind3,ind1B,subtrop
Number of accessions in data.file: 948, number of columns: 9
population / code:
     sNMF_K3 Initial_subpop
0          4            aro
48         4          admix
62         5           temp
64         5          admix
69         5            aus
71         5          ind1A
74         5            aro
75         5           ind2
78         5           japx
81         5           trop
84         5           indx
104        5           ind3
162        5          ind1B
169        0          ind1A
170        0           indx
171        0          ind1B
223        0           ind3
231        0           ind2
563        1            aus
628        2           temp
630        2           japx
651        2        subtrop
662        2           trop
               ID  chrom   color  t_length     mean_size    N
0  IRIS_313-11293      1  silver   6398963  10337.581583  619
1  IR

## Classification by group.

Analysis of classification extent across hierarchical factor variables sNMF_K3 and Initial_subpop.

In [8]:
## Display names for the sNMF_K3 codes; used for plot titles and output file names.
pop_codes= {
    0: 'Indica',
    1: 'Aus',
    2: 'Japonica',
    4: 'cBasmati',
    5: 'Admix'
}


### Select sNMF group (layer 1)
snmf= 2

## Accessions whose sNMF_K3 code equals the selected group.
snmf_subset= order_core[(order_core.sNMF_K3 == snmf)]

## Keep only the summary rows of accessions in the selected group
## (inner join on ID) and attach the accession metadata to them.
Merged= Summary.merge(snmf_subset,on='ID')

## Sum the statistics over chromosomes for every (color, ID) pair.
## The columns are selected with a list: the bare-tuple form
## ['t_length','mean_size','N'] is rejected by recent pandas releases.
## NOTE(review): mean_size is summed as well, which yields a sum of
## per-chromosome means rather than a mean — confirm this is intended.
group_colors= Merged.groupby(['color','ID'])[['t_length','mean_size','N']].sum()
print(group_colors.shape)

(2560, 3)


In [9]:
## Preview the summary rows merged with the accession metadata.
Merged.head()

Unnamed: 0,ID,chrom,color,t_length,mean_size,N,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex
0,B001,1,silver,5987056,8909.309524,672,HEIBIAO,China,As6,2,5,GJ-tmp,temp,-1
1,B001,1,purple,4543974,7794.123499,583,HEIBIAO,China,As6,2,5,GJ-tmp,temp,-1
2,B001,1,black,6870268,4757.803324,1444,HEIBIAO,China,As6,2,5,GJ-tmp,temp,-1
3,B001,1,blue,21648431,10073.723127,2149,HEIBIAO,China,As6,2,5,GJ-tmp,temp,-1
4,B001,1,green,2566633,5206.15213,493,HEIBIAO,China,As6,2,5,GJ-tmp,temp,-1


In [10]:
##################################################
### Proportion across Subgroups ###############
#################################################
## For every individual of the selected sNMF group, sum the total length
## (t_length) assigned to the colors in `color_choice` over all chromosomes,
## and pair that sum with the individual's K9_cluster label.

color_choice= ['red','yellow']

## Class name behind every color code.
dict_color= {
    'blue': 'Japonica',
    'yellow': 'Aus',
    'red': 'Indica',
    'green': 'Aus-Jap',
    'purple': 'Ind-Jap',
    'orange': 'Aus-Ind',
    'silver': 'Aus-Jap-Ind',
    'black': 'None'
}

## Not used in this cell; kept so that code relying on the name still runs.
Values= []

subgroup= order_core[(order_core.sNMF_K3 == snmf)]


sub_ids= [x for x in subgroup.ID]
## Set copy of the IDs for constant-time membership tests in the row loop.
sub_id_set= set(sub_ids)

## Use Merged data frame to sort values by subgroup
Nrow= Merged.shape[0]
sub_summary= Merged.to_dict(orient='list')

## Store[K9_cluster][color][ID][chrom] -> t_length
Store= recursively_default_dict()

for row in range(Nrow):
    if sub_summary['ID'][row] in sub_id_set:
        Store[sub_summary['K9_cluster'][row]][sub_summary['color'][row]][sub_summary['ID'][row]][sub_summary['chrom'][row]]= sub_summary['t_length'][row]


Vals= []

for subg in Store.keys():
    ## Individuals holding at least one of the chosen colors, first-seen order.
    ## The list is built from color_choice explicitly instead of relying on a
    ## loop variable left over from an earlier loop. .get() avoids inserting
    ## empty entries into Store when a color or individual is absent.
    guys= list(dict.fromkeys(
        it.chain.from_iterable(Store[subg].get(color, {}) for color in color_choice)
    ))
    for guy in guys:
        total= sum(
            sum(Store[subg].get(color, {}).get(guy, {}).values())
            for color in color_choice
        )
        Vals.append([subg,total])

## dtype=object keeps the sums numeric; a plain np.array() of mixed
## label/number rows would turn every entry into a string.
Vals= np.array(Vals, dtype=object)
Vals.shape

(320, 2)

In [11]:
## Row indices of Vals belonging to each subgroup label (first column of Vals).
Vals_indexes= {
    z:[x for x in range(Vals.shape[0]) if Vals[x,0] == z] for z in Store.keys()
}

## One box per subgroup. The sums are cast to float because Vals can hold
## them as strings when labels and numbers share a single numpy array.
Vals_fig= [go.Box(
    y= [float(Vals[x,1]) for x in Vals_indexes[i]],
    name= i,
    marker= dict(
        color= 'blue'
    ),
) for i in Vals_indexes.keys()]

## Title lists the class names of the chosen colors; the x axis carries the
## name of the selected sNMF group.
layout= go.Layout(
    title= 'distribution of summed sizes of windows colored to: ' + ', '.join([dict_color[x] for x in color_choice]),
    xaxis= dict(
        title= pop_codes[snmf]
    )
)

fig = go.Figure(data=Vals_fig, layout= layout)
iplot(fig)

**Fig. 1 Average assignment by global structure classification** Total genomic assignment by class was summed for each subclass of the selected reference group.

## Analysis of classification structure.

This section focuses on identifying groups of individuals that might bear particular patterns of classification.

The user selects the classification variables (colors) to analyse. The analysis is performed on the individuals of the sNMF group selected above.

In [17]:
##################################################
### Share of each individual's assigned length held by selected colors
##################################################

## Colors whose proportion of the individual's total t_length is measured.
Abstain= ['red','blue','black','green']


Hellnames= []
Hell= []

for ID in list(set(snmf_subset.ID)):
    Hellnames.append(ID)

    ## All summary rows of this individual (every chromosome and color).
    ind_rows= Summary[(Summary.ID == ID)]
    ind_total= ind_rows.t_length.sum()
    ind_colors= ind_rows.color.values
    ind_lengths= ind_rows.t_length.values

    ## One proportion per color in Abstain, followed by their sum.
    shares= [ind_lengths[ind_colors == col].sum() / ind_total for col in Abstain]
    shares.append(sum(shares))
    Hell.append(shares)

## Rows follow Hellnames; the last column is the summed proportion.
Hell = np.array(Hell)
Min_guys= sorted(Hell[:,-1])

In [20]:
## Flag individuals by the summed proportion held in the last column of Hell:
## 1 when the value is at or below Min_threshold, 0 when it is above.
Min_threshold= .01
factor_below= [int(x <= Min_threshold) for x in Hell[:,Hell.shape[1]-1]]
## Row indices of Hell for each flag value that actually occurs.
factor_dict= {x:[z for z in range(len(factor_below)) if factor_below[z] == x] for x in list(set(factor_below))}

## Trace names indexed by flag value. Flag 1 means "value <= threshold", so it
## has to map to 'below threshold' (the labels were previously swapped).
factor_names= ['above threshold','below threshold']

## One 3D trace per flag value, using the first three columns of Hell.
## NOTE(review): coordinates are np.exp() of the proportions while the axes
## are titled with the bare color names — confirm the transform is intended.
Fig_dat= [go.Scatter3d(
        x = np.exp(Hell[factor_dict[i],0]),
        y = np.exp(Hell[factor_dict[i],1]),
        z = np.exp(Hell[factor_dict[i],2]),
        type='scatter3d',
        mode= "markers",
        text= [Hellnames[c] for c in factor_dict[i]],
        name= factor_names[i],
        marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
      "opacity": .8
      }
    ) for i in factor_dict.keys()]

## The scene is given as a plain dict so the cell does not depend on the
## `Scene` name provided by the star import.
layout= go.Layout(
    title= 'threshold= {} %'.format(Min_threshold * 100),
    scene= dict(
        xaxis= dict(
            title= Abstain[0]
        ),
        yaxis= dict(
            title= Abstain[1]
        ),
        zaxis= dict(
            title= Abstain[2]
        )
    )
)

fig = go.Figure(data=Fig_dat,layout= layout)
iplot(fig)

**Fig. 2 Analysis of percentage of assignment to class variables selected**

## Individual classification summary.

Explore classification summaries by group.

### i. Mean, max and minimum percentages.

In [12]:
### Individual sizes
### Total t_length per individual, used to scale the per-color totals.
INDS_size= Merged.groupby('ID')['t_length'].sum()

## New column: fraction of the individual's total t_length held by each color
## (group_colors is indexed by color and ID, INDS_size by ID).
group_colors['length_%']= group_colors['t_length'] / INDS_size

## Write the table to disk, creating the output directory when needed.
group_label= pop_codes[snmf]
filename= Home + group_label + '_summary.txt'
out_dir= os.path.dirname(filename)
os.makedirs(out_dir, exist_ok=True)
group_colors.to_csv(filename,sep= '\t')

In [17]:
# Per-color mean of every column of group_colors, taken across individuals.
mean_colors= group_colors.groupby('color').mean()

# Save next to the other summary tables, creating the directory if missing.
group_label= pop_codes[snmf]
filename= Home + group_label + '_summary_MEAN.txt'
out_dir= os.path.dirname(filename)
os.makedirs(out_dir, exist_ok=True)
mean_colors.to_csv(filename,sep= '\t')

mean_colors


Unnamed: 0_level_0,t_length,mean_size,N,length_%
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
black,2773260.0,50917.196927,588.852792,0.007438
blue,4344934.0,89568.476222,497.885787,0.011653
green,3378765.0,98686.416311,430.751269,0.009062
orange,85627500.0,115717.506755,8659.423858,0.22966
purple,43068130.0,99073.400834,5172.327411,0.115512
red,160105900.0,178564.648087,10851.380711,0.429417
silver,65608650.0,112996.748139,6963.086294,0.175968
yellow,7938056.0,81865.346975,895.228426,0.02129


In [18]:
# Per-color maximum of every column of group_colors, taken across individuals.
max_colors= group_colors.groupby('color').max()

# Save next to the other summary tables, creating the directory if missing.
group_label= pop_codes[snmf]
filename= Home + group_label + '_summary_MAX.txt'
out_dir= os.path.dirname(filename)
os.makedirs(out_dir, exist_ok=True)
max_colors.to_csv(filename,sep= '\t')

max_colors

Unnamed: 0_level_0,t_length,mean_size,N,length_%
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
black,26329395,156548.149026,5509,0.070617
blue,26645162,217950.024756,1528,0.071464
green,8751040,151966.122264,1015,0.023471
orange,96499809,131320.653656,9926,0.258818
purple,51824076,113114.992053,6700,0.138992
red,185608483,202523.496598,15167,0.497821
silver,80914685,118929.669873,8148,0.217013
yellow,41235655,229305.894878,3247,0.110597


In [19]:
# Per-color minimum of every column of group_colors, taken across individuals.
min_colors= group_colors.groupby('color').min()

# Save next to the other summary tables, creating the directory if missing.
group_label= pop_codes[snmf]
filename= Home + group_label + '_summary_MIN.txt'
out_dir= os.path.dirname(filename)
os.makedirs(out_dir, exist_ok=True)
min_colors.to_csv(filename,sep= '\t')

min_colors

Unnamed: 0_level_0,t_length,mean_size,N,length_%
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
black,131074,28100.270455,49,0.000352
blue,783029,38492.006693,85,0.0021
green,2150924,62811.984941,253,0.005769
orange,73250856,88650.325238,7504,0.196464
purple,32013044,81243.864143,4243,0.085861
red,130077612,113084.207742,9111,0.348877
silver,57998557,107066.199131,6348,0.155556
yellow,895367,34224.281834,258,0.002401


In [24]:
## Lowest coverage of assignment by color: the N individuals with the
## smallest length_% within every color.
N= 15

## group_keys=False stops the grouping key from being prepended to the result,
## whose index already carries the color level (it was shown twice before).
group_colors.groupby(['color'], group_keys=False)['length_%'].nsmallest(N)

color   color   ID            
black   black   IRIS_313-11668    0.000352
                IRIS_313-11746    0.000374
                B208              0.000455
                IRIS_313-11667    0.000494
                IRIS_313-11727    0.000517
                IRIS_313-11273    0.000546
                IRIS_313-11665    0.000609
                IRIS_313-11797    0.000708
                IRIS_313-11799    0.000734
                B074              0.000749
                IRIS_313-11752    0.000821
                IRIS_313-11745    0.000915
                IRIS_313-11733    0.000937
                IRIS_313-11622    0.000993
                IRIS_313-11795    0.000998
blue    blue    IRIS_313-11273    0.002100
                IRIS_313-11667    0.002785
                IRIS_313-11738    0.002803
                IRIS_313-8454     0.003053
                IRIS_313-8900     0.003068
                IRIS_313-10863    0.003084
                IRIS_313-11804    0.003125
                IRIS_31

### ii. Individual classification summary

Provides the classification summary of the individual with the given ID.

In [15]:
## Accession to summarise. Rows are taken from Merged, which only holds the
## accessions of the sNMF group selected through `snmf`.
ID= 'IRIS_313-11375'

Single= Merged[(Merged.ID==ID)]

## An ID outside the selected group leaves Single empty; report it instead of
## silently carrying an empty table forward.
if Single.empty:
    print('No rows found for ID {} in the selected sNMF group ({}).'.format(ID, snmf))


In [16]:
## Accession-table record of the individual held in `ID`.
order_core[(order_core.ID == ID)]

Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex
58,IRIS_313-11375,CODE_NO_31225,India,As2a,4,1,cB_(Bas),aro,2254
