In [None]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA

import pandas as pd

from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


# Palette of named colours for the PCA plots; a colour is picked by using the
# cluster label as an index into this list.
# NOTE(review): 'darkorange' and 'darkseagreen' each appear twice, and the
# gray/grey spellings name the same colour, so distinct clusters can end up
# sharing a colour.
PCA_color_ref = [
    'darkseagreen',
    'crimson',
    'darkorange',
    'darkblue',
    'darkcyan',
    'darkgoldenrod',
    'darkgray',
    'darkgrey',
    'darkgreen',
    'darkkhaki',
    'darkmagenta',
    'darkolivegreen',
    'darkorange',
    'darkorchid',
    'darkred',
    'darksalmon',
    'darkseagreen',
    'darkslateblue',
    'darkslategray',
    'darkslategrey',
    'darkturquoise',
    'darkviolet',
    'deeppink',
]

## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

It first analyses structure across the whole data set, then studies variation across local genomic windows in more detail.

### Input

In [1]:
from structure_tools.vcf_geno_tools import simple_read_vcf
from structure_tools.vcf_geno_tools import read_geno_nanum

# VCF file to analyse (the commented line is an alternative input).
vcf_file = 'D:/GitHub/Tools_and_toys/VCF_analysis/Extract/vcf/Extract_Chr8_15000.vcf'
# vcf_file = 'D:/GitHub/Tools_and_toys/VCF_analysis/Simu_17-03-2019/data.vcf'

# Options forwarded unchanged to read_geno_nanum.
row_info = 6
header_info = 9
phased = False

# Read the file: genotype matrix, per-marker summary table and sample names.
genotype, summary, Names = read_geno_nanum(
    vcf_file,
    row_info=row_info,
    header_info=header_info,
    phased=phased,
)

# Columns of the genotype matrix are markers, rows are individuals.
print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

# One-shot switches: each is set back to 0 by the later cell that consumes it,
# so re-running that cell does not filter the data a second time.
control_subset = 1
clean = 1
subbed = 1

{'fileformat': 'VCFv4.2', 'fileDate': '20190327', 'source': 'PLINKv1.90', 'contig': '<ID8,length28422468>', 'INFO': '<IDPR,Number0,TypeFlag,Description"Provisional reference allele, may not be based on real reference genome">', 'FORMAT': '<IDGT,Number1,TypeString,Description"Genotype">'}
Number of markers: 15000
Number of individuals: 3023


In [2]:
## Load the passport information table (tab-separated text file).

Input_file = 'D:/Rice/Project_external/metadata/orderCore_INFO.txt'

RG_info = pd.read_csv(
    Input_file,
    sep='\t',
)

# Show the first rows as the cell output.
RG_info.head()

NameError: name 'pd' is not defined

In [None]:
## Preview the first rows of `summary`, the second value returned by read_geno_nanum.
summary.head()

In [None]:
## Shorten the sample names read from the VCF.
## Instance-specific processing, needed because of the ID copy in the VCF file.
def _trim_vcf_name(raw_name):
    """Return the leading part of an underscore-separated sample name.

    Names with more than two '_'-separated fields keep their first two
    fields (re-joined with '_'); every other name keeps only its first field.
    """
    fields = raw_name.split('_')
    if len(fields) > 2:
        return '_'.join(fields[:2])
    return fields[0]


Names_vcf = [_trim_vcf_name(raw_name) for raw_name in Names]


In [None]:
### subset core
core_subset= True

coreID_file= 'D:/Rice/Project_external/metadata/Order_core.txt'

## One accession ID per line; surrounding whitespace / newlines are dropped.
with open(coreID_file,'r') as fp:
    coreIDs= [line.strip() for line in fp]

## Position of the first occurrence of every name in the VCF sample list.
## setdefault keeps the first position, matching what list.index returned,
## while replacing the repeated linear scans with one pass over the names.
vcf_position= {}
for position, name in enumerate(Names_vcf):
    vcf_position.setdefault(name, position)

## Row indices (into the genotype matrix) of the core accessions found in the
## VCF, in the order in which they are listed in the core file.
core_idx= [vcf_position[x] for x in coreIDs if x in vcf_position]

## Accession ID -> initial sub-population. The two columns are paired
## positionally, so this no longer relies on RG_info carrying a default
## 0..n-1 index (which the previous label-based RG_info['ID'][x] lookup needed).
ID_pop= dict(zip(RG_info['ID'], RG_info['Initial_subpop']))

core_names= [Names_vcf[x] for x in core_idx]
## Raises KeyError if a core accession has no entry in the passport table.
core_pop= [ID_pop[x] for x in core_names]


In [None]:

# Keep only the rows (individuals) of the core accessions. The switch is
# cleared afterwards so that running this cell again leaves genotype untouched.
if control_subset:
    genotype = genotype[core_idx]
    control_subset = 0
    

In [None]:
from impute_tools.genome_adapt import clean_geno

# Sub-populations to retain; an empty list skips the filter.
sub_sel = ['subtrop', 'trop', 'temp', 'aro']
# sub_sel = ['ind1A', 'temp', 'aro', 'aus']

# Sub-population filter, applied at most once (`subbed` is cleared afterwards).
if sub_sel and subbed:
    sub_idx = [i for i, pop in enumerate(core_pop) if pop in sub_sel]
    genotype = genotype[sub_idx]
    core_pop = [core_pop[i] for i in sub_idx]
    core_names = [core_names[i] for i in sub_idx]
    subbed = 0

# Marker filter, applied at most once (`clean` is cleared afterwards).
# clean_geno returns the column positions to keep; the same positions are
# used to subset the rows of the summary table.
if clean:
    keep_pos = clean_geno(genotype, nan_char=9, het_char=1)

    genotype = genotype[:, keep_pos]
    summary = summary.iloc[keep_pos].reset_index()
    clean = 0
    



In [None]:
## Preview `summary` again; by this point the cleaning cell above may have subset and re-indexed it.
summary.head()

### Global variation

Perform PCA across the data set.

Apply MeanShift clustering to the PCA coordinates to try to extract genetically coherent groups of accessions.

These groups will later be used for supervised analysis.

In [None]:
## Perform PCA
n_comp= 4
## The randomized SVD solver is stochastic; fixing random_state makes the
## projection (and the clustering computed from it) identical on every
## Restart & Run All.
pca_global = PCA(n_components=n_comp, whiten=False, svd_solver='randomized', random_state=0)

## feats: one row per individual, one column per principal component.
feats= pca_global.fit_transform(genotype)

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth

from structure_tools.Tutorial_subplots import plot_global_pca
## Perform MeanShift clustering on the PCA coordinates.
bandwidth = estimate_bandwidth(feats, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(feats)
labels1 = ms.labels_

## Cluster label -> indices of the samples assigned to it (labels ascending).
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(set(labels1))}

## One identifier per sample: position, cluster label and sub-population.
idvector= ['i{}_pop{}_{}'.format(x,labels1[x],core_pop[x]) for x in range(len(labels1))]

###
## Cluster label -> sub-population of every member sample.
label_pops= {z: [core_pop[x] for x in g] for z,g in label_select.items()}
## Cluster label -> sub-population of one member drawn at random.
label_connect= {z: g[np.random.randint(0,len(g))] for z,g in label_pops.items()}
## Cluster label -> colour. The index wraps around the palette so that a run
## producing more clusters than there are colours re-uses colours instead of
## raising IndexError.
colordict= {z: PCA_color_ref[z % len(PCA_color_ref)] for z in label_pops.keys()}


In [None]:
### Plot the global PCA coordinates using the cluster membership and colours computed above.
plot_global_pca(
    feats,
    label_select,
    colordict,
    labels=core_pop,
    title='global_pca',
    height=500,
    width=950,
)

In [None]:
## Dimensions of the genotype matrix at this point: (individuals, markers).
genotype.shape

#########################################    ############################################################################

# Heterozygotes

In [None]:
## Heterozygote code in the genotype matrix (the value handed to clean_geno
## as het_char). The name was previously used here without ever being
## defined, which raised NameError.
het_char= 1

## (row, column) = (individual, marker) coordinates of every heterozygous
## call, one pair per line.
tfoc= np.argwhere(genotype == het_char)

## Pick one heterozygous call at random.
## np.random.randint raises ValueError when no heterozygous call is left.
select_idx= np.random.randint(0,tfoc.shape[0],1)[0]
tf= tfoc[select_idx]

