# Modulos

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt
import os
# import the relevant modules

from sklearn.cluster import KMeans
from Bio import SeqIO
import copy
import math
import glob
import re


In [None]:
#notebook_path = os.path.abspath("./Python/variations.xls")


# Tratamento de Dados

Criar data frame e colocar primeira coluna como sample

In [None]:
raw_data = pd.read_excel(
    "../Python/variations.xls", sheet_name="variations")
    #"./projects/covid/variations.xls", sheet_name="variations")

raw_df = pd.DataFrame(raw_data)
raw_df.rename(columns={'Unnamed: 0': 'sample'}, inplace=True)
raw_df

preencher os valores NA da primeira coluna (sample) com os valores anteriores




In [None]:
raw_df['sample'].ffill(inplace=True)
raw_df
# test_df.tail(50)

Retirar valores NA das restantes colunas (Ref, Alt e #)

In [None]:
raw_df = raw_df.dropna(axis=0, how="any")
raw_df

#Como alternativa:
#data_no_na = test_df[['REF', 'ALT', '#']].apply(lambda x: pd.Series(x.dropna().values)) -> alternativa para remover os NAs


Usar `pivot` para colocar a coluna sample como index, as colunas REF e ALT passam para as primeiras 2 rows 

In [None]:
# final_data = raw_df.pivot(index="sample",
#                               columns=["REF", "ALT"], values=("#"))
# final_data.columns.name

final_df = (raw_df.set_index(['sample', 'REF', 'ALT'])
                ['#']
                .unstack(['REF', 'ALT'], fill_value=0)
                )
final_df

In [None]:
for column in final_df.columns[1:]:
	if sum(final_df[column]) < 10:
		final_df.drop([column], axis=1, inplace=True)

final_df

Guardar em excel



In [None]:
final_df.to_excel('final_df_new.xlsx')

# PCA analysis


* Centrar e fazer scaling aos dados
NOTA: Se as nossas amostras estivessem nas colunas, colocar `preprocessing.scale(final_data_2.T)` para fazer a transposiçao

In [None]:
scaled_data = preprocessing.scale(final_df) # Fazer scaling dos dados para que fique centrado (0,0) 
# Como alternativa podemos usar o seguinte código para centrar os dados:
# StandardScaler().fit_transform(final_data_2)

pca = PCA(n_components=2) #n_componentes=X onde X é o numero de PC que quermos colocar no spree plot
pca.fit(scaled_data)  # calcular loading scores e variaçao para cada PCA
pca_data = pca.transform(scaled_data)
pca_data

### Scree plot para ver quantas componentes devem estar presentes no plot final. Este plot é usado para determinar o numero de fatores/principal components para uma analise de PCA.

* Calcular % de variabilidade que cada PC tem

In [None]:
per_var = np.round(pca.explained_variance_ratio_*100, decimals=1)
per_var


* Criar labels para cada PC (PC1, PC2, PC3...) tendo em conta o tamanho da variabilidade

In [None]:
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
labels


* Fazer plot do spree plot

In [None]:
plt.bar(x=range(1, len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of explained variability')
plt.xlabel('Principal component')
plt.title('Scree plot')
plt.show()  # grande parte da variabilidade está presente nos primeiros 9 componentes (cut-off point de 2). Estes 9 PC podem fazer uma boa representaçao geral dos dados

5. Colocar as principais coordenadas (9) numa data frame onde os rows são os samples e as colunas tem a PC label


In [None]:

#Guardar as labels das amostras numa variavel unica de modo a colocar como indice na data frame dos PCs
sample_labels = final_df.index

pca_df = pd.DataFrame(pca_data, index=[sample_labels], columns=labels)
pca_df

In [None]:


plt.scatter(pca_df.PC1, pca_df.PC2, alpha=0.2)
plt.title("PCA graph")
plt.xlabel('PC1- {0}%'.format(per_var[0]))
plt.ylabel('PC2- {0}%'.format(per_var[1]))
plt.show()


* KMeans clustering para identificar cluster (extrair os nossos clusters)


In [None]:
kmeans = KMeans(n_clusters=2, random_state=0) #como estamos a olhar para 2 PC, o n_clusters vai ser 2. O metodo 

#Compute cluster centers and predict cluster indices
X_clustered = kmeans.fit_predict(pca_df)
X_clustered

In [None]:
color_map = {0 : 'blue',
                   1 : 'red'} #alterar consoante o numero de componentes a analisar

label_color = [color_map[i] for i in X_clustered]
plt.figure(figsize = (10,10))
plt.scatter(pca_df.PC1,pca_df.PC2, c= label_color, alpha=0.3)
plt.show()


# Separar 20k sequencias em 5 ficheiros com ~ 4k


In [None]:
ids = pd.read_excel(
    "../Python/samples_ID.xlsx")


In [None]:
#Read fasta files with seqIO

for i in SeqIO.parse("sequences.fasta", "fasta"):
	#print(i.id)
	#print(str(i.seq)[11288:11296]) #deleçoes
	print(str(i.seq)) #deleçoes
	#print(len(i))
	break

records = list(SeqIO.parse("sequences.fasta", "fasta"))
len(records)

In [None]:

#Estes valores de x e y tem que estar fora do for loop caso contrario o x iria fazer reset para 0 a cada iteraçao e o y tambem fazia reset para 4030 a cada iteraçao
x = 0; #começar no 1º elemento da lista de records
y = 4030; #ir até ao 4000º elemento da lista de records

for i in range(1,6):
	SeqIO.write(records[x:y], "sequence_{id}.fasta".format(id=i), "fasta")
	x = y; #atualizar o valor de X com o ultimo valor de Y usado
	y += 4000; #adicionar 4000 ao y


#Tambem podiamos fazer de forma manual:
# SeqIO.write(records[0:4001], "sequences_01.fasta", "fasta")
# SeqIO.write(records[4001:8001], "sequences_02.fasta", "fasta")
# SeqIO.write(records[8001:12001], "sequences_03.fasta", "fasta")
# SeqIO.write(records[12001:16001], "sequences_04.fasta", "fasta")
# SeqIO.write(records[16001:], "sequences_05.fasta", "fasta")


* Concatenar ficheiros csv numa data frame



In [None]:
path = r'C:\Users\Rafael\Desktop\main\University\BioinformaticaClinica\1Semestre\FEM\Projeto\FEM\Python'
all_files = glob.glob(path + "/*.csv")
df_from_each_file = (pd.read_csv(f,sep=";") for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index = True)

* Dataframe com dados que interessam (seqName, clade, substitutions, deletions insertions)

In [2]:

final_df=pd.read_csv("data_final.csv")
#final_df = concatenated_df.iloc[:,[0,1,13,14,15]] #selecionar colunas que interessam

final_df.to_csv(r'./data_final.csv', index= False, header=True)

pd.set_option('display.max_rows', 200) 


# Contar substituições, inserções e deleções

## Substituições


In [314]:
# Criar nova data frame para colocar linhas como seqName / Clade e colunas como C-T, etc

pca_df = final_df.iloc[:,[0,1]]
pca_df.head(10)
#A>T

Unnamed: 0,seqName,clade
0,Wuhan/Hu-1/2019,19A
1,Portugal/CV62/2020,20B
2,Portugal/CV63/2020,20A
3,Portugal/PT0001b/2020,20B
4,Portugal/PT0003/2020,20A
5,Portugal/PT0004/2020,20A
6,Portugal/PT0005/2020,20A
7,Portugal/PT0006a/2020,20A
8,Portugal/PT0006b/2020,20A
9,Portugal/PT0007/2020,20B


In [315]:

for line in range(1,len(final_df)): 

	substitutions = final_df.iloc[line,2] #substitutions vai dando reset a medida que vao atribuindo novos valores. /TODO: posso tentar passar isto para lista e fazer o loop por elemento de lista em vez de usar split(",")
	snv = [] #Criar nova lista por cada linha lida 

	#print(substitutions)
	#print(line)

	for content in substitutions.split(","):
		reference = content[0] #Obter o 1º character (a referencia)
		substitution = content[-1] #Obter o ultimo caracter (a substituiçao)
		output = "{0}>{1}".format(reference,substitution)
		pca_df
		snv.append(output)
		# C>T
		#print(snv)

	d = dict() #criar dicionario para fazer as contagens
	for key in snv:
		d[key] = d.get(key,0) + 1
	#print(d)

	for key,value in d.items():
		#print(key)
		#print(value)
		#if key not in pca_df.columns[:]:
		#	pca_df.from_dict(d)
		#print(line)
		#print(key)
		#print(value)
		pca_df.loc[line, key] = value #aqui usamos loc pois loc = label-based, ou seja, temos que especificar o nome das rows e colunas que queremos filtrar. iloc (i) - integer index-based, ou seja, temos que especificar as rows e colunas pelo index



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


## Deleções

* Para a deleçao temos que:
	* Fazer loop por cada linha da coluna 'deletions'
		* Se a linha for NaN, passar a frente
		* Caso contrario, fazer split da linha a partir do "," e guardar os resultados numa lista:
			* Ver a possibilidade de usar enumerate para guardar o indice das posiçoes (pos1 e pos2) das deleçoes, de modo a colocar tudo no mesmo sitio no final
			* Para cada lista, fazer split com "-" de modo a obter 2 posiçoes (onde ocorre a deleçao)

			* Tentar colocar um if statement que diz:
				* Se estas posiçoes contem ",", entao fazer split das virgulas. Podemos depois colocar cada um dos valores de forma individual numa lista 

	* Fazer um novo loop 
		* Passar no dicionario da sequencia fasta original de modo a ver onde houve deleçao, usando posiçao 1 e 2 calculada no loop inicial
		* 

In [316]:
record_dict_2 = SeqIO.to_dict(SeqIO.parse("sequences.fasta", "fasta")) #guardar a sequencia de referencia para obter as deleçoes

* Remover indices sozinhos (tamanho impar) e fazer o split dos "-" e ","
* Guardar a lista separada e indices/posiçoes de cada deleçao

In [317]:
lst_separated = []
populated_indices = []

for line_del in final_df['deletions']: #fazer o split das , e -

	if type(line_del) != str and math.isnan(line_del): #se o tipo da linha nao for string e for nan
		lst_separated.append(None)
	else:
		pos = re.split(',', line_del) #se for string, fazer split 
		lst_separated.append(pos)

# for element,pos_indices in zip(lst_separated, range(0,len(pca_df))): #se o tamanho for impar, remover o ultimo elemento da lista - so vamos ver as deleçoes que tem posiçao1:posiçao2
# 	# if type(element) != list and math.isnan(element): 
# 	# 	continue
# 	# elif len(element) %2 != 0: #se o tamanho das sublistas for impar, remover o ultimo elemento da sub-lista
# 	# 	element.pop()
		
# 	populated_indices.append(pos_indices) #guardar os indices em que ocorrem as deleçoes






In [318]:
outcome_del = []
outcome_del_list = []


In [319]:
dt_deletion = dict()
dt_deletions_total = dict()
for index in range(0,len(pca_df)):
	# i = 0
	# j = 1
	k = 0
	dt_temp = dict()
	if lst_separated[index] is None or len(lst_separated[index]) == 0: # se for uma lista vazia, ignorar e contnuar
		k += 1
		# positions.append(populated_indices[i])
		# var1 = int(populated_indices[i])
		# #print(populated_indices[i],var1, lst_separated[var1])
		# lst_separated.pop(var1)
		continue
	else:
		for data_ in lst_separated[index]:
			lst_data = data_.split("-")
			if len(lst_data) == 2:
				pos1 = int(lst_data[0])-1
				pos2 = int(lst_data[1])
			else:
				pos1 = int(lst_data[0])-1
				pos2 = int(lst_data[0])
			deletion = str(record_dict_2["Wuhan/Hu-1/2019"].seq)[pos1:pos2] #obter a deleçao na posiçao - 1
			deletion = deletion + ">del" #adicionar identificador de deleçao
			if deletion in dt_temp:
				dt_temp[deletion] += 1
			else:
				dt_temp[deletion] = 1
			if deletion in dt_deletions_total:
				dt_deletions_total[deletion] += 1
			else:
				dt_deletions_total[deletion] = 1
		dt_deletion[pca_df['seqName'][index]] = dt_temp

		# while k < len(lst_separated[populated_indices[index]])/2: #enquanto k for inferior ao tamanho da sub-lista aos pares (exemplo, se a sublist tiver 4 elementos, vamos buscar os pares do indice 0 e 1, depois do indice 2 e 3, etc -> por isso é que fazemos +2 no j e  i)
			# pos1 = int(lst_separated[populated_indices[index]][i]) #guardar posiçoes dos indices pares
			# pos2 = int(lst_separated[populated_indices[index]][j]) #guardar posiçoes dos indices impares
			# i += 2
			# j += 2
			# k += 1
			# #print(pos1,pos2)
			# deletion = str(record_dict_2["Wuhan/Hu-1/2019"].seq)[pos1-1:pos2] #obter a deleçao na posiçao - 1
			# deletion = deletion + ">del" #adicionar identificador de deleçao
			# outcome_del.append([populated_indices[index], deletion]) #guardar o index em que a deleçao se encontra na dataframe e guardar a deleçao
			# outcome_del_list.append(deletion)



* Criar colunas a partir das keys do dicionar com as deleçoes

In [320]:
for key in dt_deletion:
	for subkey,value in dt_deletion[key].items():
		if subkey not in pca_df.columns[:]: #len(subkey) < 14 and 
			pca_df[subkey] = None

		#pca_df.loc[line, key] = value


In [334]:
pca_df.set_index('seqName', inplace=True, drop = True)

In [None]:
#pca_df.set_index('seqName')

In [335]:
for sample in dt_deletion:
	for column,value in dt_deletion[sample].items():
		#print(str(sample),column,value)
		#print(pca_df.loc[0, 'ATG>del'])
		pca_df.loc[sample, column] = value

In [None]:
pca_df.set_index('seqName', inplace=True, drop = True)

In [None]:
# dt_count_deletion = {}
# for key in dt_deletions_total:
# 	if dt_deletions_total[key] > 50:
# 		dt_count_deletion[key] = 0
	
# dt_to_df = {}
# for key in dt_deletion:
# 	dt_temp = {}
# 	for key_count in dt_count_deletion:

# 		if (key_count in dt_deletion[key]): dt_temp[key_count] = dt_deletion[key][key_count]
# 		else: dt_temp[key_count] = 0
# 	dt_to_df[key] = dt_temp

# # print(dt_to_df)
# for line in range(1, len(pca_df)):
# 	seq_name = pca_df['seqName'][line]
# 	for key_count in dt_count_deletion:
# 		if seq_name in dt_to_df:
# 			pca_df.loc[line, key_count] = dt_to_df[seq_name][key_count]
# 		else: pca_df.loc[line, key_count] = 0
		
# pca_df.head()
#pca_df.loc[line, key] = value #aqui usamos loc pois loc = label-based, ou seja, temos que especificar o nome das rows e colunas que queremos filtrar. iloc (i) - integer index-based, ou seja, temos que especificar as rows e colunas pelo index


* Obter os elementos unicos para popular as colunas

In [76]:

def unique(input_list):
	return list(set(input_list))

columns_del = unique(outcome_del_list)



Remover todas as colunas com tamanho superior a 16 carateres

In [None]:

for element, index in zip(columns_del, range(0,len(columns_del))): #Remover os elementos unicos da lista que tem tamanho superior a 16 
	if len(element) > 16:
		columns_del.pop(index)
		continue

columns_del

* Este codigo ja retira todos os elementos da lista outcome_del (com o indice e a deleçao) com tamanho superior a 16

In [None]:
# for element, index in zip(outcome_del, range(0,len(outcome_del))):
# 	if len(element[1]) > 16:
# 		outcome_del.pop(index)
# 		continue

# print(outcome_del)

* Populate the columns from pca_df with the unique values (columns_del)

In [91]:
# for column in columns_del:
# 	pca_df[column] = np.empty((len(pca_df),0)).fill(np.nan)

* Create separated lists to save indices and deletions to use in zip()

In [237]:
#pca_df.to_csv(r'./test.csv', index= False, header=True)


## Inserções

In [None]:
lst_separated_insert = []

for line_insert in final_df['insertions']:

	if type(line_insert) != str and math.isnan(line_insert):
		lst_separated_insert.append(None)
	else:
		pos = re.split(',',line_insert)
		lst_separated_insert.append(pos)


Split de , e :

In [324]:
dt_insertion = dict()
lst_insert = []

for line,index in zip(final_df['insertions'], range(len(pca_df))):
	if type(line) is float:
		continue
	else:
		pos = re.split(",|:",line)
		lst_insert.append(pos)

# for element in lst_insert:
# 	for index in range(len(lst_insert)):
# 		print(index)
# 		if not element[index][2].isdigit():
# 			print(element)
		

Remover posiçoes  e deixar apenas as inserçoes:

In [None]:
for sublist in lst_insert:
	print(sublist)
	length = range(len(sublist))
	for index,element in enumerate(sublist):
		if element.isdigit():
			sublist.remove(element)

Obter os indices das samples:

In [328]:
dt_insertion = {}
sample_indices = []
for index,sample in enumerate(final_df['insertions']):
	if type(sample) != float:
		sample_indices.append(index)

Criar dicionario:

In [330]:
dt_insertion = dict()
for samp_indices, index in zip(sample_indices, range(len(pca_df))):
	dt_temp = dict()
	for element in lst_insert[index]:
		#pos = re.split(',',element)
		
		# if not element.isdigit():
		# 	lst_insert.append(pos)

		if element in dt_temp:
			dt_temp[element + ">ins"] += 1
		else:
			dt_temp[element + ">ins"] = 1

	dt_insertion[final_df['seqName'][samp_indices]] = dt_temp


In [332]:
for key in dt_insertion:
	for subkey,value in dt_insertion[key].items():
		if subkey not in pca_df.columns[:]: #len(subkey) < 14 and 
			pca_df[subkey] = None


In [337]:
for sample in dt_insertion:
	for column,value in dt_insertion[sample].items():
		#print(str(sample),column,value)
		#print(pca_df.loc[0, 'ATG>del'])
		pca_df.loc[sample, column] = value

In [338]:
pca_df

Unnamed: 0_level_0,clade,C>T,A>G,G>A,G>C,T>C,C>A,G>T,A>T,T>A,...,GAA>ins,AAAGCATACAATAACTCGTCTATCT>ins,ACT>ins,TTTC>ins,GAGCTGTGCGAC>ins,CCT>ins,CCC>ins,ATATT>ins,CCCCCC>ins,CAA>ins
seqName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wuhan/Hu-1/2019,19A,,,,,,,,,,...,,,,,,,,,,
Portugal/CV62/2020,20B,4.0,1.0,2.0,1.0,,,,,,...,,,,,,,,,,
Portugal/CV63/2020,20A,4.0,1.0,,,,,,,,...,,,,,,,,,,
Portugal/PT0001b/2020,20B,4.0,1.0,2.0,1.0,,,,,,...,,,,,,,,,,
Portugal/PT0003/2020,20A,4.0,1.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Portugal/PT20090/2021,21J (Delta),16.0,4.0,2.0,,2.0,2.0,9.0,1.0,,...,,,,,,,,,,
Portugal/PT20091/2021,21J (Delta),21.0,7.0,3.0,,2.0,2.0,9.0,,,...,,,,,,,,,,
Portugal/PT20092/2021,21J (Delta),17.0,5.0,3.0,1.0,2.0,1.0,10.0,,,...,,,,,,,,,,
Portugal/PT20093/2021,21J (Delta),21.0,7.0,3.0,,2.0,2.0,10.0,,,...,,,,,,,,,,


In [10]:
pca_df.to_csv(r'./snv_data.csv', header = True)

NameError: name 'pca_df' is not defined

In [55]:
snv_data = pd.read_csv("snv_data.csv")
snv_data


Unnamed: 0,seqName,clade,C>T,A>G,G>A,G>C,T>C,C>A,G>T,A>T,...,GAA>ins,AAAGCATACAATAACTCGTCTATCT>ins,ACT>ins,TTTC>ins,GAGCTGTGCGAC>ins,CCT>ins,CCC>ins,ATATT>ins,CCCCCC>ins,CAA>ins
0,Wuhan/Hu-1/2019,19A,,,,,,,,,...,,,,,,,,,,
1,Portugal/CV62/2020,20B,4.0,1.0,2.0,1.0,,,,,...,,,,,,,,,,
2,Portugal/CV63/2020,20A,4.0,1.0,,,,,,,...,,,,,,,,,,
3,Portugal/PT0001b/2020,20B,4.0,1.0,2.0,1.0,,,,,...,,,,,,,,,,
4,Portugal/PT0003/2020,20A,4.0,1.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20105,Portugal/PT20090/2021,21J (Delta),16.0,4.0,2.0,,2.0,2.0,9.0,1.0,...,,,,,,,,,,
20106,Portugal/PT20091/2021,21J (Delta),21.0,7.0,3.0,,2.0,2.0,9.0,,...,,,,,,,,,,
20107,Portugal/PT20092/2021,21J (Delta),17.0,5.0,3.0,1.0,2.0,1.0,10.0,,...,,,,,,,,,,
20108,Portugal/PT20093/2021,21J (Delta),21.0,7.0,3.0,,2.0,2.0,10.0,,...,,,,,,,,,,


In [56]:
for column in snv_data:
	if len(column) > 10:
		snv_data.drop([column], axis=1, inplace=True)

In [57]:
snv_data

Unnamed: 0,seqName,clade,C>T,A>G,G>A,G>C,T>C,C>A,G>T,A>T,...,TA>ins,AGATCT>ins,GAA>ins,ACT>ins,TTTC>ins,CCT>ins,CCC>ins,ATATT>ins,CCCCCC>ins,CAA>ins
0,Wuhan/Hu-1/2019,19A,,,,,,,,,...,,,,,,,,,,
1,Portugal/CV62/2020,20B,4.0,1.0,2.0,1.0,,,,,...,,,,,,,,,,
2,Portugal/CV63/2020,20A,4.0,1.0,,,,,,,...,,,,,,,,,,
3,Portugal/PT0001b/2020,20B,4.0,1.0,2.0,1.0,,,,,...,,,,,,,,,,
4,Portugal/PT0003/2020,20A,4.0,1.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20105,Portugal/PT20090/2021,21J (Delta),16.0,4.0,2.0,,2.0,2.0,9.0,1.0,...,,,,,,,,,,
20106,Portugal/PT20091/2021,21J (Delta),21.0,7.0,3.0,,2.0,2.0,9.0,,...,,,,,,,,,,
20107,Portugal/PT20092/2021,21J (Delta),17.0,5.0,3.0,1.0,2.0,1.0,10.0,,...,,,,,,,,,,
20108,Portugal/PT20093/2021,21J (Delta),21.0,7.0,3.0,,2.0,2.0,10.0,,...,,,,,,,,,,


In [52]:
snv_data.to_csv(r'./snv_data_10.csv', header = True)