# Proyecto 1

In [None]:
#!pip install scikit-learn

In [None]:
# importar pandas numpy sklearn
import pandas as pd
import numpy as np

El siguiente trabajo esta basado en el artículo Miller, C.A., Settle, S.H., Sulman, E.P. et al. Discovering functional modules by identifying recurrent and mutually exclusive mutational patterns in tumors. BMC Med Genomics 4, 34 (2011). https://doi.org/10.1186/1755-8794-4-34

En el cual desarrollaron un método para detectar automáticamente módulos funcionales en tumores basandose únicamente en patrones de aberración genómica recurrente. 

Resulta interesante debido a que los proyectos de caracterización de tumores están empezando a producir un gran volumen de datos sobre aberraciones genómicas, epigenómicas y de expresión génica en muestras tumorales. Este volumen de información sin precedentes tiene el potencial de transformar nuestra comprensión de la biología del cáncer, revelar nuevos biomarcadores y dianas farmacológicas y acelerar el desarrollo de nuevas terapias contra el cáncer.


En el artículo describe la métodología para obtener la matriz de mutación de la colección CGA de 145 muestras de glioblastoma. Para esto usaron en R el paquete DNAcopy, descargarmos algunos ejemplares en la carpeta genomica del repositorio, e intentamos ver los datos en analysis.r sin mucho exito.

Afortudamente el artículo nos comparte la matriz de mutación.

In [None]:
link_database = ' http://brl.bcm.tmc.edu/rme/gbm.dat'

# Descarga el archivo de la base de datos
import urllib.request
urllib.request.urlretrieve(link_database, 'gbm.dat')

In [None]:
# Lee el archivo con pandas
data = pd.read_csv('gbm.dat', sep='\t')
columns = data.columns
data.describe()

In [None]:
data.head()

In [None]:
data

# Constructing a gene network with Winnow

El primer paso en nuestro proceso de detección de módulos fue filtrar la matriz de mutaciones y retener sólo los genes que cumplen una frecuencia de recurrencia establecida, ya que los genes alterados en sólo una o unas pocas muestras no contienen suficiente información para calcular puntuaciones de exclusividad significativas.

In [None]:
filtered_data = data.loc[:, (data != 0).sum() > 6]
filtered_data

In [None]:
# Diferencia entre los datos originales y los datos filtrados
print(data.shape)
print(filtered_data.shape)

In [38]:
# Calculate the exclusivity score between each pair of genes, defined as the number of samples where exactly one of the pair is mutated divided by the number of samples where at least one of the pair is mutated.

# Diccionario sample -> genes mutados
sample_genes = {}

num_exclusive_samples = 0
num_samples = 0

# Recorre las muestras y muestra los genes que estan mutados
for index, row in filtered_data.iterrows():
    num_samples = len(row[row != 0])
    sample_genes[index] = row[row != 0]
    #gen = row[0], sample = row[1:]
    #print(sample_genes[index])
    print('Mutated gene:', row[row != 0]['Unnamed: 0'])
    print("Samples with at least one mutation:", num_samples)
    if num_samples == 1:
        num_exclusive_samples += 1

Mutated gene: ZNF708
Samples with at least one mutation: 7
Mutated gene: XPOT
Samples with at least one mutation: 2
Mutated gene: RNF38
Samples with at least one mutation: 2
Mutated gene: CCT2
Samples with at least one mutation: 2
Mutated gene: INSR
Samples with at least one mutation: 1
Mutated gene: ESR2
Samples with at least one mutation: 2
Mutated gene: ATP1A2
Samples with at least one mutation: 2
Mutated gene: DGKD
Samples with at least one mutation: 2
Mutated gene: chr6:168107192-168276703
Samples with at least one mutation: 16
Mutated gene: GYPA
Samples with at least one mutation: 3
Mutated gene: MN1
Samples with at least one mutation: 12
Mutated gene: RADIL
Samples with at least one mutation: 3
Mutated gene: TRIM24
Samples with at least one mutation: 2
Mutated gene: GYPB
Samples with at least one mutation: 2
Mutated gene: PHIP
Samples with at least one mutation: 2
Mutated gene: chr6:164016427-164050128
Samples with at least one mutation: 22
Mutated gene: USF1
Samples with at lea

In [36]:
for sample, genes in sample_genes.items():
    print('Gene:', genes['Unnamed: 0'])
    print(sample_genes[sample])

Gene: ZNF708
Unnamed: 0      ZNF708
TCGA-02-0055         1
TCGA-06-0143         1
TCGA-06-0159         1
TCGA-06-0169         1
TCGA-06-0173         1
TCGA-06-0190         1
Name: 0, dtype: object
Gene: XPOT
Unnamed: 0      XPOT
TCGA-06-0176       1
Name: 1, dtype: object
Gene: RNF38
Unnamed: 0      RNF38
TCGA-06-0138        1
Name: 2, dtype: object
Gene: CCT2
Unnamed: 0      CCT2
TCGA-02-0099       1
Name: 3, dtype: object
Gene: INSR
Unnamed: 0    INSR
Name: 4, dtype: object
Gene: ESR2
Unnamed: 0      ESR2
TCGA-02-0011       1
Name: 5, dtype: object
Gene: ATP1A2
Unnamed: 0      ATP1A2
TCGA-02-0083         1
Name: 6, dtype: object
Gene: DGKD
Unnamed: 0      DGKD
TCGA-02-0114       1
Name: 7, dtype: object
Gene: chr6:168107192-168276703
Unnamed: 0      chr6:168107192-168276703
TCGA-02-0003                           1
TCGA-02-0007                           1
TCGA-02-0015                           1
TCGA-02-0027                           1
TCGA-02-0058                           1
TCGA-02-

In [None]:
# number of samples where at least one of the pair is mutated
num_samples = len(sample_genes)
print('Number of samples:', num_samples)

# number of samples where exactly one of the pair is mutated
print('Number of exclusive samples:', num_exclusive_samples)

In [None]:

# Calculate the exclusivity score between each pair of genes
exclusivity_scores = np.zeros((len(filtered_data.columns), len(filtered_data.columns)))
for i, gene1 in enumerate(filtered_data.columns):
    for j, gene2 in enumerate(filtered_data.columns):
        print('Gene 1:', gene1)
        if i == j:
            continue
        num_samples_where_both_mutated = 0
        num_samples_where_one_mutated = 0
        for sample, genes in sample_genes.items():
            if gene1 in genes and gene2 in genes:
                num_samples_where_both_mutated += 1
            if gene1 in genes or gene2 in genes:
                num_samples_where_one_mutated += 1
        exclusivity_scores[i, j] = num_samples_where_both_mutated / num_samples_where_one_mutated


In [None]:
exclusivity_scores

These data could be used to create a network where each node is a gene and each edge weight is the exclusivity between the genes.
The highly connected sub-networks would then be used as a starting point for a focused combinatorial search for modules. The disadvantage of this approach is that the networks quickly becomes much too large and densely connected to effectively identify sub-network

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# These data could be used to create a network where each node is a gene and each edge weight is the exclusivity between the genes.
#The highly connected sub-networks would then be used as a starting point for a focused combinatorial search for modules. The disadvantage of this approach is that the networks quickly becomes much too large and densely connected to effectively identify sub-network

def network_from_exclusivity_scores(exclusivity_scores, genes):
    G = nx.Graph()
    for i, gene1 in enumerate(genes):
        for j, gene2 in enumerate(genes):
            if i == j:
                continue
            G.add_edge(gene1, gene2, weight=exclusivity_scores[i, j])

    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    # Labels en los nodos
    nx.draw_networkx_labels(G, pos)
    plt.show()

network_from_exclusivity_scores(exclusivity_scores, filtered_data.columns[1:10])

These data could be used to create a network where each node is a gene and
each edge weight is the exclusivity between the genes.
The highly connected sub-networks would then be used
as a starting point for a focused combinatorial search for
modules. The disadvantage of this approach is that the
networks quickly becomes much too large and densely
connected to effectively identify sub-networks

Thus, we used an online-learning linear threshold algorithm called Winnow to detect signals of exclusivity against the noisy background of passenger mutations in many irrelevant genes

The Winnow algorithm is a linear threshold algorithm that is used for online learning. It is a variant of the Perceptron algorithm that is designed to work with binary features. The algorithm is based on the idea of a “window” of features that are used to make a prediction. The window is updated at each iteration of the algorithm, and the weights of the features are updated based on the prediction made by the algorithm.


The Winnow algorithm was run in an online setting, using one gene as a classifier and the rest of the mutation array as training data.