In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import plotly.express as px
import time

In [None]:
# Cancer sample set.
# The file is read as tab-separated text; header=None keeps the file's first
# line as an ordinary data row instead of promoting it to column names.

df = pd.read_csv(
    "data_mrna_seq_v2_rsem.txt",
    delimiter="\t",
    header=None,
)
df

In [None]:
# Keep only the gene-expression values.
# NOTE: `df` is reassigned to its own transpose here, so re-running this cell
# flips the frame back to its previous orientation.

df = df.T

# Remove the rows labelled 0 and 1 and the column labelled 0 of the transposed
# frame (presumably the identifier rows and the sample-id column — confirm
# against the input file).
new = df.drop(index=[0, 1]).drop(columns=0)

new

In [None]:
# Normal sample set.
# Read with the same options as the cancer file: tab-separated, and with the
# first line kept as a data row (header=None).

clas = pd.read_csv(
    "data_mrna_seq_v2_rsem_normal_samples.txt",
    delimiter="\t",
    header=None,
)
clas

In [None]:
# Keep only the gene-expression values of the normal samples.
# The frame is transposed, then the rows labelled 0 and 1 and the column
# labelled 0 are removed (presumably identifier rows / sample-id column —
# confirm against the input file).
# NOTE: `clas` is overwritten, so this cell is not safe to run twice.

clas = clas.T.drop(index=[0, 1]).drop(columns=0)

clas

In [None]:
# Build the list of 'Câncer' class labels and attach it to the cancer set as
# the "Classe" column.
# The list length comes from the frame itself rather than a hard-coded row
# count (previously 1082), so the cell keeps working if the input file changes.

arr0 = ['Câncer'] * len(new)
new["Classe"] = arr0

new

In [None]:
# Build the list of 'Normal' class labels and attach it to the normal set as
# the "Classe" column.
# The list length comes from the frame itself rather than a hard-coded row
# count (previously 114), so the cell keeps working if the input file changes.

arr = ['Normal'] * len(clas)
clas["Classe"] = arr

clas

In [None]:
# Stack the cancer and normal sets into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same as new.append(clas, ignore_index=True).

final = pd.concat([new, clas], ignore_index=True)
final

In [None]:
# Balance the two classes with SMOTE oversampling.
# X receives the resampled features and Y the resampled "Classe" labels.
# The `n_jobs` argument is no longer passed: it is deprecated in recent
# imbalanced-learn releases and only affected parallelism, not the result,
# which is fixed by random_state.

smote_over_sampling = SMOTE(random_state=50)
X, Y = smote_over_sampling.fit_resample(
    final.drop(columns='Classe'),
    final['Classe'],
)

In [None]:
# Rebuild a single frame from the resampled features and labels, shuffle its
# rows and save it.

dataframe = pd.DataFrame(X)
dataframe["Classe"] = Y

# The shuffle is seeded so the saved file is reproducible. A later cell splits
# train/test with shuffle=False, so the row order produced here decides which
# rows end up in each split.
dataframe = dataframe.sample(frac=1, random_state=50).reset_index(drop=True)

# Save the new set. The index is written as well (the reloaded frame shows it
# as an "Unnamed: 0" column); this layout is kept unchanged here because later
# cells appear to rely on column positions of the reloaded file — confirm
# before switching to index=False.
dataframe.to_csv('Conjunto.csv')

dataframe

In [None]:
# From this point on, only the new dataset saved above ('Conjunto.csv') is used,
# so the previous cells do not need to be run again.

In [2]:
# Load the balanced, shuffled dataset that an earlier cell saved to 'Conjunto.csv'.
# The file was written together with its index, so the frame comes back with an
# extra "Unnamed: 0" column in front of the features.

dataframe = pd.read_csv('Conjunto.csv')

In [3]:
# Preview the reloaded dataset (pandas abbreviates the display of large frames).
dataframe

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,20523,20524,20525,20526,20527,20528,20529,20530,20531,Classe
0,0.0,18.015500,9.163100,49.180300,785.591000,0.0,231.665000,0.431400,0.000000,0.0,...,251.510000,101.812000,891.717000,1548.320000,36.238100,848.145000,3399.050000,1373.600000,1309.750000,Câncer
1,0.0,13.172000,12.363300,78.340200,832.025000,0.0,133.528000,0.000000,0.000000,0.0,...,354.302000,77.137900,722.437000,1176.750000,97.885400,983.110000,2840.270000,1507.120000,990.025000,Câncer
2,0.0,5.441800,9.231300,124.514000,880.702000,0.0,512.281000,0.000000,0.000000,0.0,...,891.547000,47.527900,456.459000,2027.430000,22.647500,728.230000,5980.540000,1115.150000,958.533000,Câncer
3,0.0,6.967840,3.346473,104.967370,761.844074,0.0,164.762599,0.150499,0.000000,0.0,...,71.107733,81.102161,447.442344,1077.573032,8.442580,1711.453702,2875.606321,1734.778012,957.395697,Normal
4,0.0,12.874000,7.971200,87.041400,968.053000,0.0,148.001000,0.416900,0.000000,0.0,...,669.132000,86.299300,636.613000,1211.110000,251.810000,958.047000,5257.170000,1888.990000,821.302000,Câncer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2159,0.0,12.754516,10.281505,81.894691,529.664091,0.0,371.691441,0.957970,0.000000,0.0,...,347.783741,82.694462,474.761497,1528.342025,63.660482,1413.768351,4411.055994,1784.497846,1594.255213,Normal
2160,0.0,2.527500,0.821173,97.207360,876.559365,0.0,253.230452,0.000000,0.000000,0.0,...,209.110564,49.724603,275.317720,928.458963,11.786694,813.979671,10705.931284,1306.242605,580.589242,Normal
2161,0.0,6.804516,10.033889,94.850326,633.156715,0.0,236.292938,0.602854,0.185377,0.0,...,265.684648,74.259252,527.958289,1458.762104,40.684530,1071.526651,5327.770902,1854.706079,1271.171213,Normal
2162,0.0,0.000000,3.157700,86.445600,1137.560000,0.0,364.318000,0.000000,0.000000,0.0,...,433.787000,35.918700,383.264000,864.812000,37.892200,678.113000,5364.120000,600.355000,955.595000,Câncer


In [None]:
# t-SNE projection of the dataset to two dimensions, used to visualise the set
# before and after SMOTE. It was left out of the article because it is not the
# main focus.
#
# Besides the label column, the "Unnamed: 0" column is dropped as well: it is
# the row index saved with Conjunto.csv, not a gene, and must not take part in
# the distance computation. errors="ignore" keeps the cell working when that
# column is absent.

tsne = TSNE(n_components=2, random_state=0)
tsne_features = dataframe.drop(columns=["Classe", "Unnamed: 0"], errors="ignore")
projections = tsne.fit_transform(tsne_features)

In [None]:
# Scatter plot of the two t-SNE components, coloured by class.
# `projections` is indexed positionally: component 0 on x, component 1 on y.

fig = px.scatter(
    projections,
    x=0,
    y=1,
    color=dataframe["Classe"],
    labels={"color": "Classe"},
    category_orders={"color": ["Normal", "Câncer"]},
    width=550,
    height=350,
)
fig.show()

In [None]:
# Keep the class labels in their own variable so they can be attached again
# after the feature columns are replaced by the selected ones.

y = dataframe["Classe"]

In [None]:
# Turn the genes into column names (they only appear in the first column of the
# original file).
# This was only applied with 10 genes, and the original set has to be read
# again before running it.
# NOTE(review): the cell transposes whatever `df` currently holds, so its result
# depends on whether `df` is the freshly read file or the frame already
# transposed by an earlier cell — confirm `df` was re-read first.

df = df.T

# The first row of the transposed frame becomes the header; the remaining rows
# are kept as data.
new_header = df.iloc[0]
df = df.iloc[1:]
df.columns = new_header

In [None]:
# Feature selection with the ANOVA F-test (f_classif).
# NOTE(review): every column except "Classe" is offered to the selector, which
# includes the "Unnamed: 0" index column of the reloaded file — confirm that is
# intended.

X_new = SelectKBest(score_func=f_classif, k=1000)  # k = number of features to keep

nov = X_new.fit_transform(
    dataframe.drop(columns='Classe'),
    dataframe['Classe'],
)

# Positional indices of the selected features, used to look up which genes
# were selected.
liste = X_new.get_support(indices=True)

In [None]:
# Truncate the k selected features to integers so they can be fed to the
# Embedding layer of the RNN models.
# The cast is an explicit astype(int): passing dtype=int to the DataFrame
# constructor with non-integral floats behaves differently across pandas
# versions (older versions truncate, newer ones keep the float dtype).

dataframe = pd.DataFrame(nov).astype(int)
dataframe

In [None]:
# Look up the names at the selected positions in `df.columns` and use them as
# the column names of the reduced set.
# This was only used with 10 genes, just to check which genes were selected.
# NOTE(review): this assumes position i in `df.columns` names the same feature
# as position i of the frame given to the selector — confirm.

feat = df.columns[liste]
dataframe.columns = feat

# Attach the class labels saved earlier (aligned on the row index).
dataframe = dataframe.assign(Classe=y)
dataframe

In [None]:
# Replace the class names by numbers: 'Normal' -> 0, 'Câncer' -> 1.
# The encoded column is assigned back to the frame. The previous form,
# dataframe['Classe'].replace(..., inplace=True), modifies a temporary column
# object and leaves the frame unchanged under pandas Copy-on-Write.
# Values that are not in the mapping are passed through, so re-running the cell
# on already-encoded labels is harmless; any other leftover string makes the
# int64 cast fail loudly.

class_codes = {'Normal': 0, 'Câncer': 1}
dataframe['Classe'] = (
    dataframe['Classe']
    .map(lambda label: class_codes.get(label, label))
    .astype('int64')
)

dataframe

In [None]:
# Hold out the last 20% of the rows for testing.
# shuffle=False keeps the split identical between runs for a given row order.

X_train, X_test, y_train, y_test = train_test_split(
    dataframe.drop(columns='Classe'),
    dataframe['Classe'],
    test_size=0.20,
    shuffle=False,
)

In [None]:
# Convert the train and test splits into int64 (torch.long) tensors for PyTorch.
# The names are rebound from pandas objects to tensors, so this cell can only
# be run once per split.

X_train, X_test, y_train, y_test = (
    torch.tensor(part.values, dtype=torch.long)
    for part in (X_train, X_test, y_train, y_test)
)

In [None]:
# Show the largest value of each split; the model's embedding table must have
# more rows than the largest value it will receive.

print(X_train.max())
print(X_test.max())

In [None]:
# General settings

RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

LEARNING_RATE = 0.005
BATCH_SIZE = 100
NUM_EPOCHS = 5

# Use the second GPU ('cuda:1') when the machine has one, as before. On a
# single-GPU machine fall back to 'cuda:0' instead of requesting a device that
# does not exist, and use the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    DEVICE = torch.device(f'cuda:{_gpu_index}')
else:
    DEVICE = torch.device('cpu')

EMBEDDING_DIM = 100
HIDDEN_DIM = 100
NUM_CLASSES = 2

In [None]:
# Build the PyTorch DataLoaders, pairing each feature tensor with its labels.
# Only the training loader shuffles its batches.

train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Simple RNN model

class RNN(nn.Module):
    """Embedding -> single-layer ReLU RNN -> linear classifier.

    Args:
        input_dim: number of rows of the embedding table (inputs must be
            integer indices below this value).
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(
            embedding_dim,
            hidden_dim,
            nonlinearity='relu',
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return the logits computed from the final hidden state.

        `text` holds integer indices with the batch dimension first
        (the recurrent layer is built with batch_first=True).
        """
        _, last_hidden = self.rnn(self.embedding(text))
        # Drop the leading size-1 (layer) dimension of the final hidden state.
        return self.fc(last_hidden.squeeze(0))

In [None]:
# LSTM model

class RNN(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier.

    Args:
        input_dim: number of rows of the embedding table (inputs must be
            integer indices below this value).
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return the logits computed from the final hidden state.

        `text` holds integer indices with the batch dimension first
        (the recurrent layer is built with batch_first=True).
        """
        # The LSTM also returns the final cell state, which is not used here.
        _, (last_hidden, _cell) = self.rnn(self.embedding(text))
        # Drop the leading size-1 (layer) dimension of the final hidden state.
        return self.fc(last_hidden.squeeze(0))

In [None]:
# GRU model

class RNN(nn.Module):
    """Embedding -> single-layer GRU -> linear classifier.

    Args:
        input_dim: number of rows of the embedding table (inputs must be
            integer indices below this value).
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the GRU hidden state.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return the logits computed from the final hidden state.

        `text` holds integer indices with the batch dimension first
        (the recurrent layer is built with batch_first=True).
        """
        _, last_hidden = self.rnn(self.embedding(text))
        # Drop the leading size-1 (layer) dimension of the final hidden state.
        return self.fc(last_hidden.squeeze(0))

In [None]:
# Instantiate the chosen model (whichever `RNN` class cell was run last).

torch.manual_seed(RANDOM_SEED)

# The embedding table needs one row per possible input value, i.e. the largest
# value found in the tensors + 1. It is computed from the data instead of being
# hard-coded (previously 256880), so a different feature selection or dataset
# cannot cause an out-of-range embedding lookup.
vocab_size = max(int(X_train.max()), int(X_test.max())) + 1

model = RNN(input_dim=vocab_size,
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES
)

model = model.to(DEVICE)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Function that computes the accuracy of a given model

def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` over `data_loader`, in percent.

    Args:
        model: torch.nn.Module mapping a batch of features to per-class logits
            with the class scores along dimension 1.
        data_loader: iterable yielding (features, targets) batches of tensors.
        device: device the batches are moved to before the forward pass.

    Returns:
        A 0-dim float tensor holding 100 * correct / total.

    Raises:
        ValueError: if `data_loader` yields no examples (accuracy is undefined).
    """
    # Evaluate in eval mode and restore the previous mode afterwards, so the
    # caller's train/eval state is left untouched.
    was_training = model.training
    model.eval()

    correct_pred, num_examples = 0, 0
    try:
        with torch.no_grad():
            for features, targets in data_loader:
                features = features.to(device)
                targets = targets.to(device)

                # Predicted class = index of the largest logit in each row.
                predicted_labels = model(features).argmax(dim=1)

                num_examples += targets.size(0)
                correct_pred += (predicted_labels == targets).sum()
    finally:
        model.train(was_training)

    if num_examples == 0:
        raise ValueError("compute_accuracy received a data_loader with no examples")

    return correct_pred.float() / num_examples * 100

In [None]:
# Model training

start_time = time.time()
for epoch in range(NUM_EPOCHS):
    model.train()
    # The loop variables are named features/targets (not X/y) so the notebook
    # globals X and y defined in earlier cells are not overwritten.
    for batch, (features, targets) in enumerate(train_loader):

        # The model lives on DEVICE, so each batch has to be moved there too;
        # otherwise the forward pass fails whenever DEVICE is a GPU.
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        logits = model(features)
        loss = F.cross_entropy(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    # Accuracy on the training set after each epoch (no gradients needed).
    with torch.set_grad_enabled(False):
        print(f'training accuracy: 'f'{compute_accuracy(model, train_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')

# After training, check the accuracy on the test set
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')