In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
import torchmetrics
import os
import DL_models.utils as du
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from time import time
import copy
import gc

In [2]:
def generate_train_test_samples( df: pd.DataFrame, target_columns: list, columns_to_drop: list = [], device: str = 'cuda',
                                 test_size: float = 0.2):
            
        y = df[target_columns].to_numpy()
        X = df.drop(columns= columns_to_drop).to_numpy()
        
        X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        X_train_tensor = torch.from_numpy(X_train).type(torch.float).to(device)
        X_test_tensor = torch.from_numpy(X_test).type(torch.float).to(device)
        y_test_tensor = torch.from_numpy(y_test).type(torch.float).to(device).squeeze()
        y_train_tensor = torch.from_numpy(y_train).type(torch.float).to(device).squeeze()
        
        return (X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

In [3]:
def generate_train_test_samples2( df: pd.DataFrame, target_columns: list, features_list: list, device: str = 'cuda',
                                 test_size: float = 0.2):
            
        y = df[target_columns].to_numpy()
        
        X = df[features_list].to_numpy()
        
        X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        X_train_tensor = torch.from_numpy(X_train).type(torch.float).to(device)
        X_test_tensor = torch.from_numpy(X_test).type(torch.float).to(device)
        y_test_tensor = torch.from_numpy(y_test).type(torch.float).to(device).squeeze()
        y_train_tensor = torch.from_numpy(y_train).type(torch.float).to(device).squeeze()
        
        return (X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

In [4]:
class LinuxDatasetObject(Dataset):
    def __init__(self, dataset, labels):
        self.dataset = dataset
        self.labels = labels

    def NumberOfFeatures(self):
        return self.dataset.shape[1]
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        return self.dataset[idx], self.labels[idx]

In [5]:
def train_dataloader(train_data: LinuxDatasetObject, batch_size: int, shuffle: bool = True):
    return DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    
def test_dataloader(test_data: LinuxDatasetObject, batch_size: int, shuffle: bool = False):
    return DataLoader(test_data, batch_size=batch_size, shuffle=shuffle)

def validation_dataloader(val_data: LinuxDatasetObject, batch_size: int, shuffle: bool = False):
    return DataLoader(val_data, batch_size=batch_size, shuffle=shuffle)

In [6]:
class EarlyStopping:
    def __init__(self, pation: int = 15, min_delta: int = 0):
        self.pation = pation
        self.min_delta = min_delta
        self.best_model = None
        self.best_loss = None
        self.counter = 0

    def __call__(self, model: nn.Module, val_loss: float, name_of_the_model_save: str = "result"):
        if self.best_loss == None:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())

        elif val_loss <= self.best_loss:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
            self.counter = 0

        else:
            self.counter += 1
            if self.counter >= self.pation:
                #model.load_state_dict(self.best_model)
                #torch.save(self.best_model, f=name_of_the_model_save)
                return True
            
        return False

In [7]:
device: str = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
class Model(nn.Module):
    def __init__(self,features, classes, activationLayer):
        super().__init__()
        self.dropout = nn.Dropout(p=0.2)
        self.activationLayer = activationLayer
        self.inputLayer = nn.Linear(features, features)
        self.hiddenLayer1 = nn.Linear(features,features)
        self.hiddenLayer2 = nn.Linear(features,features)
        self.outputLayer = nn.Linear(features, classes)

    def forward(self,x):
        z = self.inputLayer(x)
        z = self.activationLayer(self.hiddenLayer1(z))
        z = self.dropout(z)
        z = self.activationLayer(self.hiddenLayer2(z))
        z = self.dropout(z)
        z = self.outputLayer(z)
        return z

In [9]:
#es = EarlyStopping()

In [10]:
#df = pd.read_hdf("DL_models/data/dados.h5")


#features_pca = len(df_pca.axes[1])
#features_selctk = len(df_selectk.axes[1]) - 1
#features_variance = len(df_variance.axes[1])

In [11]:
#data = generate_train_test_samples(df_pca,["perf"],["perf"])

In [12]:
#train = LinuxDatasetObject(data[0],data[1])

In [13]:
#model1 = Model(features_pca -1 ,1).to(device)

In [14]:
#optimizer = torch.optim.AdamW(params=model1.parameters(), lr=0.001, fused=True)
#loss_fn = nn.SmoothL1Loss().to(device)

In [15]:
#train_loader = train_dataloader(train, 4096)

In [16]:
def get_cofg_for_model(version: int, model):
    if version == 1:
        activation = nn.PReLU()
        loss_fn = torchmetrics.MeanAbsolutePercentageError().to(device)
    elif version == 2:
        activation = nn.PReLU()
        loss_fn = torchmetrics.MeanSquaredError().to(device).to(device)
    elif version == 3:
        activation = nn.ELU()
        loss_fn = nn.SmoothL1Loss().to(device)
    
    #retorna loss activation e optimizer nessa ordem
    return(loss_fn, activation, torch.optim.AdamW(model.parameters(), lr=0.001, fused=True))
        

In [17]:
def train_model(model1, loss_fn, optimizer, es, train_loader, data, feature,percentage):
    epoch = 400
    star_time = time()
    for i in range(epoch):
        for batch_data, batch_labels in train_loader:
            model1.train()
            y_pred = model1(batch_data).squeeze()

            loss = loss_fn(y_pred, batch_labels)

            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            with torch.inference_mode():
                test_pred = model1(data[2]).squeeze()
                loss_test = loss_fn(test_pred, data[3])
            
                loss_test = torch.Tensor.cpu(loss_test).detach().numpy()
            
            if es(model1, loss_test, f"{feature}_{percentage}"):
                model1.to('cpu')
                loss_fn.to('cpu')
                del loss_fn
                del optimizer
                del model1
                torch.cuda.empty_cache()
                gc.collect()
                torch.cuda.empty_cache()
                return  (es.best_loss, time() - star_time, i)
    model1.to('cpu')
    loss_fn.to('cpu')
    del loss_fn
    del optimizer
    del model1
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.empty_cache()
    return  (es.best_loss, time() - star_time, i)     
    

In [18]:
def train_model_for_all_features(files: list[str], percentages: list[int]):
    with open("feature_selection_experimente.csv", "w") as f: 
        df = pd.read_hdf("DL_models/data/dados.h5")
        for file in files:
            name = file.split("_")
            rank = pd.read_csv(f"results/{file}")
            for percentage in percentages:
                #EarlyStopping
                es = EarlyStopping()
            
                #select the data
                cut = rank['importance'].quantile(percentage)
                top_percent = rank[rank['importance'] >= cut]["features"]
                number_of_features = len(top_percent)
                data = generate_train_test_samples2(df,["perf"],top_percent)
                train = LinuxDatasetObject(data[0],data[1])
                train_loader = train_dataloader(train, 4096)
            
                #get hyperparamiters
                model1 = Model(number_of_features,1,nn.ELU())
                model1.to(device)
                loss_fn, activation, opt = get_cofg_for_model(1, model1)
            
                #train the model
                best_loss, time, epoch = train_model(model1, loss_fn, opt, es, train_loader, data, name, percentage)
            
                #write the results
                #output = pd.read_csv("~")
                nova_linha = {"name":name,"percentage":1-percentage,"best_loss":best_loss,"time":time,"epoch":epoch}
                #df = df.append(nova_linha, ignore_index=True)
                #df.to_csv("feature_selection_experimente.csv")
                print(nova_linha)
                f.write(str(name)+","+str(1-percentage)+","+str(best_loss)+","+str(time)+","+str(epoch))
                print(f"{name}_{1-percentage} exectuded")
            
            

In [19]:
percentages =[0.9,0.8,0.7,0.5,0.3,0.1]

In [20]:
fiels = os.listdir("results")
files = fiels[:-1]

In [21]:
train_model_for_all_features(files, percentages)

{'name': ['feature', 'importance', 'GB.csv'], 'percentage': 0.09999999999999998, 'best_loss': array(0.16208723, dtype=float32), 'time': 534.7122533321381, 'epoch': 29}
['feature', 'importance', 'GB.csv']_0.09999999999999998 exectuded
{'name': ['feature', 'importance', 'GB.csv'], 'percentage': 0.19999999999999996, 'best_loss': array(0.16555892, dtype=float32), 'time': 511.45708179473877, 'epoch': 28}
['feature', 'importance', 'GB.csv']_0.19999999999999996 exectuded
{'name': ['feature', 'importance', 'GB.csv'], 'percentage': 0.30000000000000004, 'best_loss': array(0.16273493, dtype=float32), 'time': 522.2792568206787, 'epoch': 28}
['feature', 'importance', 'GB.csv']_0.30000000000000004 exectuded
{'name': ['feature', 'importance', 'GB.csv'], 'percentage': 0.5, 'best_loss': array(0.18378714, dtype=float32), 'time': 426.4525911808014, 'epoch': 23}
['feature', 'importance', 'GB.csv']_0.5 exectuded
{'name': ['feature', 'importance', 'GB.csv'], 'percentage': 0.7, 'best_loss': array(0.17915295,

In [22]:
#plt.plot(data[3].cpu().detach().numpy()[:100], color = 'b')
#pred = model1(data[2]).squeeze()
#plt.plot(pred.cpu().detach().numpy()[:100], color = 'r', linestyle = 'dashed')
#plt.show()