### Importaciones


In [66]:
import numpy as np
import polars as pl
import torch
from torch.utils.data import Dataset

### Leemos el csv

In [67]:
solarDataset = pl.read_csv("flare.data",dtypes={
            "column_1": pl.Categorical,
            "column_2": pl.Categorical,
            "column_3": pl.Categorical,
        }, 
        has_header=False, 
        skip_rows=1, 
        separator=' ',
        new_columns=["modified Zurich class","largest spot size","spot distribution","activity","evolution","previous 24 hour flare activity", "historically-complex",
                          "became complex on this pass", "area", "area of largest spot", "common flares", "moderate flares", "severe flares"])
solarDataset

modified Zurich class,largest spot size,spot distribution,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot,common flares,moderate flares,severe flares
cat,cat,cat,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""C""","""S""","""O""",1,2,1,1,2,1,2,0,0,0
"""D""","""S""","""O""",1,3,1,1,2,1,2,0,0,0
"""C""","""S""","""O""",1,3,1,1,2,1,1,0,0,0
"""D""","""S""","""O""",1,3,1,1,2,1,2,0,0,0
"""D""","""A""","""O""",1,3,1,1,2,1,2,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…
"""C""","""R""","""O""",1,2,1,2,2,1,2,0,0,0
"""D""","""R""","""O""",1,3,1,1,2,1,2,0,0,0
"""E""","""A""","""O""",1,3,1,1,2,1,2,0,0,0
"""C""","""R""","""O""",1,3,1,1,2,1,1,0,0,0


In [68]:
grupo_numericos = solarDataset.select(["activity","evolution","previous 24 hour flare activity", "historically-complex",
                          "became complex on this pass", "area", "area of largest spot", "common flares", "moderate flares", "severe flares"])
display(grupo_numericos)

medias_solo = torch.tensor(grupo_numericos.mean().to_numpy()).squeeze()
stds_solo = torch.tensor(grupo_numericos.std().to_numpy()).squeeze()

display(medias_solo)
display(stds_solo)

activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot,common flares,moderate flares,severe flares
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1,2,1,1,2,1,2,0,0,0
1,3,1,1,2,1,2,0,0,0
1,3,1,1,2,1,1,0,0,0
1,3,1,1,2,1,2,0,0,0
1,3,1,1,2,1,2,0,0,0
…,…,…,…,…,…,…,…,…,…
1,2,1,2,2,1,2,0,0,0
1,3,1,1,2,1,2,0,0,0
1,3,1,1,2,1,2,0,0,0
1,3,1,1,2,1,1,0,0,0


tensor([1.1393, 2.4861, 1.1920, 1.3684, 1.9474, 1.0279, 1.7554, 0.1331, 0.1362,
        0.0217], dtype=torch.float64)

tensor([0.3468, 0.6020, 0.5900, 0.4831, 0.2236, 0.1648, 0.4305, 0.3990, 0.4794,
        0.1458], dtype=torch.float64)

### Escalamos

In [69]:
class StandardScaler:

    def __init__(self, mean=None, std=None, epsilon=1e-7):
        self.mean = mean
        self.std = std
        self.epsilon = epsilon

    def fit(self, values):
        dims = list(range(values.dim() - 1))
        self.mean = torch.mean(values, dim=dims)
        self.std = torch.std(values, dim=dims)
        

    def transform(self, values):
        return (values - self.mean) / (self.std + self.epsilon)

    def fit_transform(self, values):
        self.fit(values)
        return self.transform(values)

    def __call__(self, sample):
        values,saidas = sample
        return ((values - self.mean) / (self.std + self.epsilon), saidas)
        
    def __repr__(self):
        return f"mean: {self.mean}, std:{self.std}, epsilon:{self.epsilon}"

In [70]:
scaler = StandardScaler(medias_solo, stds_solo)
display(scaler)

mean: tensor([1.1393, 2.4861, 1.1920, 1.3684, 1.9474, 1.0279, 1.7554, 0.1331, 0.1362,
        0.0217], dtype=torch.float64), std:tensor([0.3468, 0.6020, 0.5900, 0.4831, 0.2236, 0.1648, 0.4305, 0.3990, 0.4794,
        0.1458], dtype=torch.float64), epsilon:1e-07

In [71]:
npSample = solarDataset[0][["activity","evolution","previous 24 hour flare activity", "historically-complex",
     "became complex on this pass", "area", "area of largest spot", "common flares", "moderate flares", "severe flares"]].to_numpy()
tSample = torch.Tensor(npSample)
display(tSample)
sample_escalado = scaler.transform(tSample)
display(sample_escalado)

tensor([[1., 2., 1., 1., 2., 1., 2., 0., 0., 0.]])

tensor([[-0.4017, -0.8074, -0.3253, -0.7626,  0.2353, -0.1690,  0.5681, -0.3336,
         -0.2841, -0.1486]], dtype=torch.float64)

### Convertimos los categoricos

In [72]:
proba = solarDataset.select([pl.col(pl.Categorical)])
print(proba.columns)
dummies = [[ {"nome": columna, 'valor': i}
        for i in proba.get_column(columna).cat.get_categories()
    ] for columna in proba.columns]
dummies_flat = [item for row in dummies for item in row]
display(dummies_flat)
print(len(dummies_flat))

['modified Zurich class', 'largest spot size', 'spot distribution']


[{'nome': 'modified Zurich class', 'valor': 'C'},
 {'nome': 'modified Zurich class', 'valor': 'D'},
 {'nome': 'modified Zurich class', 'valor': 'B'},
 {'nome': 'modified Zurich class', 'valor': 'F'},
 {'nome': 'modified Zurich class', 'valor': 'H'},
 {'nome': 'modified Zurich class', 'valor': 'E'},
 {'nome': 'largest spot size', 'valor': 'S'},
 {'nome': 'largest spot size', 'valor': 'A'},
 {'nome': 'largest spot size', 'valor': 'K'},
 {'nome': 'largest spot size', 'valor': 'R'},
 {'nome': 'largest spot size', 'valor': 'X'},
 {'nome': 'largest spot size', 'valor': 'H'},
 {'nome': 'spot distribution', 'valor': 'O'},
 {'nome': 'spot distribution', 'valor': 'I'},
 {'nome': 'spot distribution', 'valor': 'X'},
 {'nome': 'spot distribution', 'valor': 'C'}]

16


In [73]:
novo_expr = [(pl.col(item['nome']) == item["valor"] ).alias(f'{item["nome"]}-{item["valor"]}') for item in dummies_flat]
print(novo_expr)

[<Expr ['[(col("modified Zurich class")…'] at 0x7EFC88464E20>, <Expr ['[(col("modified Zurich class")…'] at 0x7EFC884655D0>, <Expr ['[(col("modified Zurich class")…'] at 0x7EFC88467A60>, <Expr ['[(col("modified Zurich class")…'] at 0x7EFC88467F40>, <Expr ['[(col("modified Zurich class")…'] at 0x7EFC884661D0>, <Expr ['[(col("modified Zurich class")…'] at 0x7EFC88466CE0>, <Expr ['[(col("largest spot size")) ==…'] at 0x7EFCA357F940>, <Expr ['[(col("largest spot size")) ==…'] at 0x7EFCA357D810>, <Expr ['[(col("largest spot size")) ==…'] at 0x7EFCA357FFA0>, <Expr ['[(col("largest spot size")) ==…'] at 0x7EFCA357F340>, <Expr ['[(col("largest spot size")) ==…'] at 0x7EFCA357CFA0>, <Expr ['[(col("largest spot size")) ==…'] at 0x7EFCA357FB20>, <Expr ['[(col("spot distribution")) ==…'] at 0x7EFCA357FC70>, <Expr ['[(col("spot distribution")) ==…'] at 0x7EFCA357F910>, <Expr ['[(col("spot distribution")) ==…'] at 0x7EFCA357D7E0>, <Expr ['[(col("spot distribution")) ==…'] at 0x7EFCA357FD30>]


In [74]:
class SolarDataset(Dataset):
  def __init__(self, src_file, root_dir, transform=None, expr_dummies = None):
    
    self.transform = transform
    self.expr_dummies = expr_dummies
    self.dataSet = pl.scan_csv(src_file, dtypes={
            "column_1": pl.Categorical,
            "column_2": pl.Categorical,
            "column_3": pl.Categorical,
            "column_4": pl.Int64,
            "column_5": pl.Int64,
            "column_6": pl.Int64,
            "column_7": pl.Int64,
            "column_8": pl.Int64,
            "column_9": pl.Int64,
            "column_10": pl.Int64,
            "column_11": pl.Int64,
            "column_12": pl.Int64,
            "column_13": pl.Int64,
        },
        has_header=False, 
        skip_rows=1, 
        separator=' ',null_values=['?']).drop_nulls().rename({
            "column_1": "modified Zurich class",
            "column_2": "largest spot size",
            "column_3": "spot distribution",
            "column_4": "activity",
            "column_5": "evolution",
            "column_6": "previous 24 hour flare activity",
            "column_7": "historically-complex",
            "column_8": "became complex on this pass",
            "column_9": "area",
            "column_10": "area of largest spot",
            "column_11": "common flares",
            "column_12": "moderate flares",
            "column_13": "severe flares"
        }).with_row_index("id")

  def __len__(self):
    return self.dataSet.select(pl.len()).collect().item()
  
  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    else:
      idx = [idx]
    
    seccion = self.dataSet.filter(pl.col("id").is_in(idx)).drop("id")
    datos = seccion.collect()

    datosNumericos = datos.select([pl.col(pl.Int64)])
    predsA =  self.transform.transform(torch.tensor(datosNumericos.to_numpy()).squeeze())
    
    predsC = datos.select([pl.col(pl.Categorical)]).with_columns(
      self.expr_dummies
    ).drop(datos.select([pl.col(pl.Categorical)]).columns)
    tensorB = torch.tensor(predsC.to_numpy().astype(np.int64)).squeeze()

    entrada = torch.cat((predsA[:7], tensorB),dim=-1)
    sample = (entrada,predsA[7:] if predsA.dim() == 1 else predsA[:,7:].flatten())
    return sample



dataset = SolarDataset("flare.data",".",transform=scaler,expr_dummies=novo_expr)
print(dataset[0])
print(dataset[10])
print(dataset[torch.tensor([0,10,100])])

(tensor([-0.4017, -0.8074, -0.3253, -0.7626,  0.2353, -0.1690,  0.5681,  1.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000],
       dtype=torch.float64), tensor([-0.3336, -0.2841, -0.1486], dtype=torch.float64))
(tensor([-0.4017,  0.8537, -0.3253, -0.7626,  0.2353, -0.1690, -1.7547,  0.0000,
         0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000],
       dtype=torch.float64), tensor([-0.3336, -0.2841, -0.1486], dtype=torch.float64))
(tensor([[-0.4017, -0.8074, -0.3253, -0.7626,  0.2353, -0.1690,  0.5681, -0.3336,
         -0.2841, -0.1486,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          0.0000,  0.0000],
        [-0.4017,  0.8537, -0.3253, -0.7626,  0.2353, -0.1690, -1.7547, -0.3336,
       

### División train y test

In [75]:
from torch.utils.data import random_split


lonxitudeDataset = len(dataset)
print(f"Tamanho dataset {lonxitudeDataset}")
tamTrain =int(lonxitudeDataset*0.8)
tamVal = lonxitudeDataset - tamTrain
print(f"Tam dataset: {lonxitudeDataset} train: {tamTrain} tamVal: {tamVal}")
train_set, val_set = random_split(dataset,[tamTrain,tamVal])
train_ldr = torch.utils.data.DataLoader(train_set, batch_size=2,
    shuffle=True, drop_last=False)
validation_loader =torch.utils.data.DataLoader(val_set, batch_size=4, shuffle=False)    

Tamanho dataset 323
Tam dataset: 323 train: 258 tamVal: 65


### Creamos a NN

In [76]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable

class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_dim, 50)
        self.layer2 = nn.Linear(50, 50)
        self.layer3 = nn.Linear(50, 3)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x


model     = Model(23)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn   = nn.CrossEntropyLoss()
compiled_model = torch.compile(model)
model
compiled_model

OptimizedModule(
  (_orig_mod): Model(
    (layer1): Linear(in_features=23, out_features=50, bias=True)
    (layer2): Linear(in_features=50, out_features=50, bias=True)
    (layer3): Linear(in_features=50, out_features=3, bias=True)
  )
)

In [77]:
entradaProba,dest = next(iter(train_ldr))
entradaProba = entradaProba.to(torch.float32)
print(entradaProba.dtype)
print(model.layer1.weight.dtype)
display(entradaProba)
saida = model(entradaProba)
display(dest)
display(saida)
display(loss_fn(saida, dest ))

torch.float32
torch.float32


tensor([[-0.4017,  0.8537, -0.3253, -0.7626,  0.2353, -0.1690,  0.5681,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000],
        [-0.4017,  0.8537, -0.3253,  1.3073,  0.2353, -0.1690,  0.5681,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

tensor([[-0.3336, -0.2841, -0.1486],
        [-0.3336, -0.2841, -0.1486]], dtype=torch.float64)

tensor([[ 0.0126,  0.0076, -0.1645],
        [-0.0418, -0.0351, -0.0997]], grad_fn=<AddmmBackward0>)

tensor(-0.8308, dtype=torch.float64, grad_fn=<DivBackward1>)

### Entrenamos

In [78]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.
    for i, data in enumerate(train_ldr):
        
        inputs, labels = data
        optimizer.zero_grad()

        outputs = compiled_model(inputs.to(torch.float32))
        loss = loss_fn(outputs, labels)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()

        if i % 10 == 9:
            last_loss = running_loss / 10 
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            
            running_loss = 0.
    return last_loss

In [99]:
from torchmetrics import MeanSquaredError, MeanAbsoluteError
epoch_number = 0

EPOCHS = 2

loss_list = np.zeros((EPOCHS,))

for epoch in range(EPOCHS):

    print('EPOCH {}:'.format(epoch + 1))

    compiled_model.train(True)
    avg_loss = train_one_epoch(epoch, None)
    loss_list[epoch] = avg_loss
    mean_squared_error = MeanSquaredError()
    mean_absolute_error = MeanAbsoluteError()

    compiled_model.train(False)

    running_vloss = 0.0

    for i, vdata in enumerate(validation_loader):

        vinputs, vlabels = vdata
        print('Entrada: ', vinputs)
        voutputs = compiled_model(vinputs.to(torch.float32))
        vloss = loss_fn(voutputs, vlabels)
        print(torch.argmax(voutputs, dim=1))
        print('Salida: ', voutputs)
        print('Salida deseada: ', vlabels)
        mean_squared_error(voutputs,vlabels)
        mean_absolute_error(voutputs,vlabels)
    
    errorMedio = mean_squared_error.compute()
    errorAbsolute =mean_absolute_error.compute()

    print('Error medio: ',errorMedio)
    print('Error absoluto', errorAbsolute)
        


EPOCH 1:
  batch 10 loss: -63129.94628169264
  batch 20 loss: -46511.12561004345
  batch 30 loss: -13434.492601655616
  batch 40 loss: -45179.5987801054
  batch 50 loss: -62815.07239748794
  batch 60 loss: -43528.14805250388
  batch 70 loss: -10270.921397259797
  batch 80 loss: -48780.8303690353
  batch 90 loss: 4645.749420833482
  batch 100 loss: -51912.823025302074
  batch 110 loss: -44470.673115002435
  batch 120 loss: -28416.214173778124
Entrada:  tensor([[ 2.4817, -0.8074, -0.3253, -0.7626,  0.2353, -0.1690,  0.5681,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 2.4817,  0.8537,  3.0643,  1.3073,  0.2353,  5.8975,  0.5681,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000],
        [-0.4017,  0.8537, -0.3253, -0.7626,  0.2353, -0.1690,  0.5681,  0