## Imports de bibliotecas

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
from tqdm import tqdm 
import plotly.io as pio
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader,random_split
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.manifold import TSNE
import plotly.express as px
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import numpy as np
import cv2
from PIL import Image
import torch
from torchvision.transforms.functional import to_pil_image
from torch.utils.data import Dataset
import random

## Carregando e transformando Datasets

In [None]:
# Station metadata table, read once and used by gera_df_est().
# The path is assembled with os.path.join so it resolves on every OS, not
# only where "\\" is the path separator.
coords = pd.read_csv(
    os.path.join("..", "data", "nova_base", "estacoes_pluviometricas.csv")
)

# Divisors used by pontos() to scale degree offsets into pixels
# (offset * 250 / value).
# NOTE(review): presumably the latitude / longitude extent, in degrees,
# covered by the 250-pixel image -- confirm.
latitude_pc = 4.4997
longitude_pc = 4.9321

def pontos(x, y):
    """Convert a (longitude, latitude) pair into image pixel coordinates.

    The reference point (lat -22.464278, long -43.297476) maps to pixel
    (125, 125). Offsets from it are scaled by ``250 / longitude_pc``
    horizontally and ``250 / latitude_pc`` vertically, with the vertical axis
    flipped (larger latitude gives a smaller row value).

    Args:
        x: Longitude of the point.
        y: Latitude of the point.

    Returns:
        Tuple ``(row, col)``: the vertical pixel coordinate first, then the
        horizontal one. Neither value is rounded.
    """
    ref_lat, ref_long = -22.464278, -43.297476
    centre = 125

    col = centre + ((x - ref_long) * 250 / longitude_pc)
    row = centre - ((y - ref_lat) * 250 / latitude_pc)

    return row, col

def gera_df_est():
    """Build the table of station pixel positions.

    For every value ``n`` in ``coords['N']`` the row at position ``n - 1`` of
    the module-level ``coords`` table is read (column 1: station name,
    column 2: latitude, column 3: longitude). The coordinates are converted
    with ``pontos`` and rounded to whole pixels.

    Returns:
        DataFrame with columns ``"Estação"``, ``"X"`` and ``"Y"``, one row per
        station, indexed 0..n-1.
    """
    rows = []
    for n in coords['N']:
        # NOTE(review): assumes 'N' is a 1-based sequential id that matches
        # the row order of the CSV -- confirm.
        pos = n - 1
        nome = coords.iloc[pos, 1]
        lat = coords.iloc[pos, 2]
        long = coords.iloc[pos, 3]

        ya, xa = pontos(long, lat)
        rows.append({"Estação": nome, "X": round(xa), "Y": round(ya)})

    # Build the frame once: concatenating inside the loop copies the whole
    # table on every iteration and leaves every row with index label 0.
    return pd.DataFrame(rows, columns=["Estação", "X", "Y"])

def get_subplot(image, raio, station_index=5):
    """Crop a window of ``image`` centred on a station's pixel position.

    Args:
        image: Array indexed as ``image[row, col, ...]``; only its first two
            dimensions are used for clipping.
        raio: Half-size of the window in pixels. The crop spans at most
            ``2 * raio`` pixels per side and is clipped at the image border.
        station_index: Positional row of the station in the module-level
            ``df_est`` table. Defaults to 5, the value that used to be
            hard-coded (marked "COPACABANA" in the original code).

    Returns:
        The sub-array ``image[y0:y1, x0:x1]``.
    """
    x = int(df_est.iloc[station_index, 1])  # column "X"
    y = int(df_est.iloc[station_index, 2])  # column "Y"

    height, width = image.shape[0], image.shape[1]
    x0, x1 = max(0, x - raio), min(width, x + raio)
    y0, y1 = max(0, y - raio), min(height, y + raio)

    return image[y0:y1, x0:x1]

# Build the station pixel table once; get_subplot() reads it as a global.
df_est = gera_df_est()
df_est  # notebook display of the table

In [None]:
class CustomImageDataset(Dataset):
    """Dataset over an in-memory sequence of images, without labels.

    Each item is converted to a PIL image before the optional transform is
    applied:

    * ``np.ndarray`` items are treated as BGR and converted to RGB;
    * ``torch.Tensor`` items go through ``to_pil_image``;
    * PIL images are used as they are.

    Args:
        images: Indexable collection of images.
        transform: Optional callable applied to the PIL image.
    """

    def __init__(self, images, transform=None):
        self.images = images
        self.transform = transform

    def __len__(self):
        """Return the number of stored images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the image at ``idx`` after conversion and transform.

        Raises:
            TypeError: If the stored item is not an ndarray, a tensor or a
                PIL image.
        """
        sample = self.images[idx]

        if isinstance(sample, np.ndarray):
            rgb = cv2.cvtColor(sample, cv2.COLOR_BGR2RGB)
            sample = Image.fromarray(rgb)
        elif isinstance(sample, torch.Tensor):
            sample = to_pil_image(sample)
        elif not isinstance(sample, Image.Image):
            raise TypeError(f"Unsupported image type: {type(sample)}")

        return self.transform(sample) if self.transform else sample

In [None]:

def _iter_png_files(path):
    """Yield paths of ``.png`` files laid out as path/year/month/day/file."""

    def subdirs(parent):
        # Sorted so the traversal order does not depend on the OS listing.
        for name in sorted(os.listdir(parent)):
            child = os.path.join(parent, name)
            if os.path.isdir(child):
                yield child

    for year_path in subdirs(path):
        for month_path in subdirs(year_path):
            for day_path in subdirs(month_path):
                for filename in sorted(os.listdir(day_path)):
                    if filename.endswith(".png"):
                        yield os.path.join(day_path, filename)


def load_and_split_dataset(path, test_size=0.2, raio=10, random_state=None):
    """Load the station crops of every image under ``path`` and split them.

    Images are expected three directory levels below ``path``
    (year/month/day). Each readable ``.png`` is cropped with
    ``get_subplot(image, raio)`` and kept in memory.

    Args:
        path: Root directory of the image tree.
        test_size: Passed to ``train_test_split``.
        raio: Crop half-size handed to ``get_subplot``. Defaults to 10, the
            value that used to be hard-coded.
        random_state: Passed to ``train_test_split``. Set it to make the
            split reproducible; the default ``None`` keeps it random.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` of lists of cropped images.
    """
    images = []
    for file_path in _iter_png_files(path):
        image = cv2.imread(file_path)
        if image is None:  # the file could not be decoded as an image
            continue
        images.append(get_subplot(image, raio))

    train_dataset, test_dataset = train_test_split(
        images, test_size=test_size, random_state=random_state
    )
    return train_dataset, test_dataset

# Root of the image tree (year/month/day/*.png). Assembled with os.path.join
# so it resolves on every OS; the empty last component keeps the trailing
# separator that the original literal had.
PATH = os.path.join("..", "data", "nova_base", "img", "")
train_dataset, test_dataset = load_and_split_dataset(PATH)
print(len(train_dataset), len(test_dataset))

In [None]:
import pickle

# Persist the training split so it can be reloaded without re-reading the
# image tree.
with open('train_dataset.pkl', mode='wb') as train_pkl:
    pickle.dump(train_dataset, train_pkl)

In [None]:
# Persist the test split as well.
with open('test_dataset.pkl', mode='wb') as test_pkl:
    pickle.dump(test_dataset, test_pkl)

In [None]:
# Every sample is resized to 256x256 and converted to a tensor.
preprocessing_steps = [transforms.Resize((256, 256)), transforms.ToTensor()]
transform = transforms.Compose(preprocessing_steps)

# Rebind the split names from raw image lists to Dataset wrappers.
train_dataset, test_dataset = (
    CustomImageDataset(split, transform=transform)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Hold out 20% of the training set for validation.
m = len(train_dataset)
val_size = int(m * 0.2)
train_size = m - val_size
train_data, val_data = random_split(train_dataset, lengths=[train_size, val_size])

batch_size = 256
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Sanity check: pull one test batch and show its shape.
images = next(iter(test_loader))
print(images.shape)

## Definição do codificador e do decodificador


In [None]:
class Encoder(nn.Module):
    """Convolutional encoder mapping an image batch to latent vectors.

    Four stride-2 3x3 convolutions (3 -> 8 -> 16 -> 32 -> 64 channels) halve
    the spatial size each time, so a 256x256 input becomes a 64x16x16
    feature map. A single linear layer then projects it to
    ``encoded_space_dim`` values.

    Args:
        encoded_space_dim: Size of the latent vector produced per image.
    """

    def __init__(self, encoded_space_dim):
        super().__init__()

        self.encoder_cnn = nn.Sequential(
            nn.Conv2d(3, 8, 3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(8, 16, 3, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 32, 3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),
            nn.ReLU(inplace=True),
        )

        # 64 channels x 16 x 16: the feature-map size for a 256x256 input.
        self.flattened_size = 64 * 16 * 16

        self.encoder_lin = nn.Sequential(
            nn.Linear(self.flattened_size, encoded_space_dim)
        )

    def forward(self, x):
        """Encode ``x`` of shape (batch, 3, H, W) into (batch, encoded_space_dim).

        Raises:
            RuntimeError: If H x W does not yield ``flattened_size`` features
                per sample (for example, the input is not 256x256).
        """
        x = self.encoder_cnn(x)
        # Flatten per sample. view(-1, flattened_size) would silently regroup
        # features across the batch when the input resolution is wrong; this
        # keeps the batch dimension so the linear layer raises instead.
        x = torch.flatten(x, start_dim=1)
        return self.encoder_lin(x)

In [None]:
class Decoder(nn.Module):
    """Decoder mapping latent vectors back to 3-channel images.

    A linear layer expands the latent vector to a 64x16x16 feature map. Four
    stride-2 transposed convolutions (64 -> 32 -> 16 -> 8 -> 3 channels) then
    double the spatial size each time, ending in a sigmoid.

    Args:
        encoded_space_dim: Size of the latent vector accepted per image.
    """

    def __init__(self, encoded_space_dim):
        super().__init__()

        feature_shape = (64, 16, 16)
        n_features = feature_shape[0] * feature_shape[1] * feature_shape[2]

        self.decoder_lin = nn.Sequential(
            nn.Linear(encoded_space_dim, n_features)
        )

        self.unflatten = nn.Unflatten(dim=1, unflattened_size=feature_shape)

        # Settings shared by every up-sampling layer.
        upsample = dict(kernel_size=3, stride=2, padding=1, output_padding=1)
        self.decoder_conv = nn.Sequential(
            nn.ConvTranspose2d(64, 32, **upsample),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(32, 16, **upsample),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(16, 8, **upsample),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(8, 3, **upsample),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Decode ``x`` of shape (batch, encoded_space_dim) into images."""
        feature_map = self.unflatten(self.decoder_lin(x))
        return self.decoder_conv(feature_map)

## Construção do modelo

In [None]:
# Seed the RNG before the models are built so their initial weights are
# reproducible; the encoder is constructed first, then the decoder.
torch.manual_seed(0)
d = 32 # size of the latent vector (number of features per image)

encoder = Encoder(encoded_space_dim=d)
decoder = Decoder(encoded_space_dim=d)

In [None]:
# Mean squared reconstruction error.
loss_fn = torch.nn.MSELoss()

lr = 0.01

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Selected device: {device}')

# Move the models first: the torch.optim documentation asks for modules to be
# placed on their device before the optimizer holding their parameters is
# constructed.
encoder.to(device)
decoder.to(device)

# One parameter group per network, both trained with the same settings.
params_to_optimize = [
    {'params': encoder.parameters()},
    {'params': decoder.parameters()}
]

# NOTE: this name shadows the `torch.optim` module imported as `optim` at the
# top of the notebook. It is kept because the training cell passes
# `optimizer=optim`.
optim = torch.optim.Adam(params_to_optimize, lr=lr)

## Treinamento do modelo

In [None]:
def add_noise(inputs, noise_factor=0):
    """Return ``inputs`` with scaled Gaussian noise added, clipped to [0, 1].

    Args:
        inputs: Floating-point tensor.
        noise_factor: Multiplier applied to the standard-normal noise. With
            the default 0 the result is just ``inputs`` clipped to [0, 1].

    Returns:
        New tensor with the same shape as ``inputs``.
    """
    gaussian = torch.randn_like(inputs)
    noisy = inputs + gaussian * noise_factor
    return torch.clamp(noisy, min=0., max=1.)

In [None]:
all_train = []  # NOTE(review): never appended to in the code shown here


def train_epoch_den(encoder, decoder, device, dataloader, loss_fn, optimizer, noise_factor=0):
    """Run one training epoch of the (denoising) autoencoder.

    Each batch is corrupted with ``add_noise``, passed through the encoder
    and decoder, and the reconstruction is compared with the clean batch.
    The loss of every batch is printed.

    Args:
        encoder: Encoder module, switched to train mode.
        decoder: Decoder module, switched to train mode.
        device: Device the batches are moved to.
        dataloader: Iterable yielding image batches (tensors, no labels).
        loss_fn: Callable ``loss_fn(reconstruction, target)``.
        optimizer: Optimizer holding the encoder and decoder parameters.
        noise_factor: Noise multiplier forwarded to ``add_noise``.

    Returns:
        Mean of the per-batch losses.
    """
    encoder.train()
    decoder.train()
    batch_losses = []

    for image_batch in dataloader:
        target = image_batch.to(device)
        image_noisy = add_noise(image_batch, noise_factor).to(device)

        decoded_data = decoder(encoder(image_noisy))
        # Score against the clean batch, as test_epoch_den does. Using the
        # noisy input as target would train the model to reproduce the noise
        # whenever noise_factor > 0.
        loss = loss_fn(decoded_data, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        print('\t partial train loss (single batch): %f' % batch_loss)
        batch_losses.append(batch_loss)

    return np.mean(batch_losses)

In [None]:
def test_epoch_den(encoder, decoder, device, dataloader, loss_fn, noise_factor=0):
    """Evaluate the autoencoder over ``dataloader`` without gradients.

    Every batch is corrupted with ``add_noise`` and reconstructed. All
    reconstructions and all clean batches are gathered on the CPU, and the
    loss is computed once over the concatenated tensors.

    Args:
        encoder: Encoder module, switched to eval mode.
        decoder: Decoder module, switched to eval mode.
        device: Device the noisy batches are moved to.
        dataloader: Iterable yielding image batches (tensors, no labels).
        loss_fn: Callable ``loss_fn(reconstruction, target)``.
        noise_factor: Noise multiplier forwarded to ``add_noise``.

    Returns:
        The loss as a tensor (``.data`` of the value returned by ``loss_fn``).
    """
    encoder.eval()
    decoder.eval()

    reconstructions, targets = [], []
    with torch.no_grad():
        for image_batch in dataloader:
            noisy_batch = add_noise(image_batch, noise_factor).to(device)
            reconstructions.append(decoder(encoder(noisy_batch)).cpu())
            targets.append(image_batch.cpu())

        val_loss = loss_fn(torch.cat(reconstructions), torch.cat(targets))

    return val_loss.data

In [None]:
def _chw_to_hwc_array(batch):
    """Return a single-image batch as a NumPy array that imshow can draw."""
    arr = batch.cpu().squeeze().numpy()
    if arr.shape[0] == 3:
        # Channels-first to channels-last.
        arr = np.transpose(arr, (1, 2, 0))
    return arr


def plot_ae_outputs_den(encoder, decoder, n=5):
    """Plot ``n`` test images and their reconstructions.

    Samples ``test_dataset[10]`` to ``test_dataset[10 + n - 1]`` (module-level
    dataset) are drawn in the first row of a 3 x n grid and their
    reconstructions in the third row. Both models are left in eval mode.

    Args:
        encoder: Trained encoder module.
        decoder: Trained decoder module.
        n: Number of images to show.
    """
    # Run the samples on the device the encoder lives on. Dataset items are
    # CPU tensors, so passing them unchanged to a CUDA model raises a device
    # mismatch error.
    first_param = next(encoder.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    encoder.eval()
    decoder.eval()

    plt.figure(figsize=(10, 4.5))
    for i in range(n):
        img = test_dataset[i + 10].unsqueeze(0).to(model_device)

        with torch.no_grad():
            print("Input shape before encoder:", img.shape)
            rec_img = decoder(encoder(img))

        # Original images (first row)
        ax = plt.subplot(3, n, i + 1)
        plt.imshow(_chw_to_hwc_array(img))
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        if i == n // 2:
            ax.set_title('Original images')

        # Reconstructed images (third row; the second row is left empty)
        ax = plt.subplot(3, n, i + 1 + 2 * n)
        plt.imshow(_chw_to_hwc_array(rec_img))
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        if i == n // 2:
            ax.set_title('Reconstructed images')

    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.7, top=0.9, wspace=0.3, hspace=0.3)
    plt.show()

In [None]:
noise_factor = 0
num_epochs = 30
history_da = {'train_loss': [], 'val_loss': []}

# Arguments shared by the training and the validation pass.
shared_args = dict(
    encoder=encoder,
    decoder=decoder,
    device=device,
    loss_fn=loss_fn,
    noise_factor=noise_factor,
)

for epoch in range(num_epochs):
    print('EPOCH %d/%d' % (epoch + 1, num_epochs))

    train_loss = train_epoch_den(dataloader=train_loader, optimizer=optim, **shared_args)
    val_loss = test_epoch_den(dataloader=valid_loader, **shared_args)

    # Keep the per-epoch curves for the loss plot.
    history_da['train_loss'].append(train_loss)
    history_da['val_loss'].append(val_loss)

    print('\n EPOCH {}/{} \t train loss {:.3f} \t val loss {:.3f}'.format(epoch + 1, num_epochs,train_loss,val_loss))
    # Visual check of the reconstructions after every epoch.
    plot_ae_outputs_den(encoder, decoder)


In [None]:
# Loss over the test loader, shown as a Python float (noise_factor defaults to 0).
test_epoch_den(encoder,decoder,device,test_loader,loss_fn).item()

## Avaliação do modelo

In [None]:
# Training losses are stored as numbers; validation losses are tensors and
# need .item().
train_loss = history_da['train_loss']
val_loss = [entry.item() for entry in history_da['val_loss']]

epochs = range(1, len(train_loss) + 1)

# One curve per series on a shared epoch axis.
plt.figure(figsize=(8, 5))
for series, series_label in ((train_loss, 'Training Loss'), (val_loss, 'Validation Loss')):
    plt.plot(epochs, series, label=series_label)

plt.title('Autoencoder')
plt.xticks(epochs)
plt.legend()
plt.show()

In [None]:
# NOTE(review): all_train is created empty and never appended to in the code
# shown, so this presumably displays 0 -- confirm whether it is still needed.
len(all_train)

## Freeze e geração de features

In [None]:
# Encode one test batch to read the latent size back from the model output.
images = next(iter(test_loader)).to(device)

with torch.no_grad():
    encoded_images = encoder(images)

encoded_space_dim = encoded_images.shape[1]
print('Encoded space dimension:', encoded_space_dim)

In [None]:
# Paths are assembled with os.path.join so they resolve on every OS.
COORDS_FILE = os.path.join("..", "data", "nova_base", "estacoes_pluviometricas.csv")
OUTPUT_FILE = os.path.join("..", "data", "nova_base", "FEATURE_A652_AUTOENCODER.csv")
IMAGE_SIZE = (256, 256)
CROP_RADIUS = 10  # same radius load_and_split_dataset used for the training crops
STATION_NAME = "Copacabana"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
encoder.eval()  # freeze batch-norm statistics for feature extraction

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
])


def _iter_day_pngs(root):
    """Yield ``(file_path, filename)`` for PNGs stored as root/year/month/day."""

    def subdirs(parent):
        # Sorted so the row order of the CSV does not depend on the OS listing.
        for name in sorted(os.listdir(parent)):
            child = os.path.join(parent, name)
            if os.path.isdir(child):
                yield child

    for year_path in subdirs(root):
        for month_path in subdirs(year_path):
            for day_path in subdirs(month_path):
                for filename in sorted(os.listdir(day_path)):
                    if filename.endswith(".png"):
                        yield os.path.join(day_path, filename), filename


feature_columns = [f'feature_{i}' for i in range(encoded_space_dim)]
rows = []

for file_path, filename in _iter_day_pngs(PATH):
    image = cv2.imread(file_path, cv2.IMREAD_COLOR)
    if image is None:  # the file could not be decoded as an image
        continue

    # Reproduce the training preprocessing: the encoder was fitted on station
    # crops (get_subplot) resized to IMAGE_SIZE, not on resized full frames.
    # NOTE(review): this changes the exported values compared with encoding
    # the whole image -- confirm the crop is the intended input.
    image = get_subplot(image, CROP_RADIUS)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        encoded_images = encoder(image)

    # The file name is stored as the "date" of the sample.
    features = encoded_images.cpu().numpy().flatten().tolist()
    rows.append([filename, STATION_NAME] + features)

# Build the frame once instead of concatenating inside the loop, which copies
# the whole table for every image.
df_estacoes = pd.DataFrame(rows, columns=['date', 'Estação'] + feature_columns)

df_estacoes.to_csv(OUTPUT_FILE, index=False)

In [None]:
# Preview the first rows of the exported feature table.
df_estacoes.head()