In [1]:
#!pip uninstall visualkeras

In [2]:
#!pip install scikit-learn

In [3]:
#!pip install tensorflow-gpu

In [4]:
#!pip install pandas

In [5]:
#!pip install wandb

In [6]:
#!pip install numpy

In [1]:
#Manejo de Datos
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#Machine learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.callbacks import Callback

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

#Librerias estandar (Extras)
import re
import os
import time
import random

In [6]:
x = [[1,1,1],[2,2,2],[3,3,3]]
y = [[3,3,3],[4,4,4],[5,5,5]]
#x.shape

In [7]:
z = np.mean( np.array([ x, y ]), axis=0 )
z

array([[2., 2., 2.],
       [3., 3., 3.],
       [4., 4., 4.]])

In [8]:
!python --version
print(tf. __version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Python 3.9.11
2.8.0
Num GPUs Available:  1


In [9]:
#conda list cudnn

In [10]:
#conda list cudatoolkit

In [11]:
"""
Metodos para realizar el entrenamient - evaluacion del modelo
"""

'\nMetodos para realizar el entrenamient - evaluacion del modelo\n'

In [12]:

#Lee el archivo "filename" de datos de precipitacion y
#regresa un df que facilite la lectura del dataset para el entrenmaiento
def obtenerDatos(filename):
    start_time = time.time()
    pdata = pd.read_csv(filename)
    
    # Quitamos los valores NA
    pdata = pdata[pdata['dato'].notna()]

    # Definimos un solo tipo (str) pora asi poder convertirlo a tensor
    pdata = pdata.astype({"dato": str, "XO": str, "XA": str, "fecha": str})

    #Definimos la nueva columna para guardar el XO, XA y fecha
    pdata['imagen'] = pdata.apply(obtenerDir, axis=1)

    # Seleccionamos solo las columnas necesarias :
    # precipitacion, Estacion (Longitud), Estacion (Latitud), Fecha (año-mes-dia-hora)
    #pdataX = pdata.loc[:, ['dato','umbral','altura', 'imagen', 'fecha']]
    pdata = pdata.astype({"dato": str, "umbral": str, "altura": str, "imagen": str, "fecha": str})

    # Barajeamos los datos
    pdata = shuffle(pdata)

    print(f'{len(pdata)} datos leidos')
    print("Tiempo tomado en leer datos: %.2fs" % (time.time() - start_time))
    return pdata

In [13]:

# Devuelve una lista con lo indices que no se encontraron lso archivos y el producto
# Servira para ver si se teinen todas los frames de la fecha
def comprobarFrames(dfOrignial, path_base, products, times, delete=1):
    
    #dfOrignial = obtenerDatos(datafile)
    
    
    start_time = time.time()
    
    dfTotal = pd.unique(dfOrignial['fecha'])
    no_fecha = []
    for fecha in dfTotal:
        year, month, day, hour = fecha.split('-')
        existe = True
        for p in products:
            for t in range(len(times)):             
                filename = f'{path_base}PNG/{fecha}/{fecha}_{t}.png'
                try:                    
                    file_size = os.path.getsize(filename)
                    existe = file_size > 4100000
                except: 
                    existe = False
                    break
                
            if not existe:
                break
        if not existe:
            no_fecha.append(fecha)
            

    if delete:
        antes = len(dfOrignial)
        df2 = dfOrignial[~dfOrignial['fecha'].isin(no_fecha)]
        despues = len(df2)
        print(f'{antes - despues}/{antes} datos eliminados: No se encontraron los archivos de imagenes satelitales')
    else:
        df2 = dfOrignial

    print("Tiempo tomado en verificar datos: %.2fs" % (time.time() - start_time))
    return df2, no_fecha

In [14]:
#Del dataset guardamos los datos mas importantes en una columna para facilitar su lectura
def obtenerDir(row):
    fecha = row['fecha']

    year, month, day, hour = fecha.split('-')
    # filename = f'{path_base}comprimido/{year}_{month}_{day}/{hour}/'
    return f"{row['XO']}--{row['XA']}--{fecha}"

In [15]:
def limpiarDatos(listNames, path_imagenes, products, times ,delete=1):
    df = []
    start_time = time.time()
    print(f'Se leera los archivos de datasets...')
    for name in listNames:
        try:
            df.append(pd.read_csv(name))   
        except:
            print(f'No se pudo leer el archivo {name} de dataset')
            return -1
           
    dsCompleto =  pd.concat(df, ignore_index=True) 
    print("Tiempo tomado: %.2fs" % (time.time() - start_time))
    print(f'+Cantidad de datos leidos {len(dsCompleto)}')
    
    # Quitamos los NA valores
    print(f'Se elimnara los valores nulos')
    dsCompleto.dropna(subset=['dato'], axis='index', inplace=True)    
    dsCompleto = dsCompleto[dsCompleto['flag']!='ND']
    print("Tiempo tomado: %.2fs" % (time.time() - start_time))
    print(f'+Cantidad de datos luego de elimnar nulos {len(dsCompleto)}')
    
    # Buscamos imagenes satelitales para lso archivos
    print(f'Se buscara las imagenes satelitales para los datos...')
    dfImagenes, no_fecha = comprobarFrames(dsCompleto, path_imagenes, products, times, delete)    
    print("Tiempo tomado: %.2fs" % (time.time() - start_time))
       
    
    #Agregamos lso datos de las estaciones al dataset
    print(f'Se agregara los datos de las estaciones(cordenadas, umbral)...')
    dfImagenes['imagen'] = dfImagenes.apply(obtenerDir, axis=1)    
    print("Tiempo tomado: %.2fs" % (time.time() - start_time))
    print(f'+Cantidad Final de datos total {len(dfImagenes)}')  
    return shuffle(dfImagenes), no_fecha

In [16]:
def crearModelo2D(p,run):    
    print(f"Creadno modelo con input ({p['margen'][run]},{p['margen'][run]},{p['canales'][run]})) tipo ({p['outputs']})")
    # Imagen
    input_1 = tf.keras.layers.Input(shape=(p['margen'][run],p['margen'][run],p['canales'][run]))
    
    # first conv layer :
    conv2d_1 = tf.keras.layers.Conv2D(64, kernel_size=3,activation=tf.keras.activations.relu)(input_1)

    # Second conv layer :
    conv2d_2 = tf.keras.layers.Conv2D(32, kernel_size=3,activation=tf.keras.activations.relu)(conv2d_1)
    
    # Flatten layer :
    flatten = tf.keras.layers.Flatten()(conv2d_2)
    
    final = flatten
    listConcat = [flatten]
    listInputs = [input_1]
    
    if len(p['inputs'])>2:
        #Agregamos los otros atrbutos        
        for attr in p['inputs'][1:]:
            # The other input
            input_x = tf.keras.layers.Input(shape=(1,))
            listConcat.append(input_x)
            listInputs.append(input_x)

            
        # Concatenate
        final = tf.keras.layers.Concatenate()(listConcat)
        
            
        
    # output
    if p['outputs'] == 'dato':
        output = tf.keras.layers.Dense(units=1, activation=tf.keras.activations.softmax)(final)
        dimOutput = 1
    elif p['outputs'] == 'umbral':
        output = tf.keras.layers.Dense(units=2, activation=tf.keras.activations.relu)(final)
        dimOutput = 2
    else:
        print(f"No se pudo crear el modelo outputs no esta bien definido {p['outputs']}")
        return -1      
    

    full_model = tf.keras.Model(inputs=listInputs, outputs=[output])
    
    print('DONE')
    
    #print(full_model.summary())
    return full_model

In [17]:
def crearModelo3D(p,run):    
    print(f"Creando modelo con input ({p['tiempos'][run]},{p['margen'][run]},{p['margen'][run]},{p['canales'][run]})) y ({p['outputs']})...")
    # Imagen
    input_1 = tf.keras.layers.Input(shape=(p['tiempos'][run],p['margen'][run],p['margen'][run],p['canales'][run]))
    
    # first conv layer :
    conv3d_1 = tf.keras.layers.Conv3D(64, kernel_size=3,activation=tf.keras.activations.relu)(input_1)

    # Second conv layer :
    conv3d_2 = tf.keras.layers.Conv3D(32, kernel_size=3,activation=tf.keras.activations.relu)(conv3d_1)
    
    # Flatten layer :
    flatten = tf.keras.layers.Flatten()(conv3d_2)
    
    final = flatten
    listConcat = [flatten]
    listInputs = [input_1]
    
    if len(p['inputs'])>2:
        #Agregamos los otros atrbutos        
        for attr in p['inputs'][1:]:
            # The other input            
            input_x = tf.keras.layers.Input(shape=(1,))
            listConcat.append(input_x)
            listInputs.append(input_x)
            
        # Concatenate
        final = tf.keras.layers.Concatenate()(listConcat)
        
        
    # output
    if p['outputs'] == 'dato':
        output = tf.keras.layers.Dense(units=1, activation=tf.keras.activations.softmax)(final)
        dimOutput = 1
    elif p['outputs'] == 'umbral':
        output = tf.keras.layers.Dense(units=2, activation=tf.keras.activations.relu)(final)
        dimOutput = 2
    else:
        print(f"No se pudo crear el modelo outputs no esta bien definido {p['outputs']}")
        return -1          

    full_model = tf.keras.Model(inputs=listInputs, outputs=[output])
    
    
    print('DONE')
    #print(full_model.summary())
    return full_model

In [18]:
def crearModelo(params,run):    
    if params['tiempos'][run] == 1:
        #Se crea un modelo conv2D
        return crearModelo2D(params,run)         
    else:
        #Se crea un modelo conv3D
        return crearModelo3D(params,run)       

In [19]:
def splitDataset(p, run, dataset, path_imagenes, products, times,val_split= 0.2):
    #Dataset de etnrenamiento
    train, test = train_test_split(dataset, test_size=val_split, shuffle=True)
    print(f'Tamaño del dataset: Train {len(train)}  - Val {len(test)}')
    
    inputsList = {}
    for inp in p['inputs']:
        inputsList[inp] = train[inp].tolist()
        
       
        
    train_dataset = tf.data.Dataset.from_tensor_slices(((inputsList),train[p['outputs']].tolist()))           
    val_dataset = tf.data.Dataset.from_tensor_slices(((inputsList),train[p['outputs']].tolist()))     
    
    train_dataset = train_dataset.map(lambda x ,y : read_png_file(x,y,p,run,path_imagenes,products,times))
    val_dataset = val_dataset.map(lambda x ,y : read_png_file(x,y,p,run,path_imagenes,products,times))
    
    train_dataset = train_dataset.batch(p['batch']).cache().prefetch(tf.data.AUTOTUNE)
    val_dataset = val_dataset.batch(p['batch']).prefetch(tf.data.AUTOTUNE)  
    
    return train_dataset, val_dataset

In [20]:
#Transformamos un filename tensor en una imagen
def read_png_file(item, value, p,run, path_base, products, times):
    # imagenData[0] = XO     # imagenData[1] = XA     # imagenData[2] = Fecha
    imagenData = tf.strings.split(item['imagen'], sep='--')
    size = int(p['margen'][run] / 2)

    timeJoin = []
    for j in range(p['tiempos'][run]-1,-1,-1):
        filename = path_base + 'PNG/' + imagenData[2] + '/' + imagenData[2] + '_' + str(j) + '.png'
        
        image_string = tf.io.read_file(filename)

        img_decoded = tf.io.decode_png(image_string, dtype=tf.uint16, channels=3)
        #print(img_decoded.shape)
                
        timeJoin.insert(0,img_decoded[int(imagenData[1]) - size:int(imagenData[1]) + size,
                                      int(imagenData[0]) - size:int(imagenData[0]) + size,
                                      0:p['canales'][run]])
    #return timeJoin
        
    if p['tiempos'][run]==1:
        imagenData = tf.reshape(timeJoin[0],(p['margen'][run],p['margen'][run],p['canales'][run]))
    else:
        img = tf.stack(timeJoin, axis=0)
        imagenData = tf.reshape(img,(p['tiempos'][run],p['margen'][run],p['margen'][run],p['canales'][run]))
        
    
    
    if len(p['inputs']) == 1:
        return imagenData, int(value)
    
    item['imagen'] = imagenData
    itemL = []
    for inpL in p['inputs']:
        itemL.append(item[inpL])
    
    return tuple(itemL), int(value)

In [21]:
def getMetrics(modelType, lr):
    
    if modelType == 'umbral':
        optimizer = keras.optimizers.RMSprop(learning_rate=1e-3)
        loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        train_acc_metric = keras.metrics.SparseCategoricalCrossentropy()
        val_acc_metric = keras.metrics.SparseCategoricalCrossentropy()
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_acc", patience=10, mode="max")  
        checkpoint = tf.keras.callbacks.ModelCheckpoint("model-epoch:{epoch:02d}-loss:{loss:.3f}-val_acc:{val_acc:.3f}-weights",
                                                        monitor="val_acc", mode="max")
        metrics = ['acc']
        

    if modelType == 'dato':
        optimizer = keras.optimizers.RMSprop(learning_rate=1e-3)
        loss_fn=keras.losses.MeanSquaredError()
        train_acc_metric = keras.metrics.MeanSquaredError()
        val_acc_metric = keras.metrics.MeanSquaredError()
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_mean_squared_error", patience=10, mode="max")  
        checkpoint = tf.keras.callbacks.ModelCheckpoint("model-epoch:{epoch:02d}-loss:{loss:.3f}-val_acc:{val_mean_squared_error:.3f}-weights",
                                                        monitor="val_mean_squared_error", mode="max")
        metrics = [tf.keras.metrics.MeanSquaredError()]
        
        
    else:
        print('No se pudo crear las metricas')
        return -1
        
        
    logs = Callback()
    callbacks = [checkpoint, early_stopping,logs]                     
        
    metrics = {'optimizer': optimizer, 'loss_fn':loss_fn,'train_acc_metric': train_acc_metric,
               'val_acc_metric': val_acc_metric, 'metrics': metrics,'callbacks': callbacks}
    
    return metrics
        

In [148]:
def trainModel(params,dataset,path_imagenes, path_base ,products, times, val_split=0.2):
        
    config = dict(learning_rate=params['lr'], epochs = params['epocas'],
                     batch_size =params['batch'],architecture="CNN",)
    
    resultados = []
    for run in range(params['runs']):
        history = {'loss':[],'val_loss':[],'acc':[],'val_acc':[]}        
        #wandb.init(project='Tesis-DiegoJN', config=config, name= f"Experimetno_{run}")
        
        #Metricas y parametros de entrenaiento
        optimizer, loss_fn, train_acc_metric, val_acc_metric, metrics = getMetrics(params['outputs'], params['lr'])
        
                          
        #Modelo 
        model = crearModelo(params,run)        
        model.compile(optimizer=optimizer,loss=loss_fn,metrics=metrics,)
    
        #Dataset        
        train_dataset, val_dataset = splitDataset(params,run, dataset, path_imagenes, products, times, val_split)
                
        
        print(f'Inicio de la prueba N°: {run}/{params["runs"]}')        
        print(f'- Cantidad de dataset: Train = {len(train_dataset)} - Val = {len(val_dataset)} ')
        print(f'- Numero batch:  {params["batch"]}')
    
        

        
        checkpoint_path = f'{path_base}/Modelos/{run}_{params["outputs"]}_Model.epoch{params["epocas"]:02d}.hdf5'        
        

        # Create a callback that saves the model's weights
        cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True,
                                                         verbose=1)

        #Entrenamos
        history = model.fit(train_dataset,batch_size=params['batch'],
                            epochs=params['epocas'],callbacks=[logs,cp_callback],
                            validation_data=val_dataset,
                            validation_batch_size=params['batch'],)
               
               
        
        #wandb.finish()
        resultados.append(history.history)
        """       
        history['Product'] = products
        history['Time'] = times
        history['Margen'] = margen   
        
        #wandb.log({'epochs': epoch,
        #           'loss': np.mean(train_loss),
        #           'acc': float(train_acc),
        #           'val_loss': np.mean(val_loss),
        #           'val_acc': float(val_acc)})
        """
    return resultados

In [23]:
"""
Variables generales
"""

'\nVariables generales\n'

In [24]:
#Variables generales
path_base = 'C:/Users/Shounen/Desktop/Ciclo XI/Tesis 2/GPUTesis'
FAnalisis = f'{path_base}/Reportes/analisis_2020.csv'
path_imagenes = 'F:/GOES/'
products = ['C07','C08','C13']
times  = ['10','20','30','40','50','00']
listDataset = [f'{path_base}/Dataset/datasetCompleto_2020.csv',
               f'{path_base}/Dataset/datasetCompleto_2021.csv',]

In [25]:
"""
Leemos el dataset completo
"""

'\nLeemos el dataset completo\n'

In [26]:
%%time
#Vamos a unir los dos datasets y limpiarlos
dsCompleto, no_fecha = limpiarDatos(listDataset, path_imagenes, products, times ,1)

#Agregamos el umbral al dataset
dsCompleto['umbral'] = (dsCompleto['dato']>=dsCompleto['99%']).astype(int)
dsCompleto.head(2)

Se leera los archivos de datasets...
Tiempo tomado: 2.72s
+Cantidad de datos leidos 2688688
Se elimnara los valores nulos
Tiempo tomado: 3.17s
+Cantidad de datos luego de elimnar nulos 2491192
Se buscara las imagenes satelitales para los datos...
421712/2491192 datos eliminados: No se encontraron los archivos de imagenes satelitales
Tiempo tomado en verificar datos: 5.35s
Tiempo tomado: 8.52s
Se agregara los datos de las estaciones(cordenadas, umbral)...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfImagenes['imagen'] = dfImagenes.apply(obtenerDir, axis=1)


Tiempo tomado: 34.09s
+Cantidad Final de datos total 2069480
CPU times: total: 34.6 s
Wall time: 34.7 s


Unnamed: 0,nombre,codigo,XO,XA,longitud,latitud,altura,dato,90%,99%,75%,fecha,flag,imagen,umbral
250092,CORDOVA GORE,4728F216,465,855,-75.16667,-14.03333,3181.0,0.0,0.0,1.4,0.0,2020-12-19-05,C0000001,465--855--2020-12-19-05,0
2423426,SANTIAGO DE TUNA,472CA750,390,742,-76.52415,-11.98311,2924.0,0.0,0.0,0.8,0.0,2021-10-25-10,C0000001,390--742--2021-10-25-10,0


In [27]:
import wandb
from wandb.keras import WandbCallback

#wandb.login()

In [28]:
"""
Realizamos los bucles
"""

'\nRealizamos los bucles\n'

In [29]:
#Separamos para los tests
dataset = dsCompleto
dataset = dataset[0:1000]

In [151]:
"""
# Definimos las varibles para las iteraciones
Los parametros que van a cambiar son:
- Canales (products)
- Tiempos (Min de las imagenes)
- margen
"""

params = {'inputs' : ['imagen', '99%','altura'],
          'outputs': 'umbral',  #umbral o dato
          'lr'     : 0.01,
          'batch'  : 64,
          'epocas' : 1,          
          'canales': [3,2,3,1,2,3],
          'tiempos': [6,1,1,6,6,6],
          'margen' : [110,110,110,110,110,110],
          'runs'   : 1
         }

In [152]:
%%time
resultados = trainModel(params,dataset,path_imagenes,path_base,products,times)

Creando modelo con input (6,110,110,3)) y (umbral)...
DONE
Tamaño del dataset: Train 800  - Val 200
Inicio de la prueba N°: 0/1
- Cantidad de dataset: Train = 13 - Val = 13 
- Numero batch:  64
Epoch 1: saving model to C:/Users/Shounen/Desktop/Ciclo XI/Tesis 2/GPUTesis/Modelos\0_umbral_Model.epoch01.hdf5
CPU times: total: 9min 58s
Wall time: 1min 16s


In [144]:
modeloTest = crearModelo3D(params,0)

Creando modelo con input (6,110,110,3)) y (umbral)...
DONE


In [145]:
fileModelo = f'{path_base}/Modelos/0_Model.epoch64.hdf5'        
modeloTest.load_weights(fileModelo)


In [146]:
#imgX = np.random.randint(2000, size=(6,110,110,3)).tolist()
imgX = tf.data.Dataset.from_tensor_slices(np.random.randint(10, size=(2,6,110,110,3)).tolist())
umbX  = tf.data.Dataset.from_tensor_slices([0.8,0.8])
altX = tf.data.Dataset.from_tensor_slices([1.5,0.8])
zipped_input = tf.data.Dataset.zip(((imgX, umbX, altX), )).batch(1)
zipped_input

<BatchDataset element_spec=((TensorSpec(shape=(None, 6, 110, 110, 3), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.float32, name=None)),)>

In [147]:
for xS in zipped_input.take(1):
    #print(xS)
    ASd = modeloTest.predict(xS)
    print(ASd)

[[0. 0.]]
