In [None]:
import numpy as np

# scikit-learn has no global random state of its own: estimators created
# without an explicit random_state fall back to NumPy's global RNG, so seeding
# NumPy here is what makes the stochastic steps below repeatable.
np.random.seed(0)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def train_and_test_classifier(X_train, y_train, X_test, y_test):
    """Fit a default GradientBoostingClassifier and report its test performance.

    Every sample is flattened to a single feature vector, labels are binarized
    (each value > 0 becomes 1, all other values are kept as they are), the
    classifier is trained on the training split, and a confusion-matrix heatmap
    plus a text classification report are produced for the test split.

    The label arrays passed in are NOT modified: binarization is done on
    copies, so the caller keeps its original label values.

    Args:
        X_train: NumPy array of training samples; the first axis indexes samples.
        y_train: Training labels, one per training sample.
        X_test: NumPy array of test samples; the first axis indexes samples.
        y_test: Test labels, one per test sample.

    Returns:
        None. The results are shown as a plot and printed to stdout.
    """
    from sklearn.metrics import classification_report

    print("Shapes before reshaping: ", X_train.shape, X_test.shape)
    # Collapse all non-sample axes: the classifier expects 2-D input.
    X_train = X_train.reshape(len(X_train), -1)
    X_test = X_test.reshape(len(X_test), -1)
    print("Shapes after reshaping: ", X_train.shape, X_test.shape)

    # Binarize on copies so the caller's label arrays are left untouched
    # (in-place assignment here would silently rewrite notebook-level state).
    y_train = np.array(y_train)
    y_train[y_train > 0] = 1
    y_true = np.array(y_test)
    y_true[y_true > 0] = 1

    # Create a GradientBoostingClassifier object with default hyperparameters
    clf = GradientBoostingClassifier()

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the (already flattened) testing data
    y_pred = clf.predict(X_test)

    # Calculate the confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Class labels shared by the heatmap and the text report
    class_names = ['Negative', 'Positive']

    # Create a heatmap using Seaborn
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)

    # Add labels and title
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title('Confusion Matrix')

    # Show the plot
    plt.show()

    # Build and print the classification report
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)

In [None]:
# from collections import Counter
# from sklearn.datasets import make_classification
# from imblearn.under_sampling import NeighbourhoodCleaningRule 
# X, y = make_classification(n_classes=2, class_sep=2,
#  weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
#  n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
# print('Original dataset shape {}'.format(Counter(y)))

# ncr = NeighbourhoodCleaningRule()
# X_res, y_res = ncr.fit_resample(X, y)
# print('Resampled dataset shape {}'.format(Counter(y_res)))

In [None]:
import pickle

pipeline_id = "A652_N"
filename = "../data/datasets/" + pipeline_id + ".pickle"
print(f"Loading train/val/test datasets from {filename}.")
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# dataset files that were produced by a trusted pipeline.
# The context manager guarantees the file handle is closed after loading.
with open(filename, 'rb') as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)
print(f"Shapes of train/val/test data matrices: {X_train.shape}/{X_val.shape}/{X_test.shape}")

# Train a classifier on the original training set

In [None]:
# Baseline: fit and evaluate on the original (unsampled) training set.
train_and_test_classifier(X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np 

def train_pilot_model(X_train, y_train):
    """Train a "pilot" classifier on a class-balanced random subsample.

    Rows whose label equals 0 are treated as negatives and rows whose label is
    greater than 0 as positives. Both groups are randomly reduced (without
    replacement) to the size of the smaller group, and a default
    GradientBoostingClassifier is fitted on the result.

    Args:
        X_train: Feature array indexed by sample along the first axis.
        y_train: Label array aligned with X_train.

    Returns:
        Tuple (clf, X_balanced, y_balanced): the fitted classifier, the
        balanced features (positives first, then negatives) and the matching
        labels (ones followed by zeros).
    """
    neg_rows = np.where(y_train == 0)[0]
    pos_rows = np.where(y_train > 0)[0]
    print(f"Amounts of neg/pos examples: {len(neg_rows)}/{len(pos_rows)}")

    X_neg = X_train[neg_rows]
    X_pos = X_train[pos_rows]

    # Size of each class after balancing = size of the smaller class.
    n_per_class = min(len(X_pos), len(X_neg))
    print(n_per_class)

    # Draw order (positives, then negatives) is kept fixed because both draws
    # consume NumPy's global random stream.
    pos_pick = np.random.choice(len(X_pos), size=n_per_class, replace=False)
    neg_pick = np.random.choice(len(X_neg), size=n_per_class, replace=False)

    X_balanced = np.concatenate((X_pos[pos_pick], X_neg[neg_pick]))
    y_balanced = np.concatenate((np.ones(n_per_class), np.zeros(n_per_class)))

    assert len(y_balanced) == 2*n_per_class

    # Default-hyperparameter gradient boosting model fitted on the balanced set
    clf = GradientBoostingClassifier()
    clf.fit(X_balanced, y_balanced)

    return clf, X_balanced, y_balanced

def score_negative_examples(clf, X_balanced, y_balanced):
    """Turn the pilot model's scores for the negative rows into sampling weights.

    Args:
        clf: Fitted classifier exposing predict_proba.
        X_balanced: Feature array indexed by sample along the first axis.
        y_balanced: Label array aligned with X_balanced; rows equal to 0 are
            the negatives that get scored.

    Returns:
        1-D array with one weight per negative row, summing to 1.

    NOTE(review): the weights are proportional to column 0 of predict_proba
    (described in this file as the negative-class probability), so rows the
    model is most sure are negative get the largest weight. If "hard"
    negatives are the goal, column 1 may have been intended - confirm.
    """
    is_negative = (y_balanced == 0)
    negatives = X_balanced[is_negative]
    negative_labels = y_balanced[is_negative]

    # First column of predict_proba = predicted probability of class 0.
    p_class0 = clf.predict_proba(negatives)[:, 0]

    # Rescale so the scores form a probability distribution over the negatives.
    y_proba_normalized = p_class0 / np.sum(p_class0)

    print(f"Normalized scores for the first 5 negative examples: {y_proba_normalized[:5]}")
    print(f"Correct labels for the first 5 negative examples: {negative_labels[:5]}")

    return y_proba_normalized

def sample_from_negative_examples(X_balanced, y_balanced, y_proba_normalized):
    """Draw negative rows at random, weighted by the supplied probabilities.

    As many negatives are drawn (without replacement) as there are rows
    labelled 1 in y_balanced.

    Args:
        X_balanced: Feature array indexed by sample along the first axis.
        y_balanced: Label array aligned with X_balanced (0 = negative, 1 = positive).
        y_proba_normalized: Sampling probability for each negative row, in the
            order the negatives appear in X_balanced; must sum to 1.

    Returns:
        Array holding the selected negative rows.
    """
    negatives = X_balanced[y_balanced == 0]
    n_to_draw = len(X_balanced[y_balanced == 1])

    # Sample row positions within `negatives` using the given weights.
    candidate_rows = np.arange(len(negatives))
    chosen_rows = np.random.choice(candidate_rows, size=n_to_draw, replace=False, p=y_proba_normalized)

    return negatives[chosen_rows]

In [None]:
def apply_negative_sampling(X_train, y_train):
    """Build a class-balanced training set via the negative-sampling procedure.

    Steps: flatten the samples, binarize the labels (values > 0 become 1),
    train a pilot model on a balanced subsample, score that subsample's
    negatives with the pilot model, and draw negatives with probability
    proportional to those scores.

    The caller's arrays are not modified: labels are binarized on a copy.

    Args:
        X_train: NumPy array of samples; the first axis indexes samples and
            any number of trailing feature axes is accepted.
        y_train: Labels aligned with X_train.

    Returns:
        Tuple (X_train_sampled, y_train_sampled). The features hold the
        selected positives followed by the selected negatives, restored to the
        per-sample shape of X_train; the labels are ones followed by zeros.

    NOTE(review): train_pilot_model returns equally many positives and
    negatives, and step 3 draws "number of positives" negatives without
    replacement from that same balanced set, so it can only return those same
    negatives in a different order. If the intent is to pick among ALL
    training negatives, steps 2-3 should score the full negative pool - confirm.
    """
    original_shape_X_train = X_train.shape

    X_flat = X_train.reshape(len(X_train), -1)

    # Binarize on a copy; assigning into y_train would rewrite the caller's array.
    y_binary = np.array(y_train)
    y_binary[y_binary > 0] = 1

    ###
    # Now apply the steps of the negative sampling procedure
    ###

    # Step 1: Train "pilot" model
    clf, X_balanced, y_balanced = train_pilot_model(X_flat, y_binary)

    # Step 2: Score the negative examples with the pilot model
    y_proba_normalized = score_negative_examples(clf, X_balanced, y_balanced)

    # Step 3: Sample the negative examples proportionally to the scores
    X_sampled_negative = sample_from_negative_examples(X_balanced, y_balanced, y_proba_normalized)

    positive_examples = X_balanced[y_balanced == 1]
    desired_num_examples = len(positive_examples)
    X_train_sampled = np.concatenate((positive_examples, X_sampled_negative))
    y_train_sampled = np.concatenate((np.ones(desired_num_examples), np.zeros(desired_num_examples)))

    # Restore the per-sample shape of the input, whatever its rank
    # (the previous hard-coded 3-D reshape failed for 2-D or 4-D inputs).
    X_train_sampled = X_train_sampled.reshape((len(X_train_sampled),) + tuple(original_shape_X_train[1:]))

    return X_train_sampled, y_train_sampled

In [None]:
# Build the resampled training set with the negative-sampling procedure.
X_train_sampled, y_train_sampled = apply_negative_sampling(X_train, y_train)

# Show the shapes of the resampled features and labels.
X_train_sampled.shape, y_train_sampled.shape

# Train a classifier on the sampled training set

In [None]:
# Train on the resampled set and evaluate on the same test split as the baseline run.
train_and_test_classifier(X_train_sampled, y_train_sampled, X_test, y_test)

In [None]:
import torch 
def BCELoss_class_weighted(weights):
    """Build a class-weighted binary cross-entropy loss function.

    Args:
        weights: Indexable pair of class weights; weights[0] scales the
            negative-class term and weights[1] the positive-class term.

    Returns:
        A function loss(input, target) returning the mean weighted binary
        cross-entropy as a scalar tensor. `input` holds predicted
        probabilities and `target` the reference values.
    """

    def loss(input, target):
        # Keep probabilities strictly inside (0, 1) so log() stays finite.
        input = torch.clamp(input,min=1e-7,max=1-1e-7)
        bce = - weights[1] * target * torch.log(input) - (1 - target) * weights[0] * torch.log(1 - input)
        return torch.mean(bce)

    return loss

# BCELoss_class_weighted is a factory: it has to be CALLED with the class
# weights to obtain the loss function. Binding the factory itself made
# loss(outputs, targets) fail with a TypeError.
# The equal weights below are neutral placeholder values for this demo.
class_weights = (1.0, 1.0)
loss = BCELoss_class_weighted(class_weights)
# loss = torch.nn.BCELoss(reduction='none')

# Random per-sample weights, predictions and targets for a quick smoke test.
weights = torch.rand(10,1)
outputs = torch.rand(10,1)
targets = torch.rand(10,1)

# With the class-weighted loss this is already a scalar (mean over samples);
# with the commented-out BCELoss(reduction='none') it would be per-sample.
intermediate_losses = loss(outputs, targets)
final_loss = torch.mean(weights*intermediate_losses)

In [None]:
import pandas as pd
# Load the INMET weather-station table and preview its first 30 rows.
df = pd.read_csv("../data/gauge/INMET_WS_Stations.csv")
df.head(30)

In [None]:
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(point1, point2):
    """Return the haversine (great-circle) distance between two points, in km.

    Args:
        point1: (latitude, longitude) pair in degrees.
        point2: (latitude, longitude) pair in degrees.

    Returns:
        Distance in kilometers, using an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    (lat_a, lon_a), (lat_b, lon_b) = point1, point2

    # Half of the latitude / longitude differences, in radians.
    half_dlat = radians(lat_b - lat_a) / 2
    half_dlon = radians(lon_b - lon_a) / 2

    # Haversine of the central angle between the two points.
    hav = sin(half_dlat) ** 2 + cos(radians(lat_a)) * cos(radians(lat_b)) * sin(half_dlon) ** 2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

def distance_matrix(point, points_list):
    """Return the haversine distances from `point` to each entry of `points_list`.

    Args:
        point: Reference point, passed as the first argument to haversine_distance.
        points_list: Iterable of points to measure against.

    Returns:
        List of distances, in the same order as `points_list`.
    """
    return [haversine_distance(point, other) for other in points_list]

# Quick sanity check: distances (km) from one reference city to three others.
point = (52.2296756, 21.0122287)  # Warsaw, Poland
points_list = [(51.5073509, -0.1277583),  # London, UK
               (40.7127281, -74.0060152),  # New York City, USA
               (48.856614, 2.3522219)]    # Paris, France

matrix = distance_matrix(point, points_list)
# NOTE(review): the values quoted on the next line could not be reproduced by
# a hand check (Warsaw-London comes out near 1449 km) - re-run and confirm.
print(matrix)  # output: [1464.580386325907, 6698.945740763303, 1319.1667457437978]


In [None]:
# Reference point for station A652 as (latitude, longitude) in degrees.
A652_Latitude = -22.98833333
A652_Longitude = -43.19055555
A652_point = (A652_Latitude, A652_Longitude)

# Relies on `df` still being the station table loaded from INMET_WS_Stations.csv.
print("Distances from A652 to all other INMET weather stations located in RJ:")
for index, row in df.iterrows():
    # Coordinates are text with a decimal comma; swap it for a dot before float().
    longitude = float(row['VL_LONGITUDE'].replace(',','.'))
    latitude = float(row['VL_LATITUDE'].replace(',','.'))
    ws_name = row['DC_NOME']
    # haversine_distance expects (latitude, longitude) ordering and returns km.
    ws_point = (latitude, longitude)
    dist = haversine_distance(A652_point, ws_point)
    print(f"{dist}\t {ws_name}")

In [None]:
# Reference point labelled SBGL in the printed header, as (latitude, longitude) in degrees.
gig_Latitude = -22.809167
gig_Longitude = -43.250556
gig_point = (gig_Latitude, gig_Longitude)

# Relies on `df` still being the station table loaded from INMET_WS_Stations.csv.
print("Distances from SBGL to INMET weather stations located in RJ:")
for index, row in df.iterrows():
    # Coordinates are text with a decimal comma; swap it for a dot before float().
    longitude = float(row['VL_LONGITUDE'].replace(',','.'))
    latitude = float(row['VL_LATITUDE'].replace(',','.'))
    ws_name = row['DC_NOME']
    # haversine_distance expects (latitude, longitude) ordering and returns km.
    ws_point = (latitude, longitude)
    dist = haversine_distance(gig_point, ws_point)
    print(f"{dist}\t {ws_name}")

In [None]:
# Show only the CD_ESTACAO, DC_NOME and DT_INICIO_OPERACAO columns of the station table.
df[['CD_ESTACAO', 'DC_NOME', 'DT_INICIO_OPERACAO']]

In [None]:
import pandas as pd
# Load the A621 gauge file and summarize its columns, dtypes and non-null counts.
df_A621 = pd.read_csv("../data/gauge/A621_2007_2023.csv")
df_A621.info()

In [None]:
# Shapes of the row subsets where CHUVA is at least 5, 25 and 50 respectively.
df_A621[df_A621.CHUVA>=5].shape, df_A621[df_A621.CHUVA>=25].shape, df_A621[df_A621.CHUVA>=50].shape

In [None]:
# pip install "dask[dataframe]"

In [None]:
import pandas as pd
import xarray as xr

# Convert each yearly ERA5 NetCDF file to a DataFrame and stack them.
# The frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration.
yearly_frames = []
for year in range (1997, 2021):
    print(f"year: {year}")
    filename = "../data/ERA5/RJ_" + str(year) + ".nc"
    # The context manager closes the NetCDF file once its data is in memory.
    with xr.open_dataset(filename) as ds:
        print(f"File {filename} successfuly opened. Size: {ds.sizes['time']}")
        yearly_frames.append(ds.to_dataframe())

df = pd.concat(yearly_frames)

filename = "../data/NWP/ERA5_all_700_1000.parquet.gzip"
print(f"Saving dowloaded data to {filename}")
df.to_parquet(filename, compression='gzip')

In [1]:
import pandas as pd
# Reload the merged parquet file written above and preview the first rows.
filename = "../data/NWP/ERA5_all_700_1000.parquet.gzip"
df = pd.read_parquet(filename)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,z,r,t,u,v
longitude,latitude,level,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-44.0,-22.0,1000,1997-01-01 00:00:00,892.438477,98.881012,298.071045,-0.224454,1.084408
-44.0,-22.0,1000,1997-01-01 01:00:00,942.387695,98.38829,298.07431,-0.312888,1.111478
-44.0,-22.0,1000,1997-01-01 02:00:00,918.625,98.545959,298.012054,-0.312888,1.048879
-44.0,-22.0,1000,1997-01-01 03:00:00,896.317383,98.371864,297.947876,-0.199931,1.027448
-44.0,-22.0,1000,1997-01-01 04:00:00,877.405273,99.202919,297.848022,-0.078798,0.98064


In [3]:
import xarray as xr

# Convert each two-year ERA5 NetCDF file to a DataFrame, tag it with a
# constant pressure level of 200, and stack the results.
# The frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration.
frames_200 = []
for year in range (1997, 2020, 2):
    print(f"years: {year}-{year+1}")
    filename = "../data/ERA5/RJ_" + str(year) + "_" + str(year+1) + "_200.nc"
    # The context manager closes the NetCDF file once its data is in memory.
    with xr.open_dataset(filename) as ds:
        print(f"File {filename} successfuly opened. Size: {ds.sizes['time']}")
        df_period = ds.to_dataframe()

    # Add a new index level called 'level' with a value of 200, placed between
    # latitude and time so the index is (longitude, latitude, level, time).
    new_level = pd.Index([200] * len(df_period.index), name='level')
    new_index = pd.MultiIndex.from_arrays([df_period.index.get_level_values('longitude'),
                                           df_period.index.get_level_values('latitude'),
                                           new_level,
                                           df_period.index.get_level_values('time')])
    df_period.index = new_index
    frames_200.append(df_period)

df = pd.concat(frames_200)

filename = "../data/NWP/ERA5_all_200.parquet.gzip"
print(f"Saving dowloaded data to {filename}")
df.to_parquet(filename, compression='gzip')

years: 1997-1998
File ../data/ERA5/RJ_1997_1998_200.nc successfuly opened. Size: 17520
years: 1999-2000
File ../data/ERA5/RJ_1999_2000_200.nc successfuly opened. Size: 17544
years: 2001-2002
File ../data/ERA5/RJ_2001_2002_200.nc successfuly opened. Size: 17520
years: 2003-2004
File ../data/ERA5/RJ_2003_2004_200.nc successfuly opened. Size: 17544
years: 2005-2006
File ../data/ERA5/RJ_2005_2006_200.nc successfuly opened. Size: 17520
years: 2007-2008
File ../data/ERA5/RJ_2007_2008_200.nc successfuly opened. Size: 17544
years: 2009-2010
File ../data/ERA5/RJ_2009_2010_200.nc successfuly opened. Size: 17520
years: 2011-2012
File ../data/ERA5/RJ_2011_2012_200.nc successfuly opened. Size: 17544
years: 2013-2014
File ../data/ERA5/RJ_2013_2014_200.nc successfuly opened. Size: 17520
years: 2015-2016
File ../data/ERA5/RJ_2015_2016_200.nc successfuly opened. Size: 17544
years: 2017-2018
File ../data/ERA5/RJ_2017_2018_200.nc successfuly opened. Size: 17520
years: 2019-2020
File ../data/ERA5/RJ_2019_

In [4]:
# Reload the merged level-200 parquet file and preview the first rows.
df_200 = pd.read_parquet("../data/NWP/ERA5_all_200.parquet.gzip")
df_200.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,z,r,t,u,v
longitude,latitude,level,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-44.0,-22.0,200,1997-01-01 00:00:00,121961.398438,65.385361,220.217529,4.944927,16.344982
-44.0,-22.0,200,1997-01-01 01:00:00,122028.703125,52.338036,220.431793,6.329298,16.025719
-44.0,-22.0,200,1997-01-01 02:00:00,121997.929688,46.309238,220.444519,7.749165,17.731339
-44.0,-22.0,200,1997-01-01 03:00:00,121927.84375,51.57534,220.212662,7.695263,18.958061
-44.0,-22.0,200,1997-01-01 04:00:00,121850.375,63.906269,219.916153,7.768886,18.994013


In [5]:
# Stack the level-200 rows on top of the rows from the 700/1000 parquet file
# (row-wise concatenation; the level-200 rows come first).
df_700_and_1000 = pd.read_parquet("../data/NWP/ERA5_all_700_1000.parquet.gzip")
df_1997_2020 = pd.concat([df_200, df_700_and_1000])
df_1997_2020.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,z,r,t,u,v
longitude,latitude,level,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-44.0,-22.0,200,1997-01-01 00:00:00,121961.398438,65.385361,220.217529,4.944927,16.344982
-44.0,-22.0,200,1997-01-01 01:00:00,122028.703125,52.338036,220.431793,6.329298,16.025719
-44.0,-22.0,200,1997-01-01 02:00:00,121997.929688,46.309238,220.444519,7.749165,17.731339
-44.0,-22.0,200,1997-01-01 03:00:00,121927.84375,51.57534,220.212662,7.695263,18.958061
-44.0,-22.0,200,1997-01-01 04:00:00,121850.375,63.906269,219.916153,7.768886,18.994013


In [6]:
# Convert the combined DataFrame to an xarray Dataset and display it.
ds_1997_2020 = df_1997_2020.to_xarray()
ds_1997_2020

In [7]:
import xarray as xr
import pandas as pd

def merge_files_from_2021_to_2023():
    """Merge the per-year, per-pressure-level ERA5 files for 2021-2023.

    Each NetCDF file is converted to a DataFrame, given an extra 'level' index
    component holding its pressure level, and stacked with the others. The
    merged table is checked for missing values, saved as a gzip-compressed
    parquet file, and returned as an xarray Dataset.

    Returns:
        xarray Dataset built from the merged DataFrame.

    Raises:
        AssertionError: If the merged table contains missing values. The check
            runs BEFORE the parquet file is written, so invalid data is never
            persisted.
    """
    frames = []
    for year in range (2021, 2023+1):
        for pressure_level in ["200", "700", "1000"]:
            filename = "../data/NWP/ERA5/RJ_" + str(year) + "_" + pressure_level + ".nc"
            # The context manager closes the NetCDF file once its data is in memory.
            with xr.open_dataset(filename) as opened:
                print(f"NetCDF file {filename} successfuly opened. Size: {opened.sizes['time']}")

                ds = opened
                # see https://confluence.ecmwf.int/display/CUSF/ERA5+CDS+requests+which+return+a+mixture+of+ERA5+and+ERA5T+data
                if "expver" in opened.coords:
                    print(">>>Oops! expver dimension found. Going to remove it.<<<")
                    # Prefer expver=1 values and fill their gaps from expver=5.
                    ds = opened.sel(expver=1).combine_first(opened.sel(expver=5))
                    ds.load()

                df = ds.to_dataframe()

            # Add a new component to the multi-index called 'level', holding
            # the pressure level of this file.
            new_component = pd.Index([int(pressure_level)] * len(df.index), name='level')
            new_index = pd.MultiIndex.from_arrays([df.index.get_level_values('longitude'),
                                                   df.index.get_level_values('latitude'),
                                                   new_component,
                                                   df.index.get_level_values('time')])
            df.index = new_index
            frames.append(df)

    # Concatenate once: concatenating inside the loop re-copies all earlier rows.
    df_all = pd.concat(frames)

    # Validate before saving so a file with missing values is never written.
    assert not df_all.isnull().values.any(), "merged ERA5 data contains missing values"

    filename = "../data/NWP/ERA5/ERA5_all.parquet.gzip"
    print(f"Saving dowloaded data to {filename}")
    df_all.to_parquet(filename, compression='gzip')
    return df_all.to_xarray()
        
# Build the 2021-2023 dataset (this also writes the merged parquet file).
ds_2021_2023 = merge_files_from_2021_to_2023()

NetCDF file ../data/NWP/ERA5/RJ_2021_200.nc successfuly opened. Size: 8760
NetCDF file ../data/NWP/ERA5/RJ_2021_700.nc successfuly opened. Size: 8760
NetCDF file ../data/NWP/ERA5/RJ_2021_1000.nc successfuly opened. Size: 8760
NetCDF file ../data/NWP/ERA5/RJ_2022_200.nc successfuly opened. Size: 8760
NetCDF file ../data/NWP/ERA5/RJ_2022_700.nc successfuly opened. Size: 8760
NetCDF file ../data/NWP/ERA5/RJ_2022_1000.nc successfuly opened. Size: 8760
NetCDF file ../data/NWP/ERA5/RJ_2023_200.nc successfuly opened. Size: 2706
>>>Oops! expver dimension found. Going to remove it.<<<
NetCDF file ../data/NWP/ERA5/RJ_2023_700.nc successfuly opened. Size: 2706
>>>Oops! expver dimension found. Going to remove it.<<<
NetCDF file ../data/NWP/ERA5/RJ_2023_1000.nc successfuly opened. Size: 2707
>>>Oops! expver dimension found. Going to remove it.<<<
Saving dowloaded data to ../data/NWP/ERA5/ERA5_all.parquet.gzip


In [8]:
# Display the 2021-2023 dataset.
ds_2021_2023

In [9]:
# Merge the 1997-2020 and 2021-2023 datasets into a single dataset.
ds_1997_2023 = ds_1997_2020.merge(ds_2021_2023)

In [10]:
# Display the merged 1997-2023 dataset.
ds_1997_2023

In [11]:
# Persist the merged dataset as a single NetCDF file.
filename = "../data/NWP/ERA5_1997_2023.nc"
ds_1997_2023.to_netcdf(filename)

In [None]:
# Re-open the merged NetCDF file written above.
filename = "../data/NWP/ERA5_1997_2023.nc"
ds = xr.open_dataset(filename)

In [None]:
# Load the preprocessed A652 ERA5 table and show the smallest and largest
# index values (presumably the covered time span - confirm the index is temporal).
df_nwp_era5 = pd.read_parquet('../data/NWP/ERA5_A652_1997_2023_preprocessed.parquet.gzip')
min(df_nwp_era5.index), max(df_nwp_era5.index)