In [1]:
import math
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from numpy import sin, cos, arccos, pi, round

from sklearn.impute import KNNImputer

from sklearn import neighbors
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.metrics import mean_squared_error, mean_absolute_error

### 1° Leitura e tratamento inicial dos dados

Lendo a CSV e removendo todos os dados com valores de Tp_est iguais a zero

Resultados:
- Base de dados raw para treinamento (*df_raw_train*)
- Base de dados raw para validação (*df_raw_validation*)
- Base de dados raw para teste (*df_raw_test*)

In [2]:
df_raw_data = pd.read_csv('/content/drive/MyDrive/BCC - UFPR/Semestres /9 - 2023-2/Aprendizado de Maquina/Lab2/Dados_Radar_Estacao_Completo_2018_2022.csv')
df_raw_data.drop(df_raw_data[df_raw_data['Tp_est'] == 0.0].index, inplace=True);

Remoção inicial de colunas da base geral (2018 a 2022). A colunas removidas foram:
- Unnamed: 0 pois é a coluna de ids
- latitude e longitude pois possuem a mesma informação que as colunas lat e lon;
- distancia: Não agrega valor ao modelo.


In [3]:
df_raw_data.drop(['Unnamed: 0', 'latitude', 'longitude', 'distancia'], axis=1, inplace=True)

Separando o raw data entre treinamento e teste. Anos de 2018 a 2021 para treinamento e 2022 para teste.


In [4]:
raw_train_test_group = df_raw_data.groupby(df_raw_data['time'].str.contains('2022'))

df_raw_train = raw_train_test_group.get_group(False).copy()
df_raw_test = raw_train_test_group.get_group(True).copy()

Separando os dados de treinamento, selecionando uma porção dos dados para validação

In [5]:
dates = ['2018-01', '2018-02']
# dates = ['2018-01']

raw_train_validation_group = df_raw_train.groupby(df_raw_train['time'].str.contains('|'.join(dates)))
df_raw_train = raw_train_validation_group.get_group(False).copy()
df_raw_validation = raw_train_validation_group.get_group(True).copy()


Remoção das colunas elevation e sweep pois não possuiam valor agregado na base

Para verificar que as colunas elevation e sweep não possuiam valor agregado, foi utilizado a função describe do pandas. Com esta função, foi possível verificar que ambas possuiam média, minimo e maximo identicos, além de um desvio padrão igual a 0, ou seja, todas as linhas possuiam o mesmo valor.

In [6]:
print(df_raw_train.describe()['elevation'], end="\n\n")
print(df_raw_train.describe()['sweep'])

df_raw_train.drop(['elevation', 'sweep'], axis=1, inplace=True)
df_raw_validation.drop(['elevation', 'sweep'], axis=1, inplace=True)
df_raw_test.drop(['elevation', 'sweep'], axis=1, inplace=True)

count    83083.0
mean         0.5
std          0.0
min          0.5
25%          0.5
50%          0.5
75%          0.5
max          0.5
Name: elevation, dtype: float64

count    83083.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: sweep, dtype: float64


### Clusterização por estações


In [7]:
df_train_grouped = df_raw_train.groupby(['Est'])
df_lat_lon = pd.DataFrame(columns=['est', 'lat', 'lon'])
for est in df_train_grouped.groups.keys():
    group = df_train_grouped.get_group(est)
    list_lat_lon = [est, group['lat'].mean(), group['lon'].mean()]
    df_lat_lon = pd.concat([pd.DataFrame([list_lat_lon], columns=df_lat_lon.columns), df_lat_lon], ignore_index=True)

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
for data in df_lat_lon.iterrows():
    ax.scatter(data[1][1], data[1][2], label=f'{data[1][0]} - {data[0]}')

colormap = plt.cm.gist_ncar
colorst = [colormap(i) for i in np.linspace(0.1, 0.9,len(ax.collections))]
for t, j1 in enumerate(ax.collections):
    j1.set_color(colorst[t])

for data in df_lat_lon.iterrows():
    ax.annotate(data[0], (data[1][1], data[1][2]))

ax.legend(bbox_to_anchor=(1, 1), bbox_transform=ax.transAxes, fontsize='small')
plt.show()

In [19]:
clusters = [
    ['Pato_Branco', 'Laranjeiras_do_Sul', 'Segredo', 'Derivacao_do_Rio_Jordao', 'Coronel_Domingos_Soares', 'Solais_Novo'],
    ['Cascavel', 'Baixo_Iguacu', 'Salto_Caxias', 'Reservatorio_Salto_Caxias', 'Boa_Vista_da_Aparecida', 'Porto_Santo_Antonio', 'Aguas_do_Vere', 'Bela_Vista_Jusante'],
    ['Foz_do_Iguacu_-_Itaipu', 'Santa_Helena', 'Guaira', 'Palotina', 'Toledo', 'Assis_Chateaubriand', 'Altonia'],
    ['Loanda', 'Paranavai', 'Campo_Mourao', 'Umuarama', 'Ubirata', 'Porto_Formosa']
]

dict_train = {}
dict_input_values = {}

df_train_grouped = df_raw_train.groupby(['Est'])
for cluster in clusters:
    group = df_train_grouped.get_group(cluster[0]).copy()
    for index in range(1, len(cluster)):
        group = pd.concat([df_train_grouped.get_group(cluster[index]), group], ignore_index=True)
    group.drop(['Est', 'time'], axis=1, inplace=True)

    label = ' - '.join(map(str, cluster))
    dict_train[label] = group
    dict_input_values[label] = group.dropna()

In [31]:
for cluster in dict_input_values:
    input_values = []
    columns = list(dict_input_values[cluster].head())
    for row in dict_input_values[cluster].iterrows():
        row_info = {}
        for col in columns:
            row_info[col] = row[1][col]

        input_values.append(row_info)

    input_values = sorted(input_values, key=lambda d: d["Tp_est"])
    dict_input_values[cluster] = input_values

### 3° KNN Imputation na base de treinamento

Resultados:
- Tratamento do dict_train para cada estação, preenchendo todos os dados NaN da base


In [23]:
imputer = KNNImputer(n_neighbors=2)

for cluster in dict_train:
    dict_train[cluster] = imputer.fit_transform(dict_train[cluster])

### Removendo outliers


In [None]:
est_data = {}
for est in dict_train:
    data = { '0 - 2.4': 0, '2.5 - 7.4': 0, '7.5 - 12.4': 0, '12.5 - 17.4': 0, '17.5 - 22.4': 0, '22.5 - 27.4': 0, '27.5 - 32.4': 0,  '32.4 - 37.5': 0}
    for row in dict_train[est]:
        if float(row[-1]) < 2.4:
            data['0 - 2.4'] += 1
        elif float(row[-1]) < 7.4:
            data['2.5 - 7.4'] += 1
        elif float(row[-1]) < 12.4:
            data['7.5 - 12.4'] += 1
        elif float(row[-1]) < 17.4:
            data['12.5 - 17.4'] += 1
        elif float(row[-1]) < 22.4:
            data['17.5 - 22.4'] += 1
        elif float(row[-1]) < 27.4:
            data['22.5 - 27.4'] += 1
        elif float(row[-1]) < 32.4:
            data['27.5 - 32.4'] += 1
        else:
            data['32.4 - 37.5'] += 1
    est_data[est] = data

est_data

In [27]:
new_dict_train = {}
for est in dict_train:
    indexes = []
    for index, row in enumerate(dict_train[est]):
        if row[-1] >= 12.4:
            indexes.append(index)
    temp = np.delete(dict_train[est], indexes, 0)
    new_dict_train[est] = temp

dict_train = new_dict_train

### 4° Preparação da base de validação
Realizar a imputaçao de dados na base de validação

Resultados:
- Base de validação com todas as linhas com valores preenchidos (*df_validation*)

In [29]:
df_validation = df_raw_validation.copy()

Realizando o input na base de dados de validação

In [32]:
def get_fields_to_input(row, columns):
    fields = []
    for col in columns:
        if str(row[col]) == 'nan':
            fields.append(col)
    return fields

def get_group(est, clusters):
    for cluster in clusters:
        if est in cluster: return clusters[cluster]

def get_data(tp_est, input_values):
    data = None
    min_distance = math.inf
    for value in input_values:
        distance = abs(tp_est - value["Tp_est"])
        if distance < min_distance:
            min_distance = distance
            data = value

    return data

columns = list(df_validation.head())
for row in df_validation.iterrows():

    fields_to_input = get_fields_to_input(row[1], columns)
    if len(fields_to_input) > 0:
        input_values = get_group(row[1]['Est'], dict_input_values)
        input_data = get_data(row[1]["Tp_est"], input_values)
        for field in fields_to_input:
            if str(row[1][field]) == 'nan':
                df_validation.loc[row[0], field] = input_data[col]

df_validation.drop(['Est', 'time'], axis=1, inplace=True)

### Validaçao dos modelos de regressão

In [33]:
df_validation_for_train = df_validation.copy()

In [34]:
def get_model(model):
    match model:
        case 'knn':
            return neighbors.KNeighborsRegressor(n_neighbors=100)
        case 'svr':
            return svm.SVR()
        case 'linear_regression':
            return linear_model.LinearRegression()
        case 'tree_regression':
            return tree.DecisionTreeRegressor()

models = ['knn', 'svr', 'linear_regression', 'tree_regression']
est_preds = {}

validate_list = np.array(df_validation_for_train.values.tolist())
x_validate = validate_list[:, :-1]
y_validate = validate_list[:, -1]

for est in dict_train:
    train_list = dict_train[est]
    x_train = train_list[:, :-1]
    y_train = train_list[:, -1]

    preds = {}
    for model_type in models:
        model = get_model(model_type)
        model.fit(x_train, y_train)

        pred = model.predict(x_validate)
        preds[model_type] = pred

    est_preds[est] = preds

In [35]:
errors = {}
for est in est_preds:
    est_errors = {}
    for model in est_preds[est]:
        mse = mean_squared_error(y_validate, est_preds[est][model])
        mae = mean_absolute_error(y_validate, est_preds[est][model])
        est_errors[model] = { 'mse': mse, 'mae': mae }
    errors[est] = est_errors

In [None]:
errors

In [36]:
df_comparison = pd.DataFrame(columns=['Estação', 'Melhor Modelo MSE', 'MSE', 'Melhor Modelo MAE', 'MAE'])

for est in errors:
    min_mse = None
    min_mse_value = math.inf
    min_mae = None
    min_mae_value = math.inf

    for model in errors[est]:
        if errors[est][model]['mse'] < min_mse_value:
            min_mse = model
            min_mse_value = errors[est][model]['mse']
        if errors[est][model]['mae'] < min_mae_value:
            min_mae = model
            min_mae_value = errors[est][model]['mae']
    new_row = [est, min_mse, min_mse_value, min_mae, min_mae_value]
    df_comparison.loc[len(df_comparison)] = new_row

df_comparison

Unnamed: 0,Estação,Melhor Modelo MSE,MSE,Melhor Modelo MAE,MAE
0,Pato_Branco - Laranjeiras_do_Sul - Segredo - D...,knn,6.049195,svr,1.062851
1,Cascavel - Baixo_Iguacu - Salto_Caxias - Reser...,knn,6.113812,svr,1.036819
2,Foz_do_Iguacu_-_Itaipu - Santa_Helena - Guaira...,knn,5.969253,svr,1.00879
3,Loanda - Paranavai - Campo_Mourao - Umuarama -...,knn,6.034171,svr,1.0969


### Preparação da base de teste

In [None]:
df_test = df_raw_test.copy()

In [None]:
def get_fields_to_input(row, columns):
    fields = []
    for col in columns:
        if str(row[col]) == 'nan':
            fields.append(col)
    return fields

def get_data(tp_est, input_values):
    data = None
    min_distance = math.inf
    for value in input_values:
        distance = abs(tp_est - value["Tp_est"])
        if distance < min_distance:
            min_distance = distance
            data = value

    return data

columns = list(df_test.head())
for row in df_test.iterrows():

    fields_to_input = get_fields_to_input(row[1], columns)
    if len(fields_to_input) > 0:
        input_data = get_data(row[1]["Tp_est"], input_values)
        for field in fields_to_input:
            if str(row[1][field]) == 'nan':
                df_test.loc[row[0], field] = input_data[col]

In [None]:
df_test.drop(['time','Est'], axis=1, inplace=True)

In [None]:
def get_model(model):
    match model:
        case 'knn':
            return neighbors.KNeighborsRegressor(n_neighbors=100)
        case 'svr':
            return svm.SVR(C=1)
        case 'linear_regression':
            return linear_model.LinearRegression()
        case 'tree_regression':
            return tree.DecisionTreeRegressor()

models = ['knn', 'svr', 'linear_regression', 'tree_regression']
est_preds = {}

test_list = np.array(df_test.values.tolist())
x_test = test_list[:, :-1]
y_test = test_list[:, -1]

for est in dict_train:
    train_list = dict_train[est]
    x_train = train_list[:, :-1]
    y_train = train_list[:, -1]

    preds = {}
    for model_type in models:
        model = get_model(model_type)
        model.fit(x_train, y_train)

        pred = model.predict(x_test)
        preds[model_type] = pred

    est_preds[est] = preds

In [None]:
errors = {}
for est in est_preds:
    est_errors = {}
    for model in est_preds[est]:
        mse = mean_squared_error(y_test, est_preds[est][model])
        mae = mean_absolute_error(y_test, est_preds[est][model])
        est_errors[model] = { 'mse': mse, 'mae': mae }
    errors[est] = est_errors

In [None]:
df_comparison = pd.DataFrame(columns=['Estação', 'Melhor Modelo MSE', 'MSE', 'Melhor Modelo MAE', 'MAE'])

for est in errors:
    min_mse = None
    min_mse_value = math.inf
    min_mae = None
    min_mae_value = math.inf

    for model in errors[est]:
        if errors[est][model]['mse'] < min_mse_value:
            min_mse = model
            min_mse_value = errors[est][model]['mse']
        if errors[est][model]['mae'] < min_mae_value:
            min_mae = model
            min_mae_value = errors[est][model]['mae']
    new_row = [est, min_mse, min_mse_value, min_mae, min_mae_value]
    df_comparison.loc[len(df_comparison)] = new_row

df_comparison