# Análise inicial dados meteorológicos: vento
## Preparando o ambiente

In [1]:
import pandas as pd
import numpy as np

## Carregando os dados

Os dados são compostos por um valor de eletricidade (`electricity`) e um de velocidade do vento (`wind_speed`); cada linha representa os valores de uma hora do dia.

Referência: **Osório, 2019**.

In [2]:
# Location of the raw CSV, relative to this notebook.
caminho_vento = '../../data/wind_osorio_2019.csv'

# header=3: the column names are on the fourth line of the file; the lines
# before it are skipped.
df_wind = pd.read_csv(caminho_vento, header=3)
df_wind.sample(n=5)

Unnamed: 0,time,local_time,electricity,wind_speed
7112,2019-10-24 08:00,2019-10-24 05:00,0.011,2.273
813,2019-02-03 21:00,2019-02-03 19:00,0.024,2.834
802,2019-02-03 10:00,2019-02-03 08:00,0.097,4.438
6354,2019-09-22 18:00,2019-09-22 15:00,0.113,4.673
7863,2019-11-24 15:00,2019-11-24 13:00,0.052,3.595


In [3]:
# Inspect the column types as loaded; both timestamp columns arrive as `object`.
df_wind.dtypes

time            object
local_time      object
electricity    float64
wind_speed     float64
dtype: object

Convertendo os valores de data para o tipo correto.

In [4]:
# Parse both timestamp columns (loaded as `object`) into datetimes.
for coluna in ('time', 'local_time'):
    df_wind[coluna] = pd.to_datetime(df_wind[coluna])

# Confirm the new dtypes, then show a few random rows.
print(df_wind.dtypes)
df_wind.sample(5)

time           datetime64[ns]
local_time     datetime64[ns]
electricity           float64
wind_speed            float64
dtype: object


Unnamed: 0,time,local_time,electricity,wind_speed
2704,2019-04-23 16:00:00,2019-04-23 13:00:00,0.069,3.953
6452,2019-09-26 20:00:00,2019-09-26 17:00:00,0.001,1.202
5418,2019-08-14 18:00:00,2019-08-14 15:00:00,0.007,2.058
5597,2019-08-22 05:00:00,2019-08-22 02:00:00,0.001,1.145
6009,2019-09-08 09:00:00,2019-09-08 06:00:00,0.007,2.026


Removendo a coluna do tempo UTC, visto que a análise usa apenas o horário local (as amostras acima mostram diferenças de -3 h e, em parte do ano, de -2 h em relação ao UTC).

In [5]:
# Drop the UTC timestamp column; the remaining cells work with `local_time`.
df_wind = df_wind.drop(columns=['time'])
df_wind.sample(5)

Unnamed: 0,local_time,electricity,wind_speed
6511,2019-09-29 04:00:00,0.048,3.503
8552,2019-12-23 06:00:00,0.038,3.256
6079,2019-09-11 04:00:00,0.08,4.154
3979,2019-06-15 16:00:00,0.0,0.797
3787,2019-06-07 16:00:00,0.004,1.738


Separando a hora da data.

In [6]:
# Split the local timestamp into a calendar date and an hour of day.
# The vectorized `.dt` accessor replaces the per-row Python loop; unlike
# unpacking `zip(*[...])` into two names, it also works on a frame with no rows.
df_wind = df_wind.assign(
    new_date=df_wind['local_time'].dt.date,
    new_time=df_wind['local_time'].dt.hour,
)

In [7]:
# Show the derived `new_date` / `new_time` columns next to the original timestamp.
df_wind.sample(5)

Unnamed: 0,local_time,electricity,wind_speed,new_date,new_time
781,2019-02-02 11:00:00,0.046,3.456,2019-02-02,11
7129,2019-10-24 22:00:00,0.006,1.942,2019-10-24,22
3759,2019-06-06 12:00:00,0.051,3.568,2019-06-06,12
941,2019-02-09 03:00:00,0.019,2.664,2019-02-09,3
7716,2019-11-18 10:00:00,0.048,3.508,2019-11-18,10


In [8]:
# The timestamp has been split into `new_date` and `new_time`, so drop the original column.
df_wind = df_wind.drop(columns=['local_time'])

In [9]:
# Give the derived columns their final names: `date` and `time` (hour of day).
novos_nomes = {'new_date': 'date', 'new_time': 'time'}
df_wind = df_wind.rename(columns=novos_nomes)
df_wind.sample(5)

Unnamed: 0,electricity,wind_speed,date,time
3865,0.001,1.388,2019-06-10,22
2897,0.008,2.093,2019-05-01,14
847,0.008,2.088,2019-02-05,5
5843,0.002,1.587,2019-09-01,8
1333,0.039,3.281,2019-02-25,10


## Agrupando por período

In [10]:
def obter_periodo(hora):
    """Return the period of the day that contains the given hour.

    The day is split into four six-hour windows, each closed on the left
    and open on the right:

    * [0, 6)   -> 'madrugada'
    * [6, 12)  -> 'manhã'
    * [12, 18) -> 'tarde'
    * [18, 24) -> 'noite'

    A value outside [0, 24) matches no window and yields None.
    """
    janelas = (
        (0, 6, 'madrugada'),
        (6, 12, 'manhã'),
        (12, 18, 'tarde'),
        (18, 24, 'noite'),
    )
    for inicio, fim, nome in janelas:
        if inicio <= hora < fim:
            return nome
    return None

In [11]:
# Quick sanity check: hour 21 falls in the 18-24 window, so this should be 'noite'.
obter_periodo(21)

'noite'

In [12]:
# Label every row with its period of the day, derived from the hour column.
df_wind['periodo'] = df_wind['time'].map(obter_periodo)
df_wind.sample(5)

Unnamed: 0,electricity,wind_speed,date,time,periodo
1473,0.003,1.686,2019-03-03,6,manhã
1245,0.065,3.875,2019-02-21,18,noite
2342,0.02,2.675,2019-04-08,11,manhã
3378,0.002,1.462,2019-05-21,15,tarde
7611,0.164,5.327,2019-11-14,1,madrugada


In [13]:
# Average every remaining column per (date, period) and persist the result.
medias_por_periodo = df_wind.groupby(['date', 'periodo']).mean()
medias_por_periodo.to_csv('../../data/wind_osorio_2019_periodos.csv')

In [14]:
# Reload the per-period means from disk; the `date` / `periodo` group keys
# come back as regular columns.
df_wind = pd.read_csv('../../data/wind_osorio_2019_periodos.csv')
df_wind.sample(5)

Unnamed: 0,date,periodo,electricity,wind_speed,time
1204,2019-10-28,tarde,0.001,1.282667,14.5
140,2019-02-04,tarde,0.021833,2.744333,14.5
1412,2019-12-19,tarde,0.127833,4.872333,14.5
1229,2019-11-04,madrugada,0.016333,2.471667,2.5
939,2019-08-23,noite,0.009333,2.141167,20.5


In [15]:
# Reshape the long table (one row per date and period) into a wide one:
# one row per date, with an electricity (`el_`) and a wind-speed (`ws_`)
# column for each period of the day.

# Maps each `periodo` value to the suffix used in the output column names.
sufixos = {
    'madrugada': 'madrugada',
    'manhã': 'manha',
    'tarde': 'tarde',
    'noite': 'noite',
}

dias = []
for data, grupo in df_wind.groupby('date'):
    linha = {'data': data}
    for periodo, sufixo in sufixos.items():
        do_periodo = grupo[grupo['periodo'] == periodo]
        # A period with no rows for this date yields NaN for both means.
        linha['el_' + sufixo] = do_periodo['electricity'].mean()
        linha['ws_' + sufixo] = do_periodo['wind_speed'].mean()
    dias.append(linha)

df_periodo = pd.DataFrame(dias)
df_periodo

Unnamed: 0,data,el_madrugada,ws_madrugada,el_manha,ws_manha,el_tarde,ws_tarde,el_noite,ws_noite
0,2018-12-31,,,,,,,0.021000,2.709000
1,2019-01-01,0.027500,2.950333,0.060333,3.764833,0.096500,4.356000,0.143000,5.071000
2,2019-01-02,0.113833,4.673667,0.043333,3.218500,0.003167,1.680000,0.017500,2.585000
3,2019-01-03,0.004500,1.820833,0.015167,2.278667,0.190833,5.549167,0.055667,3.567167
4,2019-01-04,0.036333,3.152333,0.008833,2.149167,0.031333,3.014500,0.021667,2.720167
...,...,...,...,...,...,...,...,...,...
361,2019-12-27,0.093667,4.378333,0.043167,3.368333,0.115500,4.594167,0.143000,5.070333
362,2019-12-28,0.101333,4.480167,0.040833,3.301833,0.082000,4.123167,0.125333,4.840333
363,2019-12-29,0.064833,3.848000,0.010667,2.058833,0.005333,1.323333,0.029333,2.997833
364,2019-12-30,0.014500,2.442500,0.008667,2.123000,0.009500,2.155333,0.023000,2.794333


In [16]:
# Keep only the dates that have a value for every period (rows with any NaN are removed).
df_periodo = df_periodo.dropna()
df_periodo

Unnamed: 0,data,el_madrugada,ws_madrugada,el_manha,ws_manha,el_tarde,ws_tarde,el_noite,ws_noite
1,2019-01-01,0.027500,2.950333,0.060333,3.764833,0.096500,4.356000,0.143000,5.071000
2,2019-01-02,0.113833,4.673667,0.043333,3.218500,0.003167,1.680000,0.017500,2.585000
3,2019-01-03,0.004500,1.820833,0.015167,2.278667,0.190833,5.549167,0.055667,3.567167
4,2019-01-04,0.036333,3.152333,0.008833,2.149167,0.031333,3.014500,0.021667,2.720167
5,2019-01-05,0.020500,2.695500,0.044667,3.372500,0.136667,4.981167,0.140500,5.035833
...,...,...,...,...,...,...,...,...,...
361,2019-12-27,0.093667,4.378333,0.043167,3.368333,0.115500,4.594167,0.143000,5.070333
362,2019-12-28,0.101333,4.480167,0.040833,3.301833,0.082000,4.123167,0.125333,4.840333
363,2019-12-29,0.064833,3.848000,0.010667,2.058833,0.005333,1.323333,0.029333,2.997833
364,2019-12-30,0.014500,2.442500,0.008667,2.123000,0.009500,2.155333,0.023000,2.794333


In [17]:
# NOTE(review): this writes the wide per-day table to the same path that the
# long per-period table was written to and read back from in earlier cells,
# so the file's layout changes here. The DataFrame index is written as well
# (`to_csv` default).
df_periodo.to_csv('../../data/wind_osorio_2019_periodos.csv')

In [21]:
# Remove the date column so that only the numeric per-period features remain
# for clustering. Reassigning with `errors='ignore'` keeps this cell safe to
# re-run: the in-place version raises KeyError once `data` is already gone.
df_periodo = df_periodo.drop(columns=['data'], errors='ignore')

## Agrupando o novo dataset por período

In [22]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

In [24]:
# Mean silhouette score of KMeans for k = 2..9.
# KMeans starts from randomly chosen centroids, so the seed is fixed to make
# the scores reproducible between runs; `n_init` is pinned because its default
# differs between scikit-learn versions.
SEMENTE = 42
faixa_clusters = list(range(2, 10))
valores_silhueta = []

for n in faixa_clusters:
    agrupador = KMeans(n_clusters=n, n_init=10, random_state=SEMENTE)
    labels = agrupador.fit_predict(df_periodo)
    media = silhouette_score(df_periodo, labels)
    valores_silhueta.append((n, media))

print(valores_silhueta)

[(2, 0.4103516017026995), (3, 0.31640738050704165), (4, 0.3069702951909494), (5, 0.25761624771677183), (6, 0.2536302974641323), (7, 0.25929916770436107), (8, 0.24592757807865664), (9, 0.23075053153526065)]


In [40]:
# Mean silhouette score of DBSCAN for a range of neighbourhood radii (eps).
faixa_eps = [0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 5, 10]
# Collected as (eps, score) pairs; the list was previously reset here but never filled.
valores_silhueta = []
for eps in faixa_eps:
    agrupador = DBSCAN(eps=eps, min_samples=7)
    labels = agrupador.fit_predict(df_periodo)
    # Computed once and reused for both the guard and the printout.
    rotulos, contagens = np.unique(labels, return_counts=True)
    # The silhouette score needs at least two distinct labels.
    # NOTE(review): DBSCAN marks noise points with label -1, and that label is
    # scored here as if it were a cluster of its own -- confirm this is intended.
    if len(rotulos) > 1:
        media = silhouette_score(df_periodo, labels)
        valores_silhueta.append((eps, media))
        print((eps, media, (rotulos, contagens)))
        print('\n')

(0.5, -0.4164732698888519, (array([-1,  0,  1,  2,  3,  4], dtype=int64), array([326,  12,   7,   7,   7,   6], dtype=int64)))


(0.7, 0.13069586698629487, (array([-1,  0,  1], dtype=int64), array([163, 143,  59], dtype=int64)))


(0.9, 0.1934180303146228, (array([-1,  0], dtype=int64), array([ 63, 302], dtype=int64)))


(1.1, 0.31765897954805344, (array([-1,  0], dtype=int64), array([ 26, 339], dtype=int64)))


(1.3, 0.36527178416133355, (array([-1,  0], dtype=int64), array([ 13, 352], dtype=int64)))


(1.5, 0.4411423048086335, (array([-1,  0], dtype=int64), array([  5, 360], dtype=int64)))




Tanto para o `KMeans` quanto para o `DBSCAN`, o melhor caso encontrou dois clusters (considerando velocidade do vento vs. eletricidade). No entanto, o `DBSCAN` classificou um dos dois grupos como ruído, ou seja, _os dados estão possivelmente agrupados e não existe uma correlação direta sazonal entre o vento e a eletricidade aqui_. Outras análises ainda serão feitas sobre os dados.