In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [44]:
df = pd.read_csv("../UH_2023/DATOS_METEO.txt", sep = "|")

In [45]:
df.validTimeUtc = pd.to_datetime(df.validTimeUtc)

Només necessitem dades horàries, que en tinguem suficients i pel 22.

In [46]:
df[df["windDirection"].isna()==True].validTimeUtc.dt.year.unique()

array([2015, 2016, 2017], dtype=int64)

In [47]:
columns_to_drop = ["precip2Day", "precip3Day", "precip7Day", "precipMtd", "precipYtd", "pressureMeanSeaLevel", "snow2Day", "snow3Day", "snow7Day", "snowMtd", "snowSeason", "snowYtd", "windDirection", "windGust", "precip24Hour", "precip6Hour", "snow24Hour", "snow6Hour", "temperatureChange24Hour", "temperatureMax24Hour", "temperatureMin24Hour", "visibility", "uvIndex"]

In [48]:
df2 = df.drop(columns_to_drop, axis=1)

In [49]:
cols_to_modify = list(df2.isna().sum()[df2.isna().sum() != 0].index)

Hi ha 20 files que falten.

In [50]:
df2[cols_to_modify] = (df2[cols_to_modify].ffill()+df2[cols_to_modify].bfill())/2

El 13/9/2015 hi ha 7 files que falten a totes les estacions.

In [51]:
for st in list(df2.ID_ESTACION.unique()):
    row_primer = df2[(df2.validTimeUtc.dt.day == 13) & ( df2.validTimeUtc.dt.month == 9) & (df2.validTimeUtc.dt.year == 2015) & (df2.ID_ESTACION == st)].iloc[-1, :]
    row_ultima = df2[(df2.validTimeUtc.dt.day == 14) & ( df2.validTimeUtc.dt.month == 9) & (df2.validTimeUtc.dt.year == 2015) & (df2.ID_ESTACION == st)].iloc[0, :]
    row_primer_del = row_primer.drop(['ID_ESTACION'])
    row_ultima_del = row_ultima.drop(['ID_ESTACION'])
    diff = row_ultima_del - row_primer_del
    diff /= 8
    for i in range(1, 8):
        df_aux = pd.DataFrame([row_primer_del + diff*i], columns=df2.columns[:-1])
        df_aux["ID_ESTACION"] = st
        df2 = pd.concat([df2, df_aux]).reset_index(drop = True)

### GroupBy amb day

In [52]:
df2_mean = df2.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).mean()
df2_max = df2.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).max()
df2_min = df2.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).min()

In [53]:
cols_mean = [i + "DayAvg" for i in df2_mean.columns]
df2_mean.columns = cols_mean
cols_max = [i + "DayMax" for i in df2_max.columns]
df2_max.columns = cols_max
cols_min = [i + "DayMin" for i in df2_min.columns]
df2_min.columns = cols_min

### GroupyBy amb daytime

In [54]:
df2_daytime = df2[(7 < df2.validTimeUtc.dt.hour) & (df2.validTimeUtc.dt.hour < 19)]

df2_mean_daytime = df2.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).mean()
df2_max_daytime = df2.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).max()
df2_min_daytime = df2.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).min()

In [55]:
# Periodes normals
dic_periodes = {"Day": (0, 24), "Daytime": (7, 19), "Morning": (7, 13), "Afternoon": (13, 19)}
llista_df = {}
for k,(i,j) in dic_periodes.items():
    df_aux = df2[(i < df2.validTimeUtc.dt.hour) & (df2.validTimeUtc.dt.hour < j)]
    df2_mean = df_aux.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).mean()
    df2_max = df_aux.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).max()
    df2_min = df_aux.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).min()
    
    cols_mean = [c + k + "Avg" for c in df2_mean.columns]
    df2_mean.columns = cols_mean
    cols_max = [c + k + "Max" for c in df2_max.columns]
    df2_max.columns = cols_max
    cols_min = [c + k + "Min" for c in df2_min.columns]
    df2_min.columns = cols_min
    llista_df[k] = [df2_mean, df2_max, df2_min]

In [56]:
# Periodes raros
dic_periodes = {"Nighttime": (19, 7), "Evening": (19, 1), "Overnight": (1, 7)}
for k,(i,j) in dic_periodes.items():
    df_aux = df2[(i < df2.validTimeUtc.dt.hour) | (df2.validTimeUtc.dt.hour < j)]
    df_aux.loc[:, "validTimeUtc"] = df_aux.loc[:, "validTimeUtc"] - pd.Timedelta(hours=10)
    df2_mean = df_aux.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).mean()
    df2_max = df_aux.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).max()
    df2_min = df_aux.groupby([pd.Grouper(key="validTimeUtc", freq="1D"), pd.Grouper(key = 'ID_ESTACION')]).min()
    
    cols_mean = [c + k + "Avg" for c in df2_mean.columns]
    df2_mean.columns = cols_mean
    cols_max = [c + k + "Max" for c in df2_max.columns]
    df2_max.columns = cols_max
    cols_min = [c + k + "Min" for c in df2_min.columns]
    df2_min.columns = cols_min
    llista_df[k] = [df2_mean, df2_max, df2_min]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aux.loc[:, "validTimeUtc"] = df_aux.loc[:, "validTimeUtc"] - pd.Timedelta(hours=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aux.loc[:, "validTimeUtc"] = df_aux.loc[:, "validTimeUtc"] - pd.Timedelta(hours=10)


In [57]:
first_df = llista_df["Day"][0]
llista = [j for j in [i for i in llista_df.values()]]
df_tractats = [item for sublist in llista for item in sublist]
df_tractats = df_tractats[1:]

In [58]:
for ds in df_tractats:
    first_df = pd.merge(first_df, ds, left_index=True, right_index=True)

In [59]:
first_df = first_df.reset_index()

In [60]:
first_df.to_csv("../DATASETS_TRACTATS/df_meteo_tractat.csv", index = False)

In [61]:
first_df

Unnamed: 0,validTimeUtc,ID_ESTACION,precip1HourDayAvg,pressureChangeDayAvg,relativeHumidityDayAvg,snow1HourDayAvg,temperatureDayAvg,temperatureDewPointDayAvg,temperatureFeelsLikeDayAvg,windSpeedDayAvg,...,temperatureFeelsLikeOvernightMax,windSpeedOvernightMax,precip1HourOvernightMin,pressureChangeOvernightMin,relativeHumidityOvernightMin,snow1HourOvernightMin,temperatureOvernightMin,temperatureDewPointOvernightMin,temperatureFeelsLikeOvernightMin,windSpeedOvernightMin
0,2015-06-30,0,0.0,-0.321739,44.700000,0.0,26.534783,12.095652,26.565217,11.073913,...,35.7,20.2,0.0,-1.5,21.6,0.0,18.3,10.4,18.3,3.6
1,2015-06-30,1,0.0,-0.308696,38.773913,0.0,26.834783,10.226087,26.834783,12.986957,...,35.5,22.3,0.0,-1.2,19.3,0.0,18.3,8.6,18.3,4.7
2,2015-06-30,2,0.0,-0.330435,44.865217,0.0,26.821739,12.252174,26.839130,11.213043,...,36.2,20.5,0.0,-1.0,21.6,0.0,17.5,10.5,17.5,3.6
3,2015-06-30,3,0.0,-0.321739,41.652174,0.0,27.213043,11.456522,27.213043,12.278261,...,36.3,21.6,0.0,-1.3,19.7,0.0,18.5,9.5,18.5,4.7
4,2015-06-30,4,0.0,-0.308696,40.960870,0.0,26.852174,10.904348,26.865217,12.808696,...,36.0,22.0,0.0,-1.2,20.0,0.0,18.1,9.5,18.1,4.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51155,2022-06-30,15,0.0,0.230435,53.113043,0.0,23.769565,12.200000,23.791304,7.717391,...,31.5,14.8,0.0,-1.4,25.7,0.0,18.7,8.3,18.7,3.6
51156,2022-06-30,16,0.0,0.239130,52.556522,0.0,24.182609,12.626087,24.200000,6.852174,...,31.4,13.3,0.0,-1.2,25.9,0.0,19.6,8.9,19.6,2.5
51157,2022-06-30,17,0.0,0.226087,48.439130,0.0,23.582609,10.091304,23.586957,8.382609,...,31.5,16.2,0.0,-1.7,21.3,0.0,17.9,5.3,17.9,4.7
51158,2022-06-30,18,0.0,0.304348,52.656522,0.0,24.195652,13.030435,24.239130,6.669565,...,31.1,10.4,0.0,-1.4,30.2,0.0,19.5,10.7,19.5,4.3
