In [1]:
import pandas as pd

# Single Dataset Creation

In [2]:
excel_file = pd.ExcelFile("DATOS HISTÓRICOS 2022_2023_TODAS ESTACIONES.xlsx")
excel_file.sheet_names

['SURESTE',
 'NORESTE',
 'CENTRO',
 'NOROESTE',
 'SUROESTE',
 'NOROESTE2',
 'NORTE',
 'SUROESTE2',
 'SURESTE2',
 'SURESTE3',
 'SUR',
 'NORTE2',
 'NORESTE2',
 'NORESTE3',
 'NOROESTE3',
 'CATÁLOGO']

In [3]:
dataset = pd.concat(
    (excel_file.parse(sheet).assign(Estacion=i) for i, sheet in enumerate(excel_file.sheet_names[:-1])),
    ignore_index=True
)

In [4]:
dataset.head()

Unnamed: 0,date,CO,NO,NO2,NOX,O3,PM10,PM2.5,PRS,RAINF,RH,SO2,SR,TOUT,WSR,WDR,Estacion
0,2022-01-01 00:00:00,2.36,19.8,48.4,68.0,5.0,142.0,111.0,715.7,0.0,59.0,4.0,0.0,21.65,6.5,285.0,0
1,2022-01-01 01:00:00,2.29,10.8,44.5,55.1,6.0,168.0,112.16,715.5,0.0,59.0,4.6,0.0,21.21,7.1,284.0,0
2,2022-01-01 02:00:00,2.71,28.3,47.1,75.2,4.0,203.0,139.31,715.4,0.0,62.0,5.9,0.0,21.21,7.7,290.0,0
3,2022-01-01 03:00:00,2.31,19.6,42.1,61.5,5.0,262.0,177.68,715.1,0.0,59.0,5.5,0.0,20.44,8.2,294.0,0
4,2022-01-01 04:00:00,1.85,13.0,39.5,52.5,4.0,185.0,134.21,715.1,0.0,59.0,4.9,0.0,19.8,8.1,287.0,0


In [5]:
rubric = excel_file.parse(excel_file.sheet_names[-1])
rubric['Nombre_Estacion'] = rubric['Nombre_Estacion'].str.upper()
rubric = rubric.reset_index()
rubric.head()

Unnamed: 0,index,Nombre_Estacion,Clave_Estacion,location
0,0,NORESTE3 PESQUERIA,Pesqueria,"25.791343,-100.078176"
1,1,NORTE2 UNIVERSIDAD,Universidad,"25.729787,-100.310028"
2,2,NOROESTE SAN BERNABÉ,San Bernabé,"25.75712,-100.365974"
3,3,NOROESTE 2 GARCIA,Garcia,"25.783331,-100.585833"
4,4,CENTRO OBISPADO,Obispado,"25.67602,-100.335847"


In [6]:
data_joined = dataset.merge(rubric, left_on='Estacion', right_on='index', how='left')
data_joined.drop(columns=['index', 'Estacion'], inplace=True)
data_joined[['x_coord', 'y_coord']] = data_joined['location'].str.split(',', expand=True)
data_joined['x_coord'] = data_joined['x_coord'].astype(float)
data_joined['y_coord'] = data_joined['y_coord'].astype(float)
data_joined.drop(columns=['location'], inplace=True)
data_joined.head()

Unnamed: 0,date,CO,NO,NO2,NOX,O3,PM10,PM2.5,PRS,RAINF,RH,SO2,SR,TOUT,WSR,WDR,Nombre_Estacion,Clave_Estacion,x_coord,y_coord
0,2022-01-01 00:00:00,2.36,19.8,48.4,68.0,5.0,142.0,111.0,715.7,0.0,59.0,4.0,0.0,21.65,6.5,285.0,NORESTE3 PESQUERIA,Pesqueria,25.791343,-100.078176
1,2022-01-01 01:00:00,2.29,10.8,44.5,55.1,6.0,168.0,112.16,715.5,0.0,59.0,4.6,0.0,21.21,7.1,284.0,NORESTE3 PESQUERIA,Pesqueria,25.791343,-100.078176
2,2022-01-01 02:00:00,2.71,28.3,47.1,75.2,4.0,203.0,139.31,715.4,0.0,62.0,5.9,0.0,21.21,7.7,290.0,NORESTE3 PESQUERIA,Pesqueria,25.791343,-100.078176
3,2022-01-01 03:00:00,2.31,19.6,42.1,61.5,5.0,262.0,177.68,715.1,0.0,59.0,5.5,0.0,20.44,8.2,294.0,NORESTE3 PESQUERIA,Pesqueria,25.791343,-100.078176
4,2022-01-01 04:00:00,1.85,13.0,39.5,52.5,4.0,185.0,134.21,715.1,0.0,59.0,4.9,0.0,19.8,8.1,287.0,NORESTE3 PESQUERIA,Pesqueria,25.791343,-100.078176


# Data Prep

In [7]:
data_joined.columns

Index(['date', 'CO', 'NO', 'NO2', 'NOX', 'O3', 'PM10', 'PM2.5', 'PRS', 'RAINF',
       'RH', 'SO2', 'SR', 'TOUT', 'WSR', 'WDR', 'Nombre_Estacion',
       'Clave_Estacion', 'x_coord', 'y_coord'],
      dtype='object')

In [8]:
columns = ['date', 'x_coord', 'y_coord', 'RAINF', 'TOUT', 'RH', 'SR', 'PRS', 'WSR', 'WDR', 'PM10', 'PM2.5']
filtered_data = data_joined[columns]


column_map = {
    'date': 'Date',
    'x_coord': 'Latitude',
    'y_coord': 'Longitude',
    'RAINF': 'Precipitation',
    'TOUT': 'Temperature',
    'RH': 'Relative Humidity',
    'SR': 'Solar Radiation',
    'PRS': 'Atmospheric Pressure',
    'WSR': 'Wind Speed',
    'WDR': 'Wind Direction'
}

filtered_data.rename(columns=column_map, inplace=True)
filtered_data.to_csv('dataset.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.rename(columns=column_map, inplace=True)
