#### Aggregating Files

In [1]:
import pandas as pd
import os, re

In [2]:
# Constants
# Raw data directory (Windows path under "G:\My Drive").
RAW_DATA_DIR_PATH = "G:\\My Drive\\AirPollutionML\\Raw_Data"
# Alternative "G:\My Drive" location for the data files, left here for switching back:
# RAW_DATA_FILES_PATH = 'G:\\My Drive\\AirPollutionML\\Raw_Data_Files'
# Directory that the cells below list, read CSVs from, and write aggregated CSVs to.
RAW_DATA_FILES_PATH = "data"
# Regex: optional "Copy of " prefix, then "Raw_data_1Hr_", then a captured
# four-digit year followed by an underscore.
RAW_FILE_PATTERN = r"(?:Copy of )?Raw_data_1Hr_(\d{4})_"

In [3]:
from regex import *

#### Yearly Region Files Aggregation
One file per year and region, containing the per-timestamp mean across that region's station files

In [18]:
# Nested mapping filled by the directory scan below: year -> region -> list of station filenames.
station_files = {}

In [19]:
# Scan the data directory and bucket every station file by year, then by region.
# Names that do not match STATION_FILE_PATTERN are ignored.
for entry in os.listdir(RAW_DATA_FILES_PATH):
    found = re.match(STATION_FILE_PATTERN, entry)
    if not found:
        continue
    # The pattern is expected to capture exactly (year, region, station).
    year, region, station = found.groups()
    station_files.setdefault(year, {}).setdefault(region, []).append(entry)

In [None]:
# Display the year -> region -> filenames mapping for inspection.
station_files

In [39]:
# Write one "<year>_<region>.csv" per (year, region): stack all station files of
# the region and average every column per "Timestamp".
# NOTE(review): on pandas >= 2.0, .mean() raises on non-numeric columns instead
# of dropping them — confirm the station files contain only numeric data columns.
for year, regions in station_files.items():
    for region, files in regions.items():
        new_filename = f"{year}_{region}.csv"
        station_frames = [
            pd.read_csv(os.path.join(RAW_DATA_FILES_PATH, file)) for file in files
        ]
        # as_index=False keeps "Timestamp" as an ordinary column, so no
        # reset_index()/rename() step is needed (the former rename of "index"
        # never applied and could have mis-renamed a real "index" column).
        aggregated_df = (
            pd.concat(station_frames).groupby("Timestamp", as_index=False).mean()
        )
        # The result is written next to the inputs; the later cells read it back.
        aggregated_df.to_csv(
            os.path.join(RAW_DATA_FILES_PATH, new_filename), index=False
        )

### Year Files Aggregation
One file per year, containing the per-timestamp mean across all region files of that year

In [4]:
# Mapping filled by the directory scan below: year -> list of "<year>_<region>" filenames.
year_files = {}

In [5]:
# Scan the data directory and group every year/region file under its year.
# Names that do not match YEAR_REGION_FILE_PATTERN are ignored.
for entry in os.listdir(RAW_DATA_FILES_PATH):
    found = re.match(YEAR_REGION_FILE_PATTERN, entry)
    if not found:
        continue
    # The pattern is expected to capture exactly (year, region); only the
    # year is used as the grouping key here.
    year, region = found.groups()
    year_files.setdefault(year, []).append(entry)

In [6]:
# Display the year -> filenames mapping for inspection.
year_files

{'2023': ['2023_Gurugram.csv',
  '2023_Faridabad.csv',
  '2023_Delhi.csv',
  '2023_Sonipat.csv',
  '2023_Rohtak.csv',
  '2023_Jind.csv',
  '2023_Meerut.csv',
  '2023_Muzaffarnagar.csv',
  '2023_Ghaziabad.csv',
  '2023_Baghpat.csv',
  '2023_Palwal.csv',
  '2023_Karnal.csv',
  '2023_Noida.csv'],
 '2022': ['2022_Gurugram.csv',
  '2022_Faridabad.csv',
  '2022_Delhi.csv',
  '2022_Sonipat.csv',
  '2022_Rohtak.csv',
  '2022_Jind.csv',
  '2022_Meerut.csv',
  '2022_Muzaffarnagar.csv',
  '2022_Ghaziabad.csv',
  '2022_Baghpat.csv',
  '2022_Palwal.csv',
  '2022_Karnal.csv',
  '2022_Noida.csv'],
 '2021': ['2021_Gurugram.csv',
  '2021_Faridabad.csv',
  '2021_Delhi.csv',
  '2021_Sonipat.csv',
  '2021_Rohtak.csv',
  '2021_Jind.csv',
  '2021_Meerut.csv',
  '2021_Muzaffarnagar.csv',
  '2021_Ghaziabad.csv',
  '2021_Baghpat.csv',
  '2021_Palwal.csv',
  '2021_Karnal.csv',
  '2021_Noida.csv'],
 '2020': ['2020_Gurugram.csv',
  '2020_Faridabad.csv',
  '2020_Delhi.csv',
  '2020_Sonipat.csv',
  '2020_Rohtak.csv

In [8]:
# Show which directory the cells are currently reading from and writing to.
# NOTE(review): the recorded output shows the "G:\My Drive" path while the
# constants cell sets "data" — presumably left over from an earlier run; re-run to confirm.
RAW_DATA_FILES_PATH

'G:\\My Drive\\AirPollutionML\\Raw_Data_Files'

In [7]:
# Write one "<year>.csv" per year: stack all region files of the year and
# average every column per "Timestamp".
for year, files in year_files.items():
    new_filename = f"{year}.csv"
    region_frames = [
        pd.read_csv(os.path.join(RAW_DATA_FILES_PATH, file)) for file in files
    ]
    # as_index=False keeps "Timestamp" as an ordinary column, so no
    # reset_index()/rename() step is needed (the former rename of "index"
    # never applied and could have mis-renamed a real "index" column).
    aggregated_df = (
        pd.concat(region_frames).groupby("Timestamp", as_index=False).mean()
    )
    aggregated_df.to_csv(os.path.join(RAW_DATA_FILES_PATH, new_filename), index=False)

### Year All Region Files Aggregation
One file per year, stacking the rows of all regions with the region name stored in a `Region` column

In [28]:
# Write one "<year>_all_regions.csv" per year: stack the rows of every region
# file of that year and tag each row with its region in a "Region" column.
for year, files in year_files.items():
    dfs = []
    new_filename = f"{year}_all_regions.csv"
    for file in files:
        match = re.match(YEAR_REGION_FILE_PATTERN, file)
        if not match:
            # The message is now actually formatted; previously the "{}"
            # placeholders were printed literally next to the values.
            print(
                f"Error in matching this file: {file} "
                f"with this pattern {YEAR_REGION_FILE_PATTERN}"
            )
            continue
        df = pd.read_csv(os.path.join(RAW_DATA_FILES_PATH, file))
        _year, region = match.groups()
        df["Region"] = region
        dfs.append(df)
    if not dfs:
        # pd.concat raises ValueError on an empty list; skip the year instead.
        print(f"No region files matched for {year}; skipping {new_filename}")
        continue
    aggregated_df = pd.concat(dfs)
    aggregated_df.to_csv(os.path.join(RAW_DATA_FILES_PATH, new_filename), index=False)

In [None]:
# Spot-check: load the 2019 all-regions file from the data directory and display it.
pd.read_csv(os.path.join(RAW_DATA_FILES_PATH,"2019_all_regions.csv"))

### Region All Year Files Aggregation
One file per region, concatenating that region's files from all years

In [30]:
# Mapping filled by the directory scan below: region -> list of "<year>_<region>" filenames.
regions_files = {}

In [31]:
# Scan the data directory and group every year/region file under its region.
# NOTE(review): the "<year>_all_regions.csv" files are written into this same
# directory by another cell — confirm YEAR_REGION_FILE_PATTERN does not match
# them, otherwise they would show up here as a region.
for entry in os.listdir(RAW_DATA_FILES_PATH):
    found = re.match(YEAR_REGION_FILE_PATTERN, entry)
    if not found:
        continue
    # The pattern is expected to capture exactly (year, region); only the
    # region is used as the grouping key here.
    year, region = found.groups()
    regions_files.setdefault(region, []).append(entry)

In [None]:
# Display the region -> filenames mapping for inspection.
regions_files

In [34]:
# Write one "<region>_all_years.csv" per region by stacking its yearly files.
for region, files in regions_files.items():
    new_filename = f"{region}_all_years.csv"
    # The file lists come from os.listdir, whose order is arbitrary. Sorting the
    # names makes the output deterministic; for files named "<year>_<region>.csv"
    # this also puts the years in ascending order.
    aggregated_df = pd.concat(
        [
            pd.read_csv(os.path.join(RAW_DATA_FILES_PATH, file))
            for file in sorted(files)
        ]
    )
    aggregated_df.to_csv(os.path.join(RAW_DATA_FILES_PATH, new_filename), index=False)

In [None]:
# Spot-check: load the Delhi all-years file from the data directory and display it.
pd.read_csv(os.path.join(RAW_DATA_FILES_PATH,"Delhi_all_years.csv"))