In [5]:
from datetime import date, timedelta, datetime
import csv
import pandas as pd
import numpy as np


def pre_process(filename):
    """Load a daily pollutant CSV and return a gap-free, fully numeric frame.

    Processing steps:

    1. Read ``filename`` and strip the leading space from the headers
       ``' pm10'``, ``' pm25'``, ``' o3'``, ``' no2'``, ``' so2'``, ``' co'``.
    2. Parse the ``date`` column as ``%d-%m-%y`` and reindex to one row per
       calendar day, from the earliest date in the file through yesterday.
       Days absent from the file become all-NaN rows; rows dated after
       yesterday are discarded by the reindex.
    3. Write that reindexed frame to ``filename[:-8] + '_rawprocessed.csv'``.
    4. Add integer ``dd``/``mm``/``yy`` columns, drop every row from 2020 and
       coerce all columns to numeric (unparseable values become NaN).
    5. Fill NaN in each non-date column with the median of the same
       (month, year) group.
    6. Drop every non-date column that still contains NaN, i.e. columns with
       at least one month that has no observations at all.

    Args:
        filename: Path of the input CSV as a ``str``. The export name is
            built by dropping its last eight characters.
            NOTE(review): this presumably matches a fixed 8-character
            suffix of the input files - confirm against the callers.

    Returns:
        pandas.DataFrame indexed by ``date`` holding the surviving pollutant
        columns followed by ``dd``, ``mm`` and ``yy``.

    Side effects:
        Writes the ``*_rawprocessed.csv`` file described in step 3.
    """
    frame = pd.read_csv(filename)
    frame = frame.rename(columns={' pm10': 'pm10', ' pm25': 'pm25', ' o3': 'o3',
                                  ' no2': 'no2', ' so2': 'so2', ' co': 'co'})

    # One row per calendar day up to (and including) yesterday. reindex()
    # both orders the rows and inserts the missing days, so no explicit
    # sort is needed beforehand.
    frame['date'] = pd.to_datetime(frame['date'], format='%d-%m-%y')
    first_day = frame['date'].min()
    last_day = date.today() - timedelta(days=1)
    all_days = pd.date_range(start=first_day, end=last_day)
    frame = frame.set_index('date').reindex(all_days).rename_axis('date')

    # Export the reindexed raw data (used for plotting historical data)
    # before any helper column is added.
    frame.to_csv(filename[:-8] + '_rawprocessed.csv', index=True)

    # Calendar parts straight from the index; int64 keeps the dtype the
    # former per-row Python loop produced.
    frame['dd'] = np.asarray(frame.index.day, dtype='int64')
    frame['mm'] = np.asarray(frame.index.month, dtype='int64')
    frame['yy'] = np.asarray(frame.index.year, dtype='int64')

    # 2020 is excluded from the returned data.
    frame = frame[frame['yy'] != 2020]
    frame = frame.apply(pd.to_numeric, errors='coerce')

    date_keys = ('dd', 'mm', 'yy')
    pollutants = [col for col in frame.columns if col not in date_keys]

    # Fill gaps with the median of the same month/year. A month without any
    # observation has a NaN median, so its gaps stay NaN here.
    for col in pollutants:
        monthly_median = frame.groupby(['mm', 'yy'])[col].transform('median')
        frame[col] = frame[col].fillna(monthly_median)

    # Columns that could not be completed are removed entirely.
    incomplete = [col for col in pollutants if frame[col].isna().any()]
    return frame.drop(columns=incomplete)

Unnamed: 0_level_0,pm25,pm10,o3,no2,so2,co,dd,mm,yy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-18,277.0,515.0,5.0,15.0,4.0,9.0,18,1,2018
2018-01-19,437.0,438.0,5.0,15.0,4.0,9.0,19,1,2018
2018-01-20,350.0,321.0,5.0,15.0,4.0,9.0,20,1,2018
2018-01-21,277.0,321.0,5.0,15.0,4.0,9.0,21,1,2018
2018-01-22,277.0,317.0,5.0,15.0,4.0,9.0,22,1,2018
...,...,...,...,...,...,...,...,...,...
2021-04-11,146.0,225.0,24.0,24.0,7.0,10.0,11,4,2021
2021-04-12,184.0,212.0,20.0,23.0,7.0,9.0,12,4,2021
2021-04-13,179.0,145.0,18.0,21.0,5.0,7.0,13,4,2021
2021-04-14,140.0,184.0,19.0,20.0,6.0,8.0,14,4,2021
