In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
# Retrieve Neel's "clean" files
appa1 = pd.read_csv('neel_exports/appa_hr_crop_withCreated.csv')
appa2 = pd.read_csv('neel_exports/appa_hr_2_crop_withCreated.csv')
appa1_gt = pd.read_csv('neel_exports/appa1_gt_clean_withCO.csv')
appa2_gt = pd.read_csv('data/appa2_gt.csv')

# Remove confusing indices in appa1_gt
appa1_gt.drop(columns='Unnamed: 0', inplace=True)

In [3]:
# Convert time columns from Timestamp format to DateTime format
appa1.Time = pd.to_datetime(appa1.Time).dt.tz_localize(None)
appa2.Time = pd.to_datetime(appa2.Time).dt.tz_localize(None)

appa1_gt.Time = pd.to_datetime(appa1.Time)
appa2_gt.Time = pd.to_datetime(appa2.Time)

In [4]:
def add_times(df):
    # Add periodical time features to the dataset
    
    # HOUR OF THE DAY
    df["sin_hour"] = df["Time"].dt.hour.apply(
        lambda x: math.sin(x) * 2 * (math.pi / 24)
    )
    df["cos_hour"] = df["Time"].dt.hour.apply(
        lambda x: math.cos(x) * 2 * (math.pi / 24)
    )

    # DAY OF THE WEEK
    df["sin_weekday"] = df["Time"].dt.weekday.apply(
        lambda x: math.sin(x) * 2 * (math.pi / 7)
    )
    df["cos_weekday"] = df["Time"].dt.weekday.apply(
        lambda x: math.cos(x) * 2 * (math.pi / 7)
    )

    # MONTH OF THE YEAR
    df["sin_month"] = df["Time"].dt.month.apply(
        lambda x: math.sin(x) * 2 * (math.pi / 12)
    )

    df["cos_month"] = df["Time"].dt.month.apply(
        lambda x: math.cos(x) * 2 * (math.pi / 12)
    )

    # DAY OF THE YEAR (ORDINAL DATE)
    df["sin_ordate"] = df["Time"].apply(
        lambda x: math.sin(x.toordinal()) * 2 * (math.pi / 366)
    )
    df["cos_ordate"] = df["Time"].apply(
        lambda x: math.cos(x.toordinal()) * 2 * (math.pi / 366)
    )

    # Add the year feature
    df["year"] = df["Time"].dt.year

add_times(appa1)
add_times(appa2)

In [5]:
# Match column names to the convention
def rename_columns(df):
    df.columns = df.columns.str.replace('TimeSinceCreated', 'Age')

    df.columns = df.columns.str.replace('heatR_1', '1_heatR')
    df.columns = df.columns.str.replace('heatV_1', '1_heatV')
    df.columns = df.columns.str.replace('heatR_2', '2_heatR')
    df.columns = df.columns.str.replace('heatV_2', '2_heatV')

rename_columns(appa1)
rename_columns(appa2)

In [6]:
# Add appa2_gt data to appa2
appa2 = appa2.merge(appa2_gt, on='Time', how='left')

In [7]:
# Retrieve additional weather data
weather = pd.read_csv('data/weather_data.csv')

# Match time column to convention and convert to TimeDate format
weather.rename(columns={'date': 'Time'}, inplace=True)
weather['Time'] = pd.to_datetime(weather['Time'])

In [8]:
# Split weather to two dataframes
appa1_weather = weather[['Time', 'laste uv', 'laste rain', 'laste w_sp', 'laste w_dir']]
appa2_weather = weather[['Time', 'ronc uv', 'ronc rain', 'ronc w_sp', 'ronc w_dir']]

# Match column names to convention
appa1_weather.rename(columns={
    'laste uv': 'UV',
    'laste rain': 'Rain',
    'laste w_sp': 'Wind_Speed',
    'laste w_dir': 'Wind_Direction'
    }, inplace=True)

appa2_weather.rename(columns={
    'ronc uv': 'UV',
    'ronc rain': 'Rain',
    'ronc w_sp': 'Wind_Speed',
    'ronc w_dir': 'Wind_Direction'
    }, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  appa1_weather.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  appa2_weather.rename(columns={


In [9]:
# Convert wind direction from degrees to radians
def convert_windir(df):
    df["sin_wind"] = df["Wind_Direction"].apply(
            lambda x: math.sin(x) * 2 * (math.pi / 360)
        )
    df["cos_wind"] = df["Wind_Direction"].apply(
            lambda x: math.cos(x) * 2 * (math.pi / 360)
        )
    df.drop(columns='Wind_Direction', inplace=True)

convert_windir(appa1_weather)
convert_windir(appa2_weather)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sin_wind"] = df["Wind_Direction"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cos_wind"] = df["Wind_Direction"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sin_wind"] = df["Wind_Direction"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [10]:
# Add the weather data to the datasets
appa1 = appa1.merge(appa1_weather, on='Time', how='left')
appa2 = appa2.merge(appa2_weather, on='Time', how='left')

In [11]:
# Move ground truth labels to the end
def move_labels (df, labels):
    for label in labels:
        pos = len(df.columns) - 1
        col = df.pop(label)
        df.insert(pos, label, col)

appa1_labels, appa2_labels = ['PM10', 'NO2', 'SO2', 'O3', 'CO'], ['PM10', 'CO', 'NO2']
move_labels(appa1, appa1_labels)
move_labels(appa2, appa2_labels)

In [12]:
# Export final CSVs
appa1.to_csv('noam_exports/appa1.csv')
appa2.to_csv('noam_exports/appa2.csv')