# Read in Covid Data
# Create csv file with preprocessed covid data

In [3]:
import pandas as pd
import csv

# Build covid.csv (date + daily case count for 2020) from the raw NYC export
# COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv
# download from https://data.cityofnewyork.us/Health/COVID-19-Daily-Counts-of-Cases-Hospitalizations-an/rc75-m7u3
covid_data = pd.read_csv(
    r"../raw_data/raw/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv",
    index_col=False,
    delimiter=',',
    header=0,
    low_memory=False,
)
# Process-wide pandas setting kept from the original cell in case later cells
# rely on it; nothing below needs the warning silenced any more.
pd.options.mode.chained_assignment = None

# Keep only the rows whose date mentions 2020 (rows with a missing date are
# treated as non-matching instead of breaking the boolean mask).
# The index is renumbered from 0 so row positions and row labels agree; the
# previous code indexed the filtered data with range(len(...)), which is only
# valid when the surviving rows happen to be labelled 0..n-1.
is_2020 = covid_data['DATE_OF_INTEREST'].str.contains('2020', na=False)
covid_data = covid_data.loc[is_2020].reset_index(drop=True)

# Convert the date from mm/dd/yyyy to yyyy-mm-dd, the form used by the
# green / yellow taxi data. Done with vectorised string slices (same slices as
# before) and written back into the frame so covid_data and covid_date agree.
raw_date = covid_data['DATE_OF_INTEREST']
covid_data['DATE_OF_INTEREST'] = (
    raw_date.str[6:] + '-' + raw_date.str[0:2] + '-' + raw_date.str[3:5]
)

# The two columns needed for preprocessing
covid_date = covid_data['DATE_OF_INTEREST']
covid_count = covid_data['CASE_COUNT']
dim_covid = len(covid_date)

# Create covid.csv containing only the date and the case count, for use in
# Preprocess.py. Rows are paired positionally, so no label lookup is involved.
field_order = ["DATE_OF_INTEREST", "CASE_COUNT"]
with open("../raw_data/raw/covid.csv", 'w', encoding="utf-8", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(field_order)
    writer.writerows(zip(covid_date, covid_count))
print('finish')

finish


# Read in Weather Data (Temperature & Relative Humidity)
# Create csv file with preprocessed human comfort index (DI) data

In [4]:

# Load the external weather export. Its first nine rows do not contain
# informative data, so they are removed and the remaining rows renumbered.
# reset_index() keeps the old labels in a temporary 'index' column, which is
# discarded together with the other unused columns further down.
weather = pd.read_csv(
    r"../raw_data/raw/weather_data.csv",
    index_col=False,
    delimiter=',',
    header=0,
    low_memory=False,
)
weather = weather.drop(list(range(9))).reset_index()

# Average temperature and average relative humidity as floats
temperature = weather["New York.2"].astype(float)
humidity = weather["New York.5"].astype(float)

# Human comfort (discomfort) index:
#   DI = T - (0.55 - 0.55f) * (T - 58)
# where T is the temperature (unit of Fahrenheit) and f is the relative
# humidity as a fraction (the humidity column is in %, hence the / 100).
# Cited from
# Yang, D., & Qian, Y. (Eds.). (2021). Proceedings of the 24th International Symposium on Advancement of Construction Management and Real Estate. Springer.
discomfort = temperature - (0.55 - 0.55 * humidity / 100) * (temperature - 58)

# Rescale DI into (-Inf, 0]: 0 means completely comfortable, and the further
# below 0 the value is, the more uncomfortable.
# DI = 67.5 is taken as the completely comfortable value,
# refer from https://en.calculator.jp/health/discomfort/
weather["DIcomfort_index"] = ((discomfort - 67.5).abs() / 2.5 * (-0.5)).round(5)

# Turn the "yyyymmddT0000" stamps of column 'location' into "yyyy-mm-dd"
date_location = weather['location'].tolist()
date = [f"{stamp[0:4]}-{stamp[4:6]}-{stamp[6:8]}" for stamp in date_location]
date_column = pd.Series(date)

# Remove every column with no useful information in a single call, then store
# the reformatted dates in column 'date'
weather = weather.drop(columns=[
    'New York',
    'New York.1',
    'New York.2',
    'New York.3',
    'New York.4',
    'New York.5',
    'index',
    'location',
])
weather['date'] = date_column.values

# Write the dataframe to a csv file used later when merging the data
weather.to_csv("../raw_data/raw/comfortIndex.csv", index=False)

print("finish")

finish
