In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Load the raw observations from disk.
data_raw = pd.read_csv('weather_data.csv')

# Month abbreviation -> month number (Jan=1 ... Dec=12).
month_mapping = {
    abbr: number
    for number, abbr in enumerate(
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        start=1,
    )
}

# Build the working frame in a single pipeline:
#   1. discard 'Precipitation' (the author notes it contains only zeros),
#   2. replace month abbreviations with their numbers,
#   3. assemble a datetime 'Date' column from year / Month / Day,
#   4. drop the now-redundant date parts and use 'Date' as the index,
#   5. cast every remaining column to float.
data = (
    data_raw
    .drop(columns=['Precipitation'])
    .assign(Month=lambda frame: frame['Month'].replace(month_mapping))
    .assign(Date=lambda frame: pd.to_datetime(frame[['year', 'Month', 'Day']]))
    .drop(columns=['year', 'Month', 'Day'])
    .set_index('Date')
    .astype(float)
)

In [3]:
# Outlier handling.
#
# Statement order matters here: each *_Min reconstruction runs before the
# floor on the same column, and Pressure_Avg is floored before it is used to
# rebuild Pressure_Min.

def _rebuild_min(frame, prefix, flagged):
    """Overwrite the flagged `<prefix>_Min` values with `2 * Avg - Max`.

    `2 * Avg - Max` is the minimum implied when Avg is the midpoint of Min and
    Max. The result is rounded to one decimal before being written back.

    Parameters:
        frame: DataFrame holding `<prefix>_Min`, `<prefix>_Avg`, `<prefix>_Max`;
            modified in place.
        prefix: variable name, e.g. 'Temp'.
        flagged: boolean mask (aligned with `frame`) of rows to rebuild.
    """
    doubled_avg = frame.loc[flagged, f'{prefix}_Avg'] * 2
    rebuilt = doubled_avg - frame.loc[flagged, f'{prefix}_Max']
    frame.loc[flagged, f'{prefix}_Min'] = rebuilt.apply(lambda value: round(value, 1))


# Temperature: rebuild Temp_Min where it is exactly 0, then floor it at 30.
_rebuild_min(data, 'Temp', data['Temp_Min'] == 0)
temp_min_low = data['Temp_Min'] <= 30
data.loc[temp_min_low, 'Temp_Min'] = 30

# Dew: rebuild Dew_Min where it is <= 0, then floor it at 10.
_rebuild_min(data, 'Dew', data['Dew_Min'] <= 0)
dew_min_low = data['Dew_Min'] <= 10
data.loc[dew_min_low, 'Dew_Min'] = 10

# Humidity: rebuild Humidity_Min where it is <= 0, then floor it at 20.
_rebuild_min(data, 'Humidity', data['Humidity_Min'] <= 0)
humidity_min_low = data['Humidity_Min'] <= 20
data.loc[humidity_min_low, 'Humidity_Min'] = 20

# Wind: cap Wind_Max at 50.
wind_max_high = data['Wind_Max'] >= 50
data.loc[wind_max_high, 'Wind_Max'] = 50

# Pressure: floor Pressure_Avg at 27.3 first, then rebuild Pressure_Min where
# it is exactly 0 (the rebuild reads the already-floored average).
pressure_avg_low = data['Pressure_Avg'] < 27.3
data.loc[pressure_avg_low, 'Pressure_Avg'] = 27.3
_rebuild_min(data, 'Pressure', data['Pressure_Min'] == 0)

In [4]:
# Add engineered features for every measured variable.
#
# For each of Temp, Dew, Humidity, Wind and Pressure the following columns are
# appended, in this order:
#   <var>_Max_3, <var>_Max_7, <var>_Min_3, <var>_Min_7, <var>_Avg_3, <var>_Avg_7
#       rolling means over the current row and the 2 / 6 rows before it
#   <var>_Diff_Max_Avg, <var>_Diff_Min_Avg, <var>_Diff_Max_Min
#       same-row differences between the three statistics
# Every feature is rounded to 2 decimals.
#
# NOTE(review): rolling(window=n) counts rows, not calendar days, so the
# windows span 3 / 7 days only if the index is sorted with one row per day --
# confirm against the CSV.

def _add_variable_features(frame, variable, windows=(3, 7), decimals=2):
    """Append rolling-mean and spread features for one variable, in place.

    Parameters:
        frame: DataFrame containing `<variable>_Max`, `<variable>_Min` and
            `<variable>_Avg`; new columns are added to it directly.
        variable: column prefix, e.g. 'Temp'.
        windows: rolling window sizes (in rows) to average over.
        decimals: number of decimals every new feature is rounded to.
    """
    # Rolling means: all windows for Max, then Min, then Avg.
    for stat in ('Max', 'Min', 'Avg'):
        source = frame[f'{variable}_{stat}']
        for window in windows:
            # The first `window - 1` rows have no full window and come out NaN.
            frame[f'{variable}_{stat}_{window}'] = (
                source.rolling(window=window).mean().round(decimals)
            )

    # Same-row spreads between the three statistics.
    for left, right in (('Max', 'Avg'), ('Min', 'Avg'), ('Max', 'Min')):
        frame[f'{variable}_Diff_{left}_{right}'] = (
            frame[f'{variable}_{left}'] - frame[f'{variable}_{right}']
        ).round(decimals)


for variable in ('Temp', 'Dew', 'Humidity', 'Wind', 'Pressure'):
    _add_variable_features(data, variable)



In [10]:
# Keep the rows dated on or before 2021-01-01 and report how many there are.
# NOTE(review): the recorded output of this cell is 7307, while the earlier
# inline comment claimed 1095 rows. Confirm whether `>=` (rows from 2021-01-01
# onwards) was the intended comparison for a frame named `data_test`.
on_or_before_cutoff = data.index <= '2021-01-01'
data_test = data.loc[on_or_before_cutoff]
len(data_test)

7307

In [None]:
# Add a 'target' column that copies 'Temp_Avg', then discard every row that
# still contains a NaN in any column.
data = data.assign(target=data['Temp_Avg']).dropna()
# Write the resulting frame to a new CSV file.
data.to_csv('weather_data_cleaned.csv')