# Introduction
This project aims to help predict the likelihood of precipitation, and the flight delays that can result from it, using weather variables including temperature, wind speed, humidity, dew point, and pressure. The sample dataset used in the project contains 5727 rows (about 5% of the original rows) and 9 variables.

**About the Data Source:** The NOAA JFK dataset contains 114,546 hourly observations of various local climatological variables (including visibility, temperature, wind speed and direction, humidity, dew point, and pressure). The data was collected by a NOAA weather station located at the John F. Kennedy International Airport in Queens, New York.

**Data Link (sample dataset):** https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-sample-data.tar.gz

**License:** CDLA-Sharing

In [None]:
# Install the packages this notebook depends on, one after the other.
for (pkg in c("tidymodels", "rlang")) {
  install.packages(pkg)
}

In [None]:
# Load tidyverse & tidymodels
library(tidyverse)
library(tidymodels)

## Downloading and untarring the sample dataset

In [None]:
# Fetch the sample archive and unpack it into the working directory.
url <- "https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-sample-data.tar.gz"
archive_path <- "noaa-weather-sample-data.tar.gz"

# The archive is binary, so request a binary transfer explicitly; a text-mode
# transfer rewrites line endings on Windows and corrupts the file.
download.file(url, destfile = archive_path, mode = "wb")

# Use R's built-in tar implementation so no external `tar` program is needed.
untar(archive_path, tar = "internal")

## Exploring the data
Reading the dataset and exploring its columns, number of rows and data types.

In [None]:
# Load the extracted CSV, then preview its first rows and its column types.
jfk_weather_sample_df <- read.csv(
  file = "noaa-weather-sample-data/jfk_weather_sample.csv"
)
head(x = jfk_weather_sample_df)
glimpse(jfk_weather_sample_df)

## Data Preprocessing
For preprocessing the following columns are selected and inspected:

* HOURLYRelativeHumidity
* HOURLYDRYBULBTEMPF
* HOURLYPrecip
* HOURLYWindSpeed
* HOURLYStationPressure

In [None]:
# Keep only the five hourly measurements used in the rest of the notebook.
weather_columns <- c(
  "HOURLYRelativeHumidity",
  "HOURLYDRYBULBTEMPF",
  "HOURLYPrecip",
  "HOURLYWindSpeed",
  "HOURLYStationPressure"
)
jfk_weather_sub_df <- select(jfk_weather_sample_df, all_of(weather_columns))

# Preview the first ten rows of the reduced data frame.
head(jfk_weather_sub_df, n = 10)

In [None]:
# List the distinct raw precipitation values to spot non-numeric entries.
unique(jfk_weather_sub_df[["HOURLYPrecip"]])

### Data Cleaning
Cleaning values that have characters ("T" and "s").

In [None]:
# Step 1: substitute "0.0" for every "T" found in the precipitation strings.
jfk_weather_sub_df2 <- mutate(
  jfk_weather_sub_df,
  HOURLYPrecip = str_replace_all(HOURLYPrecip, pattern = "T", replacement = "0.0")
)

# Step 2: drop an "s" when it is the final character of a value.
jfk_weather_sub_df3 <- mutate(
  jfk_weather_sub_df2,
  HOURLYPrecip = str_remove(HOURLYPrecip, "s$")
)

# List the distinct values that remain after cleaning.
unique(jfk_weather_sub_df3[["HOURLYPrecip"]])

### Converting Datatype
Convert HOURLYPrecip (and any other column still stored as character) to numeric.

In [None]:
# Re-parse every column to its natural type, then force any column that is
# still character to numeric.
#
# `as.is = TRUE` is passed explicitly: without it, type.convert() warns on
# R >= 4.0, and on older R it returns factors, which the is.character step
# below would silently skip.
jfk_weather_sub_df4 <- jfk_weather_sub_df3 %>%
  mutate(across(everything(), function(col) type.convert(col, as.is = TRUE))) %>%
  mutate(across(where(is.character), as.numeric))

# Check the resulting column types.
glimpse(jfk_weather_sub_df4)

### Renaming Columns

*   'HOURLYRelativeHumidity' to 'relative_humidity'
*   'HOURLYDRYBULBTEMPF' to 'dry_bulb_temp_f'
*   'HOURLYPrecip' to 'precip'
*   'HOURLYWindSpeed' to 'wind_speed'
*   'HOURLYStationPressure' to 'station_pressure'

In [None]:
# Switch to short snake_case column names (new_name = old_name).
jfk_weather_sub_df5 <- rename(
  jfk_weather_sub_df4,
  relative_humidity = HOURLYRelativeHumidity,
  dry_bulb_temp_f = HOURLYDRYBULBTEMPF,
  precip = HOURLYPrecip,
  wind_speed = HOURLYWindSpeed,
  station_pressure = HOURLYStationPressure
)

# Verify the new column names.
glimpse(jfk_weather_sub_df5)

## Data Training - Train/Test Split
Splitting the data into a training (80%) and testing set (20%).

In [None]:
# Fix the RNG seed so the random partition is reproducible.
set.seed(1234)

# 80% of the rows go to training; the remainder is held out for testing.
train_test <- jfk_weather_sub_df5 %>%
  initial_split(prop = 0.8)
train_set <- train_test %>% training()
test_set <- train_test %>% testing()

## Data Modeling - Simple Linear Regression
Exploring four simple linear regression models where:
* precip as the dependent variable ~ relative_humidity as independent variable
* precip ~ dry_bulb_temp_f
* precip ~ wind_speed
* precip ~ station_pressure

Also calculating R Square and RMSE for each model along the way.

In [None]:
# Model 1: precip regressed on relative_humidity, fitted on the training set.
lm_1_train <- lm(formula = precip ~ relative_humidity, data = train_set)
lm_1_train

# Training mean squared error from the fitted model's residuals.
mse_lm_1_train <- mean(residuals(lm_1_train)^2)
mse_lm_1_train

# Root mean squared error.
rmse_lm_1_train <- sqrt(mse_lm_1_train)
rmse_lm_1_train

# Training R-squared.
summary(lm_1_train)[["r.squared"]]


# Scatter plot of the response against the predictor.
ggplot(data = train_set, mapping = aes(x = relative_humidity, y = precip)) +
  geom_point()

In [None]:
# Model 2: precip regressed on dry_bulb_temp_f, fitted on the training set.
lm_2_train <- lm(formula = precip ~ dry_bulb_temp_f, data = train_set)
lm_2_train

# Training mean squared error from the fitted model's residuals.
mse_lm_2_train <- mean(residuals(lm_2_train)^2)
mse_lm_2_train

# Root mean squared error.
rmse_lm_2_train <- sqrt(mse_lm_2_train)
rmse_lm_2_train

# Training R-squared.
summary(lm_2_train)[["r.squared"]]


# Scatter plot of the response against the predictor.
ggplot(data = train_set, mapping = aes(x = dry_bulb_temp_f, y = precip)) +
  geom_point()

In [None]:
# Model 3: precip regressed on wind_speed, fitted on the training set.
lm_3_train <- lm(formula = precip ~ wind_speed, data = train_set)
lm_3_train

# Training mean squared error from the fitted model's residuals.
mse_lm_3_train <- mean(residuals(lm_3_train)^2)
mse_lm_3_train

# Root mean squared error.
rmse_lm_3_train <- sqrt(mse_lm_3_train)
rmse_lm_3_train

# Training R-squared.
summary(lm_3_train)[["r.squared"]]


# Scatter plot of the response against the predictor.
ggplot(data = train_set, mapping = aes(x = wind_speed, y = precip)) +
  geom_point()

In [None]:
# Model 4: precip regressed on station_pressure, fitted on the training set.
lm_4_train <- lm(formula = precip ~ station_pressure, data = train_set)
lm_4_train

# Training mean squared error from the fitted model's residuals.
mse_lm_4_train <- mean(residuals(lm_4_train)^2)
mse_lm_4_train

# Root mean squared error.
rmse_lm_4_train <- sqrt(mse_lm_4_train)
rmse_lm_4_train

# Training R-squared.
summary(lm_4_train)[["r.squared"]]

# Scatter plot of the response against the predictor.
ggplot(data = train_set, mapping = aes(x = station_pressure, y = precip)) +
  geom_point()

## Data Modeling - Multiple Linear Regression
Exploring two multiple linear regression models:

* precip ~ relative_humidity + station_pressure 
* precip ~ relative_humidity + station_pressure + wind_speed

Also calculating R Square and RMSE for each model along the way.

In [None]:
# Multiple regression 1: precip on relative_humidity and station_pressure,
# fitted on the training set.
mlr_1_train <- lm(
  formula = precip ~ relative_humidity + station_pressure,
  data = train_set
)
mlr_1_train

# Training mean squared error from the fitted model's residuals.
mse_mlr_1_train <- mean(residuals(mlr_1_train)^2)
mse_mlr_1_train

# Root mean squared error.
rmse_mlr_1_train <- sqrt(mse_mlr_1_train)
rmse_mlr_1_train

# Training R-squared.
summary(mlr_1_train)[["r.squared"]]

In [None]:
# Multiple regression 2: precip on relative_humidity, station_pressure and
# wind_speed, fitted on the training set.
mlr_2_train <- lm(
  formula = precip ~ relative_humidity + station_pressure + wind_speed,
  data = train_set
)
mlr_2_train

# Training mean squared error from the fitted model's residuals.
mse_mlr_2_train <- mean(residuals(mlr_2_train)^2)
mse_mlr_2_train

# Root mean squared error.
rmse_mlr_2_train <- sqrt(mse_mlr_2_train)
rmse_mlr_2_train

# Training R-squared.
summary(mlr_2_train)[["r.squared"]]

## Evaluating the models on the testing set

In [None]:
# Evaluate model 1 (precip ~ relative_humidity) on the held-out test set.
# The model fitted on train_set is reused unchanged: refitting on test_set
# would report in-sample fit on different rows, not out-of-sample error.
lm_1_test_pred <- predict(lm_1_train, newdata = test_set)

# Prediction errors; rows with a missing response or predictor yield NA and
# are excluded from the metrics below.
lm_1_test_err <- test_set$precip - lm_1_test_pred
lm_1_test_ok <- !is.na(lm_1_test_err)

# Test mean squared error.
mse_lm_1_test <- mean(lm_1_test_err[lm_1_test_ok]^2)
mse_lm_1_test

# Test root mean squared error.
rmse_lm_1_test <- sqrt(mse_lm_1_test)
rmse_lm_1_test

# Out-of-sample R-squared: 1 - SSE / SST over the scored rows.
1 - sum(lm_1_test_err[lm_1_test_ok]^2) /
  sum((test_set$precip[lm_1_test_ok] - mean(test_set$precip[lm_1_test_ok]))^2)

In [None]:
# Evaluate model 2 (precip ~ dry_bulb_temp_f) on the held-out test set.
# The model fitted on train_set is reused unchanged: refitting on test_set
# would report in-sample fit on different rows, not out-of-sample error.
lm_2_test_pred <- predict(lm_2_train, newdata = test_set)

# Prediction errors; rows with a missing response or predictor yield NA and
# are excluded from the metrics below.
lm_2_test_err <- test_set$precip - lm_2_test_pred
lm_2_test_ok <- !is.na(lm_2_test_err)

# Test mean squared error.
mse_lm_2_test <- mean(lm_2_test_err[lm_2_test_ok]^2)
mse_lm_2_test

# Test root mean squared error.
rmse_lm_2_test <- sqrt(mse_lm_2_test)
rmse_lm_2_test

# Out-of-sample R-squared: 1 - SSE / SST over the scored rows.
1 - sum(lm_2_test_err[lm_2_test_ok]^2) /
  sum((test_set$precip[lm_2_test_ok] - mean(test_set$precip[lm_2_test_ok]))^2)

In [None]:
# Evaluate model 3 (precip ~ wind_speed) on the held-out test set.
# The model fitted on train_set is reused unchanged: refitting on test_set
# would report in-sample fit on different rows, not out-of-sample error.
lm_3_test_pred <- predict(lm_3_train, newdata = test_set)

# Prediction errors; rows with a missing response or predictor yield NA and
# are excluded from the metrics below.
lm_3_test_err <- test_set$precip - lm_3_test_pred
lm_3_test_ok <- !is.na(lm_3_test_err)

# Test mean squared error.
mse_lm_3_test <- mean(lm_3_test_err[lm_3_test_ok]^2)
mse_lm_3_test

# Test root mean squared error.
rmse_lm_3_test <- sqrt(mse_lm_3_test)
rmse_lm_3_test

# Out-of-sample R-squared: 1 - SSE / SST over the scored rows.
1 - sum(lm_3_test_err[lm_3_test_ok]^2) /
  sum((test_set$precip[lm_3_test_ok] - mean(test_set$precip[lm_3_test_ok]))^2)

In [None]:
# Evaluate model 4 (precip ~ station_pressure) on the held-out test set.
# The model fitted on train_set is reused unchanged: refitting on test_set
# would report in-sample fit on different rows, not out-of-sample error.
lm_4_test_pred <- predict(lm_4_train, newdata = test_set)

# Prediction errors; rows with a missing response or predictor yield NA and
# are excluded from the metrics below.
lm_4_test_err <- test_set$precip - lm_4_test_pred
lm_4_test_ok <- !is.na(lm_4_test_err)

# Test mean squared error.
mse_lm_4_test <- mean(lm_4_test_err[lm_4_test_ok]^2)
mse_lm_4_test

# Test root mean squared error.
rmse_lm_4_test <- sqrt(mse_lm_4_test)
rmse_lm_4_test

# Out-of-sample R-squared: 1 - SSE / SST over the scored rows.
1 - sum(lm_4_test_err[lm_4_test_ok]^2) /
  sum((test_set$precip[lm_4_test_ok] - mean(test_set$precip[lm_4_test_ok]))^2)

In [None]:
# Evaluate multiple regression 1 (precip ~ relative_humidity +
# station_pressure) on the held-out test set. The model fitted on train_set is
# reused unchanged: refitting on test_set would report in-sample fit on
# different rows, not out-of-sample error.
mlr_1_test_pred <- predict(mlr_1_train, newdata = test_set)

# Prediction errors; rows with a missing response or predictor yield NA and
# are excluded from the metrics below.
mlr_1_test_err <- test_set$precip - mlr_1_test_pred
mlr_1_test_ok <- !is.na(mlr_1_test_err)

# Test mean squared error.
mse_mlr_1_test <- mean(mlr_1_test_err[mlr_1_test_ok]^2)
mse_mlr_1_test

# Test root mean squared error.
rmse_mlr_1_test <- sqrt(mse_mlr_1_test)
rmse_mlr_1_test

# Out-of-sample R-squared: 1 - SSE / SST over the scored rows.
1 - sum(mlr_1_test_err[mlr_1_test_ok]^2) /
  sum((test_set$precip[mlr_1_test_ok] - mean(test_set$precip[mlr_1_test_ok]))^2)

In [None]:
# Evaluate multiple regression 2 (precip ~ relative_humidity +
# station_pressure + wind_speed) on the held-out test set. The model fitted on
# train_set is reused unchanged: refitting on test_set would report in-sample
# fit on different rows, not out-of-sample error.
mlr_2_test_pred <- predict(mlr_2_train, newdata = test_set)

# Prediction errors; rows with a missing response or predictor yield NA and
# are excluded from the metrics below.
mlr_2_test_err <- test_set$precip - mlr_2_test_pred
mlr_2_test_ok <- !is.na(mlr_2_test_err)

# Test mean squared error.
mse_mlr_2_test <- mean(mlr_2_test_err[mlr_2_test_ok]^2)
mse_mlr_2_test

# Test root mean squared error.
rmse_mlr_2_test <- sqrt(mse_mlr_2_test)
rmse_mlr_2_test

# Out-of-sample R-squared: 1 - SSE / SST over the scored rows.
1 - sum(mlr_2_test_err[mlr_2_test_ok]^2) /
  sum((test_set$precip[mlr_2_test_ok] - mean(test_set$precip[mlr_2_test_ok]))^2)

## Finding the best model
Evaluating the best model based on the highest R Square and the lowest RMSE.

In [None]:
# Side-by-side summary of the six models: one row per model, with RMSE and
# R-squared on the training and testing sets.
#
# The metric values are hard-coded numbers; re-transcribe them if the data,
# seed or models change. They are stored as numeric (not quoted strings) so
# that sorting and comparisons work on their values rather than alphabetically.
model_names <- c("lm_1", "lm_2", "lm_3", "lm_4", "mlr_1", "mlr_2")
train_rmse <- c(
  0.042633884986004, 0.043392385033562, 0.0433315518426125,
  0.0430772440922356, 0.0425098471997881, 0.0423731277100182
)
test_rmse <- c(
  0.03322053062065, 0.0342407570726119, 0.0336190623854605,
  0.0336635239626001, 0.0329297928037371, 0.0321171813867534
)
train_rsquared <- c(
  0.0348125959119728, 0.00016376925087888, 0.00296520979252207,
  0.01557426038156, 0.0413364254294442, 0.0474929860384939
)
test_rsquared <- c(
  0.0587280705157021, 0.0000260319245104704, 0.0360085732827907,
  0.0334571088498333, 0.0751315309095512, 0.120214446648217
)
comparison_df <- data.frame(
  model_names, train_rmse, test_rmse, train_rsquared, test_rsquared
)

In [None]:
# Rank the models: highest training R-squared first, ties broken by the lowest
# training RMSE.
comparison_df %>%
  # Coerce the metric columns to numeric (a no-op if they already are) so the
  # ordering is by value rather than alphabetical.
  mutate(across(-model_names, as.numeric)) %>%
  # Comparison data frame is arranged by descending R-squared values and ascending RMSE values
  arrange(desc(train_rsquared), train_rmse)
# Based on the outcome, mlr_2 offers the best model since it has the highest R-squared and the lowest RMSE.