In [4]:
# Notebook metadata: title, author, and project credits
__Title__ = "Using KNN to predict rainy weather patterns through temprature and wind speed"
__Author__ = "Sam Schnellmann"
__Credits__ = "The Coriges Weather Project [Gabriel Marafino, Seth Haldiman, Clayton Napoli, Sam Schnellmann]"

In [1]:
%pwd

'C:\\Users\\schne'

In [2]:
# Change into the folder where the rest of the team's code is stored
# NOTE: this is an absolute, machine-specific path; update it before running on another computer
%cd C:\Users\schne\OneDrive\Desktop\data mining\Data Mining project

C:\Users\schne\OneDrive\Desktop\data mining\Data Mining project


In [5]:
# Making sure that I'm in the right folder
%pwd

'C:\\Users\\schne\\OneDrive\\Desktop\\data mining\\Data Mining project'

In [6]:
# Importing everything I need / I know
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error

In [7]:
# Weather DataFrame: read the revised weather CSV from the current working directory
weather_csv_path = "weather_revised.csv"
weather_data = pd.read_csv(weather_csv_path)
# Bare last expression so the notebook renders the frame
weather_data

Unnamed: 0,Precipitation,Month,Week,Year,City,Code,State,AverageTemp,MaxTemp,MinTemp,WindDirection,WindSpeed
0,0.00,1,3,2016,Birmingham,BHM,Alabama,39,46,32,33,4.33
1,0.00,1,3,2016,Huntsville,HSV,Alabama,39,47,31,32,3.86
2,0.16,1,3,2016,Mobile,MOB,Alabama,46,51,41,35,9.73
3,0.00,1,3,2016,Montgomery,MGM,Alabama,45,52,38,32,6.86
4,0.01,1,3,2016,Anchorage,ANC,Alaska,34,38,29,19,7.80
...,...,...,...,...,...,...,...,...,...,...,...,...
15679,0.11,1,1,2017,Madison,MSN,Wisconsin,27,34,19,25,6.33
15680,0.15,1,1,2017,Milwaukee,MKE,Wisconsin,31,38,23,25,10.98
15681,0.00,1,1,2017,Cheyenne,CYS,Wyoming,32,42,21,26,15.16
15682,0.00,1,1,2017,Lander,LND,Wyoming,17,29,4,26,1.65


In [8]:
# TEMP
# Model inputs are the three temperature columns; the value to predict is precipitation
temperature_columns = ['MinTemp', 'MaxTemp', 'AverageTemp']
temperature_features = weather_data[temperature_columns]
target_temperature = weather_data['Precipitation']

In [9]:
# Temp
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
temp_split = train_test_split(
    temperature_features,
    target_temperature,
    test_size=0.2,
    random_state=42,
)
X_train_temp, X_test_temp, y_train_temp, y_test_temp = temp_split

In [10]:
# TEMP
# Fit the scaler on the training rows only, then apply that same scaling
# to both the training and the test temperature features
scaler_temp = StandardScaler().fit(X_train_temp)
X_train_scaled_temp = scaler_temp.transform(X_train_temp)
X_test_scaled_temp = scaler_temp.transform(X_test_temp)

In [11]:
# TEMP
# Create a 7-neighbor KNN regressor and train it on the scaled temperature data
knn_model_temp = KNeighborsRegressor(n_neighbors=7).fit(X_train_scaled_temp, y_train_temp)
# Bare last expression so the cell still displays the fitted estimator
knn_model_temp

KNeighborsRegressor(n_neighbors=7)

In [12]:
# TEMP
# Predict precipitation for the held-out, scaled temperature test rows
y_pred_temp = knn_model_temp.predict(X=X_test_scaled_temp)

In [13]:
# TEMP
# Score the temperature model: mean squared error between actual and predicted precipitation
mse_temp = mean_squared_error(y_true=y_test_temp, y_pred=y_pred_temp)

In [14]:
# WIND
# Model inputs are the two wind columns; the value to predict is precipitation
wind_columns = ['WindSpeed', 'WindDirection']
wind_features = weather_data[wind_columns]
target_wind = weather_data['Precipitation']

In [15]:
# WIND
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
wind_split = train_test_split(
    wind_features,
    target_wind,
    test_size=0.2,
    random_state=42,
)
X_train_wind, X_test_wind, y_train_wind, y_test_wind = wind_split

In [16]:
# WIND
# Fit the scaler on the training rows only, then apply that same scaling
# to both the training and the test wind features
scaler_wind = StandardScaler().fit(X_train_wind)
X_train_scaled_wind = scaler_wind.transform(X_train_wind)
X_test_scaled_wind = scaler_wind.transform(X_test_wind)

In [17]:
# WIND
# Create a 7-neighbor KNN regressor and train it on the scaled wind data
knn_model_wind = KNeighborsRegressor(n_neighbors=7).fit(X_train_scaled_wind, y_train_wind)
# Bare last expression so the cell still displays the fitted estimator
knn_model_wind

KNeighborsRegressor(n_neighbors=7)

In [18]:
# WIND
# Predict precipitation for the held-out, scaled wind test rows
y_pred_wind = knn_model_wind.predict(X=X_test_scaled_wind)

In [19]:
# WIND
# Score the wind model: mean squared error between actual and predicted precipitation
mse_wind = mean_squared_error(y_true=y_test_wind, y_pred=y_pred_wind)

In [20]:
# RESULTS
# Report both models' mean squared errors, one per line
print(
    f"Mean Squared Error (Temperature Model): {mse_temp:.2f}",
    f"Mean Squared Error (Wind Model): {mse_wind:.2f}",
    sep="\n",
)

Mean Squared Error (Temperature Model): 0.46
Mean Squared Error (Wind Model): 0.54


In [21]:
def _above_threshold(values, cutoff):
    """Return 1 where ``values`` is strictly greater than ``cutoff`` and 0 elsewhere."""
    return (values > cutoff).astype(int)


# START OF TEMP ACCURACY
# Cut-off used to turn precipitation values into two classes for the temperature model
threshold = 0.5

# Binarize both the regression predictions and the true test values with the same cut-off
y_pred_temp_binary = _above_threshold(y_pred_temp, threshold)
y_test_temp_binary = _above_threshold(y_test_temp, threshold)

# Share of test rows where the binarized prediction matches the binarized truth
accuracy_temp_binary = accuracy_score(y_test_temp_binary, y_pred_temp_binary)
# END OF TEMP ACCURACY

# START OF WIND ACCURACY
# Cut-off used to turn precipitation values into two classes for the wind model
threshold_wind = 0.5

# Binarize both the regression predictions and the true test values with the same cut-off
y_pred_wind_binary = _above_threshold(y_pred_wind, threshold_wind)
y_test_wind_binary = _above_threshold(y_test_wind, threshold_wind)

# Share of test rows where the binarized prediction matches the binarized truth
accuracy_wind_binary = accuracy_score(y_test_wind_binary, y_pred_wind_binary)
# END OF WIND ACCURACY

# Accuracy results, one model per line
print(
    f"Accuracy (Temperature Model): {accuracy_temp_binary:.2f}",
    f"Accuracy (Wind Model): {accuracy_wind_binary:.2f}",
    sep="\n",
)

Accuracy (Temperature Model): 0.65
Accuracy (Wind Model): 0.56


In [22]:
# In terms of predicting the precipitation level on a given day,
# the machine learning model that predicts more accurately is the one based on temperature measurements.
# Its Mean Squared Error is closer to 0 (0 meaning no errors when predicting): 0.46 versus 0.54 for wind.
# And its overall accuracy is higher, sitting at 65%

# Because this is still only 65%, it is not the greatest way of predicting precipitation levels accurately, but
#     this algorithm would still give us an idea of what a given day would look like based entirely
#     on temperature measurements

# My other KNN model, which uses wind, does work and may even be a tad faster, but it is still not as accurate,
#     nor is it error free. At 56% accuracy, any given day's prediction from this model is little better than a coin flip

In [23]:
# This program mainly revolves around MSE (Mean Squared Error), the KNN algorithm, and regressors
# MSE - MSE is a metric used to quantify the average squared difference between 
#       predicted values and the actual values in a regression problem
# KNN - KNN makes predictions based on the majority class or average value 
#       of the k nearest neighbors in the feature space
# Regressor - a model designed for predicting continuous values

In [24]:
# While playing around with the number of neighbors within the algorithm I noticed that the more
#     neighbors I added, the closer the MSE (Mean Squared Error) got to 0 -- at least up to
#     about 100 neighbors; at 10000 neighbors the MSE was higher again
#
#     One thing about MSE though, regardless of the number of neighbors:
#         A lower MSE would indicate that the program's predictions are, on average, closer to the
#         actual precipitation levels
#         And vice versa for a higher MSE (higher means farther from the actual values)
#
#     However, the use of fewer neighbors might capture more local details in the data, but we could risk
#     overfitting to noise and not generalizing to unseen data (this would lead to a higher MSE)
#
#     A larger number of neighbors could lead to a smoother model that generalizes better
#     This would indicate a better performance in terms of regression accuracy

# My experiment with different numbers of neighbors:
# With only 1 neighbor Temp: MSE = 0.80 & Accuracy = 0.63
#                      Wind: MSE = 0.91 & Accuracy = 0.59
#
# At 2 neighbors Temp: MSE = 0.59 & Accuracy = 0.63
#                Wind: MSE = 0.69 & Accuracy = 0.57
#
# At 3 neighbors Temp: MSE = 0.53 & Accuracy = 0.63
#                Wind: MSE = 0.62 & Accuracy = 0.56
#
# At 4 neighbors Temp: MSE = 0.47 & Accuracy = 0.65
#                Wind: MSE = 0.57 & Accuracy = 0.56
#
# At 7 neighbors Temp: MSE = 0.46 & Accuracy = 0.65
#                Wind: MSE = 0.54 & Accuracy = 0.56
#
# At 10 neighbors Temp: MSE = 0.44 & Accuracy = 0.65
#                 Wind: MSE = 0.54 & Accuracy = 0.54
#
# At 100 neighbors Temp: MSE = 0.41 & Accuracy = 0.65
#                  Wind: MSE = 0.48 & Accuracy = 0.53
#
# 1000 neighbors Temp: MSE = 0.44 & Accuracy = 0.63
#                Wind: MSE = 0.48 & Accuracy = 0.50
#
# 10000 neighbors Temp: MSE = 0.49 & Accuracy = 0.49
#                 Wind: MSE = 0.49 & Accuracy = 0.44

In [25]:
# With this in mind we can see that continuing to add more neighbors helps the MSE score level out
#    However, in doing so, we would see a very slight decline in the accuracy

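# What would be the best amount of neighbors to set the MSE score to 0?
#     No number of neighbors sets the MSE to 0: in my experiment it bottomed out around 100 neighbors
#     (0.41 for temp, 0.48 for wind) and was higher again at 10000 neighbors (0.49 for both)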

# What would be the best amount of neighbors to set accuracy closest to 1?
#     Setting the accuracy to 1 is an impossible task, but the closest we got
#     to 1 was within the range of 4-100 neighbors for temperature and only 1 neighbor for wind

# What really is the greatest number of neighbors to analyze this KNN machine learning algorithm?
#     The best number of neighbors to study our data would be around 7 neighbors, because the MSE for both
#     temp and wind is already close to its lowest value and this gives the most accurate depiction of the two