### Data Retrieval

In [None]:
from datetime import datetime
from meteostat import Daily, Point

# Date range of the daily observations to download.
start, end = datetime(1960, 1, 1), datetime(2023, 10, 16)

# Meteostat location used for the query (positional arguments kept as-is).
# NOTE(review): Meteostat documents the third Point argument as the altitude
# in metres; 40.7012 looks like a latitude -- confirm the intended value.
new_york = Point(40.7119, -73.9939, 40.7012)

# Build the daily query for that location and period and fetch it in one step.
data = Daily(new_york, start, end).fetch()
print(data)

# Persist the download; the preprocessing cell reads this file back.
data.to_csv('newyork_historical_weather_data.csv')

## Data Preprocessing and Data Cleaning

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Reload the raw download written by the retrieval cell.
data = pd.read_csv('newyork_historical_weather_data.csv')

# Quick inspection. A bare expression in the middle of a cell is evaluated and
# then discarded, so each summary is printed explicitly (DataFrame.info()
# prints by itself).
print(data.describe())
data.info()

print(data.isnull().sum())
print(data.shape)
data_copy = data.copy()  # untouched snapshot of the raw frame

# Drop the columns that are not used in the rest of the analysis.
data = data.drop(columns=['tmin', 'tmax', 'snow', 'wdir', 'wpgt', 'tsun'])

# thresh=4 keeps only the rows that have at least 4 non-null values.
# NOTE(review): the previous comment described this as removing rows with
# "more than 2" nulls; how many nulls thresh=4 tolerates depends on the number
# of columns in the CSV -- confirm the threshold matches the intent.
data_cleaned = data.dropna(thresh=4)
print(data_cleaned.isnull().sum())

# Per-column gap filling. The results are assigned back through .assign()
# instead of calling interpolate/fillna with inplace=True on a column
# selection: that chained in-place form is deprecated and no longer updates
# the parent frame under pandas Copy-on-Write.
#   tavg, wspd -> linear interpolation between neighbouring rows
#   pres       -> column mean
data_cleaned = data_cleaned.assign(
    tavg=data_cleaned['tavg'].interpolate(method='linear'),
    wspd=data_cleaned['wspd'].interpolate(method='linear'),
    pres=data_cleaned['pres'].fillna(data_cleaned['pres'].mean()),
)
# NOTE(review): any other column that still reports nulls below (e.g. 'prcp')
# is written out unimputed -- confirm that is intended.
print(data_cleaned.isnull().sum())

# The index is written as well; the modelling cell drops it again as
# 'Unnamed: 0' after reading the file back.
data_cleaned.to_csv('cleaned_ny_historical_weather_data.csv')


## Data Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib qt5
data=pd.read_csv("cleaned_ny_historical_weather_data.csv")
print(data.describe())
print(data.isnull().sum())
#automated data viz
import dtale
dtale.show(data).open_browser()
#heatmap
numeric_cols = ['tavg', 'prcp', 'wspd', 'pres', 'Visibility']
correlation_matrix = data[numeric_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()

## Machine Learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the cleaned data and drop the two columns that are not model inputs:
# 'Unnamed: 0' (the index column stored in the CSV) and 'time'.
data = (
    pd.read_csv("cleaned_ny_historical_weather_data.csv")
    .drop(columns=['Unnamed: 0', 'time'])
)

def preprocess_inputs(df, target='Visibility', train_size=0.7, random_state=123):
    """Split ``df`` into standardised train/test features and their targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``target`` column; every other column is used as
        a feature and passed through ``StandardScaler``.
    target : str, default 'Visibility'
        Name of the column to predict.
    train_size : float, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 123
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features. They keep the row labels of the rows they were built
        from, so they stay index-aligned with ``y_train`` / ``y_test``.
    y_train, y_test : pandas.Series
        Target values for the same rows.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Split df into X and y
    y = df[target].copy()
    X = df.drop(columns=target).copy()

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, so the test rows do not
    # influence the scaling statistics.
    scaler = StandardScaler()
    scaler.fit(X_train)

    # Rebuild the frames with the ORIGINAL row labels. Without index=..., the
    # scaled frames get a fresh 0..n-1 index while y_train / y_test keep the
    # shuffled labels, so any index-based join of X and y pairs the wrong rows.
    X_train = pd.DataFrame(
        scaler.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    return X_train, X_test, y_train, y_test

# Split and scale once; every model below is trained on the same partition.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

# Alias used by result_comp for the "Actual" column.
y_test_actual = y_test

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn import metrics

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Seeded like the random forest below: an unseeded decision tree can pick
# different splits from run to run, which makes its metrics non-reproducible.
dec_tree_reg = DecisionTreeRegressor(random_state=42)
dec_tree_reg.fit(X_train, y_train)

randomforest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
randomforest_reg.fit(X_train, y_train)

supp_vec_reg = SVR(kernel='linear')
supp_vec_reg.fit(X_train, y_train)

def model_evaluation(model, model_name):
    """Score ``model`` on the notebook-level ``X_test`` / ``y_test``.

    Returns a one-column DataFrame whose column is named ``model_name`` and
    whose rows are 'MAE', 'MSE', 'RMSE' and 'R2-Score'.
    """
    predictions = model.predict(X_test)

    # RMSE is derived from MSE, so compute MSE once and reuse it.
    mse = metrics.mean_squared_error(y_test, predictions)
    scores = {
        'MAE': metrics.mean_absolute_error(y_test, predictions),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R2-Score': metrics.r2_score(y_test, predictions),
    }

    return pd.DataFrame(
        list(scores.values()),
        index=list(scores),
        columns=[model_name],
    )

# Print the metric table of every fitted model; each table after the first is
# separated from the previous one by a blank line.
evaluated_models = [
    (linear_reg, 'Linear Reg.'),
    (dec_tree_reg, 'Decision Tree Reg.'),
    (randomforest_reg, 'Random Forest Reg.'),
    (supp_vec_reg, 'SVR'),
]
for position, (fitted_model, label) in enumerate(evaluated_models):
    report = model_evaluation(fitted_model, label)
    if position == 0:
        print(report)
    else:
        print('\n', report)

In [None]:
def result_comp(x,o="Regression Model"):
    """Print heading ``o`` and a table of actual vs. predicted test targets.

    ``x`` is a fitted model exposing ``predict``; it is applied to the
    notebook-level ``X_test`` and its output is shown next to
    ``y_test_actual``.
    """
    predicted = x.predict(X_test)
    print(o)
    comparison = pd.DataFrame({'Actual':y_test_actual, 'Predicted':predicted})
    # Same layout as print("\n", table, "\n"): newline, space, table, space, newline.
    print(f"\n {comparison} \n")
# Show the actual-vs-predicted table for each fitted model in turn.
for fitted_model, heading in (
    (linear_reg, "Linear Reg."),
    (dec_tree_reg, "Decision Tree Reg."),
    (randomforest_reg, "Random Forest Reg."),
    (supp_vec_reg, "Support Vector Reg."),
):
    result_comp(fitted_model, heading)