IMports


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

Train and test data

In [3]:
# For setting up the random state
import random
random.seed(42)

data = pd.read_csv("../input/weather-dataset-us/sampled_data.csv")

In [20]:
data

Unnamed: 0,ID,DATE,TMAX,TMIN,EVAP,PRCP,Latitude,Longitude,Elevation
0,USC00417140,11/24/2018,244.0,111.0,41.0,3.0,28.6575,-96.5553,6.1
1,US10cust046,12/6/2016,0.0,0.0,0.0,0.0,41.4816,-99.8928,698.0
2,US1OKGR0001,11/29/2009,0.0,0.0,0.0,3.0,34.9594,-99.3889,498.0
3,USC00091732,3/13/1996,217.0,-28.0,0.0,0.0,34.0589,-85.2339,239.3
4,USC00418566,10/7/2018,194.0,139.0,0.0,84.0,33.4792,-100.8761,700.1
...,...,...,...,...,...,...,...,...,...
7792040,US1MOBN0056,11/3/2014,0.0,0.0,0.0,0.0,38.8600,-92.3631,217.3
7792041,US1ORCC0019,2/22/2017,0.0,0.0,0.0,135.0,45.1725,-122.4375,214.0
7792042,USS0010J30S,5/14/1992,137.0,-20.0,0.0,0.0,40.5800,-110.5900,3230.9
7792043,US1ORDS0049,11/14/2020,0.0,0.0,0.0,213.0,44.1063,-121.2897,1054.9


In [9]:
data.isna().sum()

ID                 0
DATE               0
TMAX         3810345
TMIN         3812880
EVAP         7705944
PRCP          710903
Latitude           0
Longitude          0
Elevation          0
dtype: int64

In [10]:
data = data.fillna(0)

In [11]:
# Features and target
X = data[['Latitude', 'Longitude']]
y = data['PRCP']

# Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Initialize models

In [16]:
# Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)

# XGBoost Regressor
xgb_model = XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1)

# Linear Regression Model
lr_model = LinearRegression()

# K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor(n_neighbors=5)

# List of models to iterate over
models = [dt_model, xgb_model, lr_model, knn_model]
model_names = ['Decision Tree', 'XGBoost', 'Linear Regression', 'K-Nearest Neighbors']


Training model 

In [22]:
# Loop to train and evaluate all models
results = {}

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate the metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[name] = {'MSE': mse, 'R2': r2}
    
    print(f"{name} Model Performance:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("-" * 50)

Decision Tree Model Performance:
Mean Squared Error: 6664.5147
R-squared: 0.0285
--------------------------------------------------
XGBoost Model Performance:
Mean Squared Error: 6711.5455
R-squared: 0.0217
--------------------------------------------------
Linear Regression Model Performance:
Mean Squared Error: 6816.3665
R-squared: 0.0064
--------------------------------------------------
K-Nearest Neighbors Model Performance:
Mean Squared Error: 7926.1158
R-squared: -0.1554
--------------------------------------------------
Random Forest Model Performance:
Mean Squared Error: 6696.3402
R-squared: 0.0239
--------------------------------------------------


ModuleNotFoundError: No module named 'ace_tools'

In [23]:
# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, n_estimators=5,max_depth=10)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Calculate the metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

results[name] = {'MSE': mse, 'R2': r2}

print(f"{name} Model Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")
print("-" * 50)

Random Forest Model Performance:
Mean Squared Error: 6697.8608
R-squared: 0.0237
--------------------------------------------------


Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Example: Grid Search for Random Forest
param_grid = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(rf_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best parameters for Random Forest: {grid_search.best_params_}")
