In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Load dataset
df = pd.read_csv("daily-min-temperatures.csv")

df.head()


Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [2]:
df.shape

(3650, 2)

In [3]:
df.dtypes

Date     object
Temp    float64
dtype: object

In [4]:
df.isna().mean()

Date    0.0
Temp    0.0
dtype: float64

In [5]:
#Convert Date to datetime and sort
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

df.describe()

Unnamed: 0,Date,Temp
count,3650,3650.0
mean,1985-12-31 07:12:00,11.177753
min,1981-01-01 00:00:00,0.0
25%,1983-07-02 06:00:00,8.3
50%,1985-12-31 12:00:00,11.0
75%,1988-06-30 18:00:00,14.0
max,1990-12-31 00:00:00,26.3
std,,4.071837


In [None]:
plt.figure(figsize=(10,4))
plt.plot(df["Date"], df["Temp"])
plt.title("Daily Minimum Temperatures Over Time")
plt.xlabel("Date")
plt.ylabel("Temperature")
plt.tight_layout()
plt.show()

In [7]:
#The daily minimum temperature dataset contains a single continuous target variable (Temp) recorded once per day, with a total of 3,650 observations and 2 columns (Date and Temp). After converting the Date column to a proper datetime type and sorting by time, The statistics show that minimum temperatures range roughly between 0°C and 27°C, with an average around the mid-teens.

In [8]:
# Lag features allow the model to learn from past temperatures.
df["lag1"] = df["Temp"].shift(1)
df["lag2"] = df["Temp"].shift(2)
df["lag3"] = df["Temp"].shift(3)
df["lag7"] = df["Temp"].shift(7)   # weekly pattern

# Rolling average features (smooths weekly behavior)
df["roll_mean7"] = df["Temp"].rolling(window=7).mean()

# Drop rows with NaN created by lagging
df_model = df.dropna().reset_index(drop=True)

df_model.head()


Unnamed: 0,Date,Temp,lag1,lag2,lag3,lag7,roll_mean7
0,1981-01-08,17.4,15.8,15.8,15.8,20.7,16.585714
1,1981-01-09,21.8,17.4,15.8,15.8,17.9,17.142857
2,1981-01-10,20.0,21.8,17.4,15.8,18.8,17.314286
3,1981-01-11,16.2,20.0,21.8,17.4,14.6,17.542857
4,1981-01-12,13.3,16.2,20.0,21.8,15.8,17.185714


In [9]:
# Separate features and target
X = df_model[["lag1", "lag2", "lag3", "lag7", "roll_mean7"]]
y = df_model["Temp"]

# Time-series train/test split (no shuffle)
split_idx = int(len(df_model) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

X_train.shape, X_test.shape


((2914, 5), (729, 5))

In [10]:
#Because this dataset is a time series, preprocessing focuses on converting the single temperature column into a set of meaningful input features. We created lag features so that the model can learn how past temperatures influence future temperatures. We also added a 7-day rolling mean to capture short-term seasonal trends. These transformations introduce missing values at the start of the series, so the resulting rows were dropped after feature creation. Since time must be preserved, the train/test split was performed chronologically without shuffling, using the first 80% of observations for training and the final 20% for testing. Finally, the input features were standardized for use with linear regression models.

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# --- Linear Regression model ---
lin_reg = LinearRegression()

# Train the model on unscaled features
lin_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_lin = lin_reg.predict(X_test)

# Compute metrics
lin_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lin))
lin_mae = mean_absolute_error(y_test, y_pred_lin)
lin_r2 = r2_score(y_test, y_pred_lin)

lin_rmse, lin_mae, lin_r2


(1.987648458816895, 1.5785504357540898, 0.7655791832393135)

In [16]:
# The model assumes a linear relationship between past temperatures and the future temperature we are trying to predict. After training the model on the unscaled time-series features, its performance on the test set resulted in:
#RMSE: ~1.99
#MAE: ~1.58
#R²: ~0.77
#These metrics show that the model captures a substantial amount of the variance in the temperature data, but the errors indicate the presence of nonlinear patterns that a linear model cannot fully represent. Because daily temperatures often depend on seasonal cycles and nonlinear dynamics.

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# --- Random Forest Regression model ---
rf_reg = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

# Train the model
rf_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_reg.predict(X_test)

# Compute metrics
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

rf_rmse, rf_mae, rf_r2


(2.043097249590647, 1.6032135345221772, 0.7523176266773317)

In [19]:
# Random Forest works by training many decision trees on different subsets of the data and averaging their predictions, which reduces variance and improves generalization. The model was trained with 300 trees and produced the following test performance:
#RMSE: ~2.04
#MAE: ~1.60
#R²: ~0.75
# Compared to Linear Regression, the Random Forest model did not outperform the baseline and actually produced slightly higher error values

In [20]:
#Linear Regression and Random Forest—show very similar performance levels, but Linear Regression slightly outperforms Random Forest across all three metrics (RMSE, MAE, and R²). This makes sense because daily minimum temperature tends to follow a smooth, continuous trend rather than abrupt nonlinear changes. Linear Regression effectively captures this structure, while Random Forest attempts to model more complex relationships that may not exist, leading to mild overfitting and slightly poorer generalization. Computationally, Linear Regression is also far more efficient, training almost instantly compared to Random Forest, which must build hundreds of decision trees. Given its simplicity, interpretability, and superior performance, Linear Regression is the preferred model for this dataset. Random Forest remains useful when additional engineered features.

In [None]:
# This dataset involves weather measurements rather than human subjects, several ethical considerations still apply when building predictive models. First, transparency is important, models used for environmental forecasting should clearly communicate their limitations, especially since inaccurate predictions may influence decisions in agriculture, energy planning, or public safety. Over-reliance on machine learning outputs without understanding uncertainty could lead to misinformed actions. Second, reproducibility is essential data sources, preprocessing steps, and modeling choices should be documented so results can be verified and trusted.

In [21]:
#Bibliography

#Fisher, R. A. Daily Minimum Temperatures Dataset. UCI Machine Learning Repository.
#https://archive.ics.uci.edu/dataset/373/daily+minimum+temperatures

#Scikit-learn Developers. Scikit-learn Documentation.
#https://scikit-learn.org/stable/documentation.html

#Hyndman, R. J., & Athanasopoulos, G. Forecasting: Principles and Practice.
#https://otexts.com/fpp3/