In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.api as sm
import seaborn as sns
import os
from statsmodels.nonparametric.smoothers_lowess import lowess

# Load & Process Data

In [None]:
# Load the precipitation table and reshape it to long format: every column
# other than "year" becomes a (month_str, prec) row.
# NOTE(review): presumably the CSV has one row per year and one column per
# month abbreviation (Jan..Dec) -- confirm against the file.
prec_data = pd.read_csv(os.path.join('data', 'prec-Mainland-raw.csv')).melt(
    id_vars=["year"], var_name="month_str", value_name="prec"
)

month_map = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
             'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

# Column names that are not keys of month_map map to NaN here.
prec_data["month"] = prec_data["month_str"].map(month_map)

# Build datetime column (use first day of month as convention)
prec_data["date"] = pd.to_datetime(prec_data[["year", "month"]].assign(day=1))

# Only the last expression of a cell is rendered, so preview once, at the end.
prec_data.head()

In [None]:
# Reverse lookup: month number -> three-letter abbreviation.
inverse_month_map = dict(zip(month_map.values(), month_map.keys()))
inverse_month_map

In [None]:
# Taken from https://www.ipma.pt/en/oclima/series.longas/?loc=Mainland&type=raw
temp_data = pd.read_csv(os.path.join('data', 'temp-Mainland-raw.csv'))

# Parse the 'MM/YYYY' strings once and derive month/year from the parsed
# timestamps. Pulling them out of the raw text with unanchored digit regexes is
# fragile: "first two consecutive digits" is only the month when the month is
# zero-padded (e.g. '1/1938' would yield 19).
temp_data['date'] = pd.to_datetime(temp_data['date'], format='%m/%Y')
temp_data['month'] = temp_data['date'].dt.month
temp_data['year'] = temp_data['date'].dt.year
temp_data.head()

In [None]:
# Inner join on the month timestamp: only dates present in both the
# temperature and the precipitation tables are kept.
full_data = temp_data.merge(prec_data[["date", "prec"]], on="date", how="inner")

# Month abbreviation recovered from the numeric month.
full_data['month_str'] = full_data['month'].map(inverse_month_map)

# Spread between tmax and tmin for each row.
full_data['tdiff'] = full_data['tmax'] - full_data['tmin']

full_data.head()

In [None]:
# Bounds of the analysis window, as ISO date strings.
start = "1938-01-01"
end = "2020-01-01"

In [None]:
# Series.between includes both endpoints by default, so a row dated exactly
# `end` is kept as well.
selected_data = full_data.loc[full_data['date'].between(start, end)]

# Plot Time Series

In [None]:
def plot_time_series(df, date_col, var_col, ma_window=None):
    """Plot one column of ``df`` against a date column, optionally with a rolling mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the series. Rows are plotted in their existing order.
    date_col : str
        Name of the column used for the x-axis.
        Assumed to be datetime-like, since date locators/formatters are
        applied to the axis -- TODO confirm for new callers.
    var_col : str
        Name of the column plotted on the y-axis; also used as the plot title.
    ma_window : int, optional
        Window length, in rows, of a trailing rolling mean drawn over the raw
        series. Skipped when ``None`` or not greater than 1.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # 1. Set up the plot
    fig, ax = plt.subplots(figsize=(12, 6))

    # 2. Raw series
    ax.plot(df[date_col], df[var_col], marker='o', linestyle='-', label=var_col)

    # 3. Optional moving average; the window counts rows, not calendar time
    if ma_window is not None and ma_window > 1:
        ma_series = df[var_col].rolling(window=ma_window).mean()
        ax.plot(df[date_col], ma_series, color='red', linewidth=2,
                label=f'{ma_window}-period MA')
        # Without a legend the labels above are never shown.
        ax.legend()

    # 4. Format the date axis
    # Major ticks every 5 years, labelled with the year only (e.g. "2020")
    ax.xaxis.set_major_locator(mdates.YearLocator(base=5))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    # Minor ticks every 6 months
    ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=6))

    # 5. Add labels and a grid (fainter lines for the minor ticks)
    ax.set_title(var_col)
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')
    ax.grid(True, which='major', alpha=0.6)
    ax.grid(True, which='minor', alpha=0.2)

    plt.tight_layout()
    plt.show()


In [None]:
# tmed over the selected window, with a 12-row rolling mean.
plot_time_series(df=selected_data, date_col='date', var_col='tmed', ma_window=12)

In [None]:
# tmax over the selected window, with a 12-row rolling mean.
plot_time_series(df=selected_data, date_col='date', var_col='tmax', ma_window=12)

In [None]:
# tmin over the selected window, with a 12-row rolling mean.
plot_time_series(df=selected_data, date_col='date', var_col='tmin', ma_window=12)

In [None]:
# Per-row tmax - tmin spread over the selected window, with a 12-row rolling mean.
plot_time_series(df=selected_data, date_col='date', var_col='tdiff', ma_window=12)

In [None]:
# Annual temperature range: for each year, the highest tmax minus the lowest
# tmin, stamped with the earliest date seen in that year.
# Computed over full_data, i.e. not restricted to the start/end window that
# selected_data uses.
# A single groupby replaces the per-year loop that rescanned full_data three
# times per year; sort=False keeps years in order of first appearance, which
# matches iterating over full_data['year'].unique().
tdiff_year = (
    full_data
    .groupby('year', sort=False)
    .agg(date=('date', 'min'),
         tmax_year=('tmax', 'max'),
         tmin_year=('tmin', 'min'))
    .assign(tdiff=lambda yearly: yearly['tmax_year'] - yearly['tmin_year'])
    .reset_index(drop=True)
    .loc[:, ['date', 'tdiff']]
)
tdiff_year.head()

In [None]:
# tdiff_year has one row per year, so the 12-row window here spans 12 years.
plot_time_series(df=tdiff_year, date_col='date', var_col='tdiff', ma_window=12)

In [None]:
# Precipitation over the selected window, with a 12-row rolling mean.
plot_time_series(df=selected_data, date_col='date', var_col='prec', ma_window=12)

Initial Characteristics:

* Seasonality is a given
* Trends, at least so far, are hard to see from the time series plots alone

In [None]:
# Temperature-only view of the selected window, indexed by date.
# NOTE(review): this rebinds `temp_data`, which earlier held the raw temperature
# table that `full_data` is merged from; re-running that merge cell after this
# one would pick up this reduced frame instead. Consider a distinct name.
temp_data = selected_data[
    ['date', 'year', 'month_str', 'tmin', 'tmed', 'tmax']
].set_index('date')

In [None]:
# Seasonal subseries plot of tmed, drawn on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(16, 8))
sm.graphics.tsa.month_plot(temp_data['tmed'], ylabel='tmed', ax=ax)
ax.set(title="Seasonal Subseries Plot: Avg Temp", xlabel="Month")
plt.show()

In [None]:
# tmed against year, one line per month_str value.
sns.lineplot(data=temp_data, x='year', y='tmed', hue='month_str')
plt.gca().set(title='Monthly Plot: Tmed', xlabel='Year', ylabel='Tmed (C)')
# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()

In [None]:
# tmin against year, one line per month_str value.
sns.lineplot(data=temp_data, x='year', y='tmin', hue='month_str')
plt.gca().set(title='Monthly Plot: Tmin', xlabel='Year', ylabel='Tmin (C)')
# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()

In [None]:
# tmax against year, one line per month_str value.
sns.lineplot(data=temp_data, x='year', y='tmax', hue='month_str')
plt.gca().set(title='Monthly Plot: Tmax', xlabel='Year', ylabel='Tmax (C)')
# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()

In [None]:
def lag_plot_grid(ts, lags=12, title="Lag Plots"):
    """Draw a grid of lag scatter plots, one panel per lag from 1 to ``lags``.

    Each panel plots X(t) against X(t-k), annotates the Pearson correlation of
    the two, and overlays a LOWESS smooth of X(t) on X(t-k).

    Parameters
    ----------
    ts : pandas.Series
        Series to analyse; lagged copies are built with ``ts.shift``.
    lags : int, default 12
        Number of lags to plot. Panels are laid out four per row and the
        number of rows grows with ``lags``; unused panels are hidden.
    title : str, default "Lag Plots"
        Figure-level title.

    Raises
    ------
    ValueError
        If ``lags`` is smaller than 1.
    """
    if lags < 1:
        raise ValueError("lags must be a positive integer")

    n_cols = 4
    n_rows = -(-lags // n_cols)  # ceiling division
    # 10/3 inches per row reproduces the 15x10 figure for the default 12 lags.
    # squeeze=False keeps `axes` 2-D even when there is a single row.
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(15, 10 * n_rows / 3), squeeze=False)
    fig.suptitle(title, fontsize=16)
    panels = axes.ravel()  # row-major: panel i sits at row i // 4, column i % 4

    for lag, ax in enumerate(panels[:lags], start=1):
        # Create lagged series
        lagged = ts.shift(lag)

        # Keep only positions where both the value and its lag are present
        mask = ts.notna() & lagged.notna()
        x = ts[mask]
        y = lagged[mask]

        # Scatter plot: lagged value on the x-axis, current value on the y-axis
        ax.scatter(y, x, alpha=0.9, s=10, color="steelblue", edgecolor="black")
        ax.set_title(f'Lag {lag}')
        ax.set_ylabel('X(t)')
        ax.set_xlabel(f'X(t-{lag})')
        ax.grid(True, alpha=0.3)

        # Compute correlation
        corr = np.corrcoef(y, x)[0, 1]
        ax.text(
            0.05, 0.95,
            f"r = {corr:.3f}",
            transform=ax.transAxes,
            fontsize=12,
            verticalalignment="top",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.5)
        )

        # Fit LOWESS of X(t) on X(t-k); frac controls smoothing
        smoothed = lowess(x, y, frac=0.3)
        ax.plot(smoothed[:, 0], smoothed[:, 1], color="red", linewidth=1.5)

    # Hide panels left over when lags is not a multiple of the column count
    for ax in panels[lags:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Quick look at the first five tmax values (indexed by date).
temp_data['tmax'].head(5)

In [None]:
# Lag plots of tmed for the default 12 lags.
lag_plot_grid(ts=temp_data['tmed'])