In [None]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# --- 1. Date/Time Data Types & Creation ---

print("--- Date/Time Data Types & Creation ---")

# a) Timestamp: represents a single point in time
ts = pd.Timestamp('2024-05-04 18:30:00')
print(f"Pandas Timestamp: {ts}")
print(f"Timestamp Year: {ts.year}, Month: {ts.month}, Day: {ts.day}, Hour: {ts.hour}")

# Timestamps can be created from various string formats
# (an ambiguous 'x/y/yyyy' string is read month-first, so this is May 4th).
ts_alt = pd.Timestamp('5/4/2024')
print(f"\nTimestamp from different format: {ts_alt}")

# b) DatetimeIndex: an index composed of Timestamps
# Usually created with pd.to_datetime or pd.date_range.

# pd.to_datetime: converts its argument (list, Series, scalar) to datetime objects
date_strs = ['2024-01-01', '2024-01-05', '2024-01-10', '2024-01-15']
dt_index = pd.to_datetime(date_strs)
print(f"\nDatetimeIndex from list of strings:\n{dt_index}")
print(f"Type: {type(dt_index)}")

# Mixed input formats can be parsed too, but an explicit format is faster and safer.
# The example is left disabled because format='mixed' needs a recent pandas
# (older versions relied on infer_datetime_format=True instead).
date_strs_mixed = ['01/05/2024', '01-06-2024', '20240107']
# dt_index_mixed = pd.to_datetime(date_strs_mixed, format='mixed', dayfirst=False)
# print(f"\nDatetimeIndex from mixed formats:\n{dt_index_mixed}")

# pd.date_range: generates a DatetimeIndex with a fixed frequency
# Arguments: start, end, periods, freq
# freq: frequency string (e.g., 'D' day, 'B' business day, 'h' hour, 'min' minute,
# 's' second, 'ME' month end, 'MS' month start, 'W' week, 'QE' quarter end)
# NOTE: the older spellings 'H', 'T', 'S', 'M' and 'Q' are deprecated and emit a
# FutureWarning on current pandas, so the lower-case / '-E' forms are used here.

# Daily frequency for 10 periods starting from '2024-03-01'
date_rng_daily = pd.date_range(start='2024-03-01', periods=10, freq='D')
print(f"\nDate range (daily):\n{date_rng_daily}")

# Business day frequency between two dates (weekends are skipped)
date_rng_biz = pd.date_range(start='2024-04-01', end='2024-04-15', freq='B')
print(f"\nDate range (business days):\n{date_rng_biz}")

# Hourly frequency for 5 periods ('h' replaces the deprecated 'H' alias)
date_rng_hourly = pd.date_range(start='2024-05-04 10:00', periods=5, freq='h')
print(f"\nDate range (hourly):\n{date_rng_hourly}")
print("-" * 30)


# --- 2. Time Series Data (Series/DataFrame with DatetimeIndex) ---

print("--- Time Series Data ---")

# Seed NumPy's global RNG so the random values below are reproducible.
np.random.seed(42)

# A Series becomes a time series once it is indexed by a DatetimeIndex:
# draw one standard-normal value per entry of the daily range.
ts_data = np.random.randn(date_rng_daily.size)
time_series = pd.Series(data=ts_data, index=date_rng_daily)
print("Sample Time Series:\n", time_series)

# The same applies to a DataFrame: one row per business day, three random columns.
value_columns = ['Value1', 'Value2', 'Value3']
df_ts = pd.DataFrame(
    np.random.randn(date_rng_biz.size, len(value_columns)),
    index=date_rng_biz,
    columns=value_columns,
)
print("\nSample Time Series DataFrame:\n", df_ts.head())
print("-" * 30)


# --- 3. Time Series Indexing & Slicing ---
# Label-based selection goes through .loc, which makes it explicit that the
# strings are matched against the DatetimeIndex labels.

print("--- Time Series Indexing & Slicing ---")
print("Using Time Series:\n", time_series)

# a) Selecting a specific date (a full-resolution label returns a single value)
print(f"\nValue at '2024-03-05': {time_series.loc['2024-03-05']}")

# b) Partial string indexing (convenient; works because the index is a DatetimeIndex)
# Select all data for a specific year
print("\nData for year 2024:\n", time_series.loc['2024'])

# Select all data for a specific month
print("\nData for March 2024:\n", time_series.loc['2024-03'])

# c) Slicing with dates/timestamps (both endpoints are inclusive)
print("\nData from '2024-03-03' to '2024-03-07':\n", time_series.loc['2024-03-03':'2024-03-07'])

# d) Works on DataFrames too (the slice selects rows)
print("\nDataFrame slice '2024-04-03' to '2024-04-09':\n", df_ts.loc['2024-04-03':'2024-04-09'])

# e) Using .truncate() - drops everything outside the given bounds
# before=...: rows BEFORE that date are removed (the date itself and later rows are kept)
print("\nTruncate before '2024-03-06':\n", time_series.truncate(before='2024-03-06'))
# after=...: rows AFTER that date are removed (the date itself and earlier rows are kept)
print("\nTruncate after '2024-03-06':\n", time_series.truncate(after='2024-03-06'))
print("-" * 30)


# --- 4. Time Zone Handling ---

print("--- Time Zone Handling ---")
# Create a naive DatetimeIndex (no time zone info).
# 'h' is the current hourly alias; the older 'H' spelling is deprecated.
date_rng_naive = pd.date_range('2024-05-04 12:00:00', periods=5, freq='h')
ts_naive = pd.Series(range(5), index=date_rng_naive)
print("Naive Time Series (no timezone):\n", ts_naive)
print(f"Index timezone: {ts_naive.index.tz}") # Output: None

# a) Localizing naive timestamps (.tz_localize)
# Attaches a time zone (e.g., 'Europe/Berlin', 'UTC', 'America/New_York')
# without shifting the wall-clock values.
ts_berlin = ts_naive.tz_localize('Europe/Berlin')
print("\nTime Series localized to 'Europe/Berlin':\n", ts_berlin)
print(f"Index timezone: {ts_berlin.index.tz}")

# b) Converting between time zones (.tz_convert)
# Requires the index to be timezone-aware first; the instants stay the same,
# only their wall-clock representation changes.
ts_utc = ts_berlin.tz_convert('UTC')
print("\nTime Series converted to 'UTC':\n", ts_utc)
print(f"Index timezone: {ts_utc.index.tz}")

ts_ny = ts_utc.tz_convert('America/New_York')
print("\nTime Series converted to 'America/New_York':\n", ts_ny)
print(f"Index timezone: {ts_ny.index.tz}")

# Operations between series in different time zones align on the underlying
# instants and return a UTC-indexed result.
print("\nAdding Berlin and New York series (result in UTC):\n", ts_berlin + ts_ny)
print("-" * 30)


# --- 5. Resampling (`.resample()`) ---
# Changing the frequency of time series data (upsampling or downsampling).
# Similar to groupby: split by time frequency, apply aggregation/interpolation, combine.

print("--- Resampling ---")
# Use a longer daily series for better examples
date_rng_long = pd.date_range('2024-01-01', periods=35, freq='D')
ts_long = pd.Series(np.random.randint(50, 100, size=35), index=date_rng_long)
print("Longer daily Time Series (first 10):\n", ts_long.head(10))

# a) Downsampling (e.g., daily to weekly) - Requires aggregation
# Resample to weekly frequency ('W' = Sunday end of week), calculate mean
weekly_mean = ts_long.resample('W').mean()
print("\nWeekly mean (downsampled from daily):\n", weekly_mean)

weekly_sum = ts_long.resample('W').sum()
print("\nWeekly sum:\n", weekly_sum)

# OHLC (Open, High, Low, Close) resampling for financial data
weekly_ohlc = ts_long.resample('W').ohlc()
print("\nWeekly OHLC:\n", weekly_ohlc)

# Monthly resampling ('M' month end, 'MS' month start)
monthly_max = ts_long.resample('M').max()
print("\nMonthly max:\n", monthly_max)

# b) Upsampling (e.g., daily to hourly) - Requires filling method
# Resample daily series to 6-hourly frequency ('6H')
hourly_resampled = ts_long.resample('6H') # Creates Resampler object
print("\nUpsampled to 6-hourly (before filling):\n", hourly_resampled) # Shows object

# Fill with forward fill
hourly_ffill = hourly_resampled.ffill()
print("\nUpsampled to 6-hourly (forward fill):\n", hourly_ffill.head(10))

# Fill with backward fill
hourly_bfill = hourly_resampled.bfill()
print("\nUpsampled to 6-hourly (backward fill):\n", hourly_bfill.head(10))

# Interpolate missing values (e.g., linear)
# hourly_interpolated = hourly_resampled.interpolate() # Needs SciPy? Let's use ffill/bfill
# print("\nUpsampled to 6-hourly (interpolated):\n", hourly_interpolated.head(10))

# Can use .asfreq() for simple upsampling without filling (leaves NaNs)
hourly_asfreq = ts_long.resample('6H').asfreq()
print("\nUpsampled to 6-hourly (.asfreq(), leaves NaNs):\n", hourly_asfreq.head(10))
print("-" * 30)


# --- 6. Shifting/Lagging (`.shift()`) ---
# .shift() slides the values along a fixed index: the labels stay where they
# are and the slots that have no source value are filled with NaN.

print("--- Shifting/Lagging ---")
print("Original Time Series (first 5):\n", ts_long.head())

# Positive periods push values later in time (a lag): the first 2 slots become NaN.
shifted_forward = ts_long.shift(periods=2)
print("\nShifted forward by 2 periods:\n", shifted_forward.head())

# Negative periods pull values earlier in time (a lead): the last slot becomes NaN.
shifted_backward = ts_long.shift(periods=-1)
print("\nShifted backward by 1 period:\n", shifted_backward.head())

# Percentage change versus the previous observation: (current / previous - 1) * 100.
previous_values = ts_long.shift(1)
pct_change = ts_long.div(previous_values).sub(1).mul(100)
print("\nPercentage change from previous period:\n", pct_change.head())
print("-" * 30)


# --- 7. Rolling Windows (`.rolling()`) ---
# Statistics computed over a sliding window holding a fixed number of observations.

print("--- Rolling Windows ---")

# Mean over a 3-observation window; the first 2 results are NaN (window not full yet).
three_day_window = ts_long.rolling(window=3)
rolling_mean_3d = three_day_window.mean()
print("3-day rolling mean:\n", rolling_mean_3d.head(10))

# Sum over a 5-observation window; the first 4 results are NaN.
five_day_window = ts_long.rolling(window=5)
rolling_sum_5d = five_day_window.sum()
print("\n5-day rolling sum:\n", rolling_sum_5d.head(10))

# min_periods lowers the number of observations needed for a result,
# so only the very first value stays NaN here.
rolling_mean_3d_min2 = ts_long.rolling(window=3, min_periods=2).mean()
print("\n3-day rolling mean (min_periods=2):\n", rolling_mean_3d_min2.head(10))

# center=True labels each window by its middle observation,
# leaving NaN at the first and last positions of the series.
rolling_mean_3d_center = ts_long.rolling(window=3, center=True).mean()
print("\n3-day centered rolling mean:\n", rolling_mean_3d_center.head(10))

# Standard deviation over a 4-observation window.
four_day_window = ts_long.rolling(window=4)
rolling_std_4d = four_day_window.std()
print("\n4-day rolling standard deviation:\n", rolling_std_4d.head(10))

# Custom statistic: the range (max - min) of each window, which is what np.ptp computes.
# raw=True hands each window to the function as a plain NumPy array, which can be faster.
rolling_max_minus_min = three_day_window.apply(np.ptp, raw=True)
print("\n3-day rolling (max - min):\n", rolling_max_minus_min.head(10))
print("-" * 30)


# --- 8. Expanding Windows (`.expanding()`) ---
# Statistics over a window that grows with the series: each result uses every
# observation from the start up to and including the current one.

print("--- Expanding Windows ---")

# One expanding view of the series, reused for the default-parameter statistics.
growing_window = ts_long.expanding()

# Running total (gives the same numbers as ts_long.cumsum()).
expanding_sum = growing_window.sum()
print("Expanding sum (cumulative sum):\n", expanding_sum.head(10))

# Running average of everything seen so far.
expanding_mean = growing_window.mean()
print("\nExpanding mean:\n", expanding_mean.head(10))

# With min_periods=3 no result is produced until 3 observations are available,
# so the first 2 values are NaN.
expanding_mean_min3 = ts_long.expanding(min_periods=3).mean()
print("\nExpanding mean (min_periods=3):\n", expanding_mean_min3.head(10))
print("-" * 30)

--- Date/Time Data Types & Creation ---
Pandas Timestamp: 2024-05-04 18:30:00
Timestamp Year: 2024, Month: 5, Day: 4, Hour: 18

Timestamp from different format: 2024-05-04 00:00:00

DatetimeIndex from list of strings:
DatetimeIndex(['2024-01-01', '2024-01-05', '2024-01-10', '2024-01-15'], dtype='datetime64[ns]', freq=None)
Type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>

Date range (daily):
DatetimeIndex(['2024-03-01', '2024-03-02', '2024-03-03', '2024-03-04',
               '2024-03-05', '2024-03-06', '2024-03-07', '2024-03-08',
               '2024-03-09', '2024-03-10'],
              dtype='datetime64[ns]', freq='D')

Date range (business days):
DatetimeIndex(['2024-04-01', '2024-04-02', '2024-04-03', '2024-04-04',
               '2024-04-05', '2024-04-08', '2024-04-09', '2024-04-10',
               '2024-04-11', '2024-04-12', '2024-04-15'],
              dtype='datetime64[ns]', freq='B')

Date range (hourly):
DatetimeIndex(['2024-05-04 10:00:00', '2024-05-04 11:00:00',


  date_rng_hourly = pd.date_range(start='2024-05-04 10:00', periods=5, freq='H')
  date_rng_naive = pd.date_range('2024-05-04 12:00:00', periods=5, freq='H')
