In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib       
matplotlib.use('Qt5Agg')
sns.set_style("whitegrid")
import calendar

# DATA PREPROCESSING

### MSTA

In [2]:
MSTA= pd.read_excel('Data_36544086.xlsx', sheet_name='MSTA',
                                header=0, 
                                usecols=['Time', 'Anomaly (deg C)'],
                                parse_dates=True).squeeze()

In [3]:
MSTA['Time'] = pd.to_datetime(MSTA['Time'])
MSTA = MSTA.set_index('Time')

In [4]:
MSTA.describe()

Unnamed: 0,Anomaly (deg C)
count,2100.0
mean,-0.065022
std,0.406237
min,-1.044895
25%,-0.345928
50%,-0.153942
75%,0.122397
max,1.352173


In [5]:
MSTA.isna().sum()

Anomaly (deg C)    0
dtype: int64

In [6]:
# Extract data from the year 1950 onwards
MSTA_1950_onwards = MSTA[MSTA.index >= '1950-01-01']
MSTA_1950_onwards

Unnamed: 0_level_0,Anomaly (deg C)
Time,Unnamed: 1_level_1
1950-01-01,-0.300044
1950-02-01,-0.370036
1950-03-01,-0.216438
1950-04-01,-0.243616
1950-05-01,-0.137298
...,...
2024-08-01,1.239584
2024-09-01,1.144937
2024-10-01,1.199982
2024-11-01,1.225049


### CH4

In [7]:
CH4 = pd.read_excel('Data_36544086.xlsx', sheet_name='CH4',
                header=0, 
                usecols=['Year', 'Month', 'NOAA CH4 (ppb)'], 
                parse_dates=True).squeeze()

In [8]:
# Convert 'Year' and 'Month' to strings and concatenate them
CH4['Time'] = CH4['Year'].astype(str) + '-' + CH4['Month'].astype(str).str.zfill(2)

In [9]:
CH4 = CH4.drop(columns=['Year', 'Month'])
CH4['Time'] = pd.to_datetime(CH4['Time'])
CH4 = CH4.set_index('Time')


In [10]:
CH4.describe()
CH4.isna().sum()

NOAA CH4 (ppb)    0
dtype: int64

In [11]:
CH4

Unnamed: 0_level_0,NOAA CH4 (ppb)
Time,Unnamed: 1_level_1
1983-07-01,1625.95
1983-08-01,1628.05
1983-09-01,1638.44
1983-10-01,1644.80
1983-11-01,1642.59
...,...
2024-05-01,1926.36
2024-06-01,1921.77
2024-07-01,1921.03
2024-08-01,1926.83


### GMAF

In [12]:
GMAF = pd.read_excel('Data_36544086.xlsx', sheet_name='GMAF',
                    header=None,
                    skiprows=227,
                    usecols=[0,2],
                    names=['Time', 'Passenger Count'], 
                    parse_dates=True).squeeze()

In [13]:
# Reset the index to access the 'Time' column
#GMAF = GMAF.reset_index()

# Convert the 'Time' column to datetime format
GMAF['Time'] = pd.to_datetime(GMAF['Time'], format='%Y %b')

# Format the dates to 'YYYY-MM'
GMAF['Time'] = GMAF['Time'].dt.strftime('%Y-%m')

GMAF['Time'] = pd.to_datetime(GMAF['Time'])

# Set 'Time' column back as the index
GMAF = GMAF.set_index('Time')

In [14]:
GMAF['Passenger Count'] = GMAF['Passenger Count'].astype(int)

In [15]:
GMAF.describe()

Unnamed: 0,Passenger Count
count,528.0
mean,4259.448864
std,2304.916442
min,150.0
25%,2475.5
50%,4033.0
75%,5793.25
max,11628.0


In [16]:
GMAF.isna().sum()

Passenger Count    0
dtype: int64

### ET12

In [17]:
ET12 = pd.read_excel('Data_36544086.xlsx', sheet_name='ET12',
                                    header=None,
                                    skiprows= 6,
                                    usecols=[0,1],
                                    names=['Time', 'Unadjusted Total'], parse_dates=True).squeeze()

In [18]:
ET12 = ET12.set_index('Time')

In [19]:
ET12['Unadjusted Total'] = ET12['Unadjusted Total'].round(2)

In [20]:
# Reset the index to access the 'Time' column
ET12 = ET12.reset_index()

# Clean the 'Time' column to remove unwanted characters or spaces
ET12['Time'] = ET12['Time'].str.replace(r'\[.*\]', '', regex=True).str.strip()

# Convert the 'Time' column to datetime format
ET12['Time'] = pd.to_datetime(ET12['Time'], format='%B %Y')

# Format the dates to 'YYYY-MM'
ET12['Time'] = ET12['Time'].dt.strftime('%Y-%m')

# Set 'Time' column back as the index
ET12['Time'] = pd.to_datetime(ET12['Time'])
ET12 = ET12.set_index('Time')

In [21]:
ET12.describe()

Unnamed: 0,Unadjusted Total
count,360.0
mean,17.369639
std,3.374263
min,10.02
25%,14.89
50%,16.9
75%,19.735
max,25.02


In [22]:
ET12.isna().sum()

Unadjusted Total    0
dtype: int64

### Calendar Adjustment

In [23]:
variables = [MSTA_1950_onwards, CH4, GMAF, ET12]

In [24]:
#Calendar adjustment
# Create empty column for days in the month
def calendar_adjustment(df):
    df["Days"] = np.nan
    # Fill empty columns with the days per calendar month
    for date in df.index:
        df.loc[date, "Days"] = calendar.monthrange(date.year, date.month)[1]
    # Perform calendar adjustment
    df["Adjusted Data"] = (df.iloc[:, 0] * 365.25 / (12 * df["Days"])).round(2)


#for i in variables:
for i in [MSTA_1950_onwards, GMAF, ET12]:
    calendar_adjustment(i)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Days"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Adjusted Data"] = (df.iloc[:, 0] * 365.25 / (12 * df["Days"])).round(2)


In [25]:
# CH4 = CH4.drop(columns=['Days', 'NOAA CH4 (ppb)'])
MSTA_1950_onwards = MSTA_1950_onwards.drop(columns=['Days', 'Anomaly (deg C)'])
GMAF = GMAF.drop(columns=['Days', 'Passenger Count'])
ET12 = ET12.drop(columns=['Days', 'Unadjusted Total'])

In [26]:
def rename_columns(df, df_name):
    df = df.rename(columns={'Adjusted Data': df_name})
    return df

In [27]:
# CH4 = rename_columns(CH4, 'CH4 (adjusted)')
MSTA_1950_onwards = rename_columns(MSTA_1950_onwards, 'Anomaly (adjusted deg C)')#
GMAF = rename_columns(GMAF, 'Passenger Count (adjusted)')
ET12 = rename_columns(ET12, 'Total ET12 (adjusted)')

In [28]:
CH4

Unnamed: 0_level_0,NOAA CH4 (ppb)
Time,Unnamed: 1_level_1
1983-07-01,1625.95
1983-08-01,1628.05
1983-09-01,1638.44
1983-10-01,1644.80
1983-11-01,1642.59
...,...
2024-05-01,1926.36
2024-06-01,1921.77
2024-07-01,1921.03
2024-08-01,1926.83


In [29]:
# Add frequency to the index
def frequency_add(i):
    i.index = pd.DatetimeIndex(i.index.values, freq=i.index.inferred_freq)

for i in variables:
    frequency_add(i)

In [30]:
# Set the name attribute for each DataFrame
variables[0].name = 'MSTA_1950_onwards'
variables[1].name = 'CH4'
variables[2].name = 'GMAF'
variables[3].name = 'ET12'

# Decomposition

In [32]:
# Seasonal Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

In [33]:
#Additive Decomposition
def additive_decompose(variable):
        result = seasonal_decompose(variable.iloc[:, 0], model='additive')
        result.plot()
        plt.show()
#Multiplicative Decomposition
def mul_decompose(variable):
        result = seasonal_decompose(variable.iloc[:, 0], model='multiplicative')
        result.plot()
        plt.show()

for i in variables:
        additive_decompose(i)

In [34]:
for i in [CH4, GMAF, ET12]:
    mul_decompose(i)