### Statistics Question 1

#### Use a web scraping method to download the data for the latest 10-year period.

In [1]:
import requests
import bs4
import pandas as pd

def fetch_fred_data(url, timeout=30):
    """Scrape the observations table from a FRED data page into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the table body, with columns named after
        the header cells. ``DATE`` is parsed to datetime; every other
        column keeps the stripped cell text exactly as scraped (strings).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no ``data-table-observations`` table, or the table
        lacks a ``<thead>``/``<tbody>``.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    page.raise_for_status()

    soup = bs4.BeautifulSoup(page.text, "lxml")
    table = soup.find('table', id='data-table-observations')
    if table is None:
        raise ValueError(
            f"No table with id 'data-table-observations' found at {url}"
        )

    head = table.find('thead')
    body = table.find('tbody')
    if head is None or body is None:
        raise ValueError(
            f"Observations table at {url} is missing its <thead> or <tbody>"
        )

    headers = [th.text.strip() for th in head.find_all('th')]
    # Row labels may be rendered as <th> inside the body, so collect both tags.
    data = [[cell.text.strip() for cell in row.find_all(['th', 'td'])]
            for row in body.find_all('tr')]

    df = pd.DataFrame(data, columns=headers)
    df['DATE'] = pd.to_datetime(df['DATE'])
    return df

# FRED pages to scrape, keyed by a short series label
urls = dict(
    federal_funds="https://fred.stlouisfed.org/data/FEDFUNDS",
    tb_1yr="https://fred.stlouisfed.org/data/TB1YR",
    gs_10yr="https://fred.stlouisfed.org/data/GS10",
)

# Download each page in the order the labels are listed above
df1, df2, df3 = map(fetch_fred_data, urls.values())

In [9]:
def filter_data(df, name, start="2015-01-01", end="2025-01-01"):
    """Print and return the rows of ``df`` whose ``DATE`` lies in [start, end].

    Both bounds are inclusive. ``name`` is only used as the heading printed
    above the filtered rows.
    """
    # between() is inclusive on both ends by default
    in_window = df['DATE'].between(start, end)
    filtered_df = df.loc[in_window]

    print(f"{name}\n")
    print(filtered_df, "\n")
    return filtered_df
# Restrict each scraped series to the default date window (printing it as a side effect)
Federal_funds, One_year_treasury_bill, Ten_year_treasury_constant_maturity = (
    filter_data(scraped, label)
    for scraped, label in (
        (df1, "Federal_funds"),
        (df2, "One_year_treasury_bill"),
        (df3, "Ten_year_treasury_constant_maturity"),
    )
)

#### a) Select appropriate descriptive statistics to explain the behaviour of the federal_funds, 1_year_treasury_bill and 10_year_treasury_constant_maturity series over the past 10 years.

##### Central Tendency, Dispersion, Distribution Shape

In [11]:
from scipy.stats import skew, kurtosis

# Function to calculate statistics
def stats_summary(series):
    """Compute descriptive statistics for one series of observations.

    Values are coerced with ``pd.to_numeric(errors='coerce')``, so entries
    that cannot be parsed become NaN and are ignored by every statistic.

    Returns
    -------
    dict
        Keys are row labels for the summary table. The three keys starting
        with the diamond marker are section headings whose value is an empty
        string. ``Std Dev`` and ``Variance`` use pandas' defaults, while
        ``Skewness`` and ``Kurtosis`` use SciPy's defaults (``kurtosis``
        defaults to the Fisher definition, i.e. a normal distribution
        scores 0). ``Mode`` is the first mode pandas reports, or ``None``
        when there is no numeric data; skewness and kurtosis are NaN in
        that case.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    observed = numeric.dropna()

    # Compute the mode table once rather than once for the test and once for the value.
    modes = numeric.mode()
    mode_value = modes.iloc[0] if not modes.empty else None

    # Dropping NaNs up front lets SciPy run on a plain float array, so the
    # results are ordinary floats regardless of how nan_policy='omit' is
    # implemented in the installed SciPy version.
    if observed.empty:
        skewness = kurt = float('nan')
    else:
        sample = observed.to_numpy(dtype=float)
        skewness = float(skew(sample))
        kurt = float(kurtosis(sample))

    return {
      '🔶 Central Tendency': '', 'Mean': numeric.mean(), 'Median': numeric.median(),
      'Mode': mode_value, '🔶 Dispersion': '', 'Std Dev': numeric.std(),
      'Variance': numeric.var(), 'Range': numeric.max() - numeric.min(),
      'IQR': numeric.quantile(0.75) - numeric.quantile(0.25),
      '🔶 Distribution Shape': '', 'Skewness': skewness, 'Kurtosis': kurt
    }
# Series to summarise, keyed by the column heading used in the table
datasets = dict(zip(
    ('Federal Funds', '1Y Treasury Bill', '10Y Treasury'),
    (
        Federal_funds['VALUE'],
        One_year_treasury_bill['VALUE'],
        Ten_year_treasury_constant_maturity['VALUE'],
    ),
))

# One statistics dict per series
stats = dict(
    (label, stats_summary(values)) for label, values in datasets.items()
)

# Build the table (one column per series) and pin the row order explicitly
df_final = pd.DataFrame(stats).reindex([
    '🔶 Central Tendency', 'Mean', 'Median', 'Mode',
    '🔶 Dispersion', 'Std Dev', 'Variance', 'Range', 'IQR',
    '🔶 Distribution Shape', 'Skewness', 'Kurtosis'
])

# Show the table as the cell output
df_final


#### b) Select appropriate plots to visualize the data, and explain the behaviour of the federal_funds, 1_year_treasury_bill and 10_year_treasury_constant_maturity series over that period.

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_data(df, title):
    """Show a histogram, a date scatter plot and a boxplot of ``VALUE`` side by side.

    ``df`` must provide ``DATE`` and ``VALUE`` columns. ``VALUE`` is coerced
    to numeric on a copy (the caller's frame is left untouched) and rows
    that fail the conversion are dropped before plotting. ``title`` prefixes
    each panel title.
    """
    # assign() works on a copy, so the input frame is never modified
    numeric_values = pd.to_numeric(df['VALUE'], errors='coerce')
    plot_frame = df.assign(VALUE=numeric_values).dropna(subset=['VALUE'])

    # One row of three panels
    _, (ax_hist, ax_scatter, ax_box) = plt.subplots(1, 3, figsize=(14, 3))

    # Panel 1: distribution of values
    ax_hist.hist(plot_frame['VALUE'], bins=30, color='skyblue', edgecolor='black')
    ax_hist.set_title(f"{title} - Histogram")
    ax_hist.set_xlabel("Value")
    ax_hist.set_ylabel("Frequency")

    # Panel 2: values against their observation date
    ax_scatter.scatter(plot_frame['DATE'], plot_frame['VALUE'], color='orange', alpha=0.5)
    ax_scatter.set_title(f"{title} - Scatter Plot")
    ax_scatter.set_xlabel("Date")
    ax_scatter.set_ylabel("Value")

    # Panel 3: horizontal boxplot of the same values
    sns.boxplot(x=plot_frame['VALUE'], color='lightgreen', ax=ax_box)
    ax_box.set_title(f"{title} - Boxplot")
    ax_box.set_xlabel("Value")

    plt.tight_layout()
    plt.show()

# One three-panel figure per series
plot_data(df=Federal_funds, title="Federal Funds")
plot_data(df=One_year_treasury_bill, title="1Y Treasury Bill")
plot_data(df=Ten_year_treasury_constant_maturity, title="10Y Treasury")

#### c) What barriers did you find in the dataset (list a minimum of three), and how do they influence descriptive statistics and data visualization?

In [12]:
# Per-column count of null cells in each filtered frame, printed one frame after another
print(
    *(
        frame.isnull().sum()
        for frame in (
            Federal_funds,
            One_year_treasury_bill,
            Ten_year_treasury_constant_maturity,
        )
    ),
    sep="\n",
)

In [13]:
# Number of fully duplicated rows in each filtered frame, printed one per line
print(
    *(
        frame.duplicated().sum()
        for frame in (
            Federal_funds,
            One_year_treasury_bill,
            Ten_year_treasury_constant_maturity,
        )
    ),
    sep="\n",
)

In [14]:
from scipy.stats import zscore

# Function to detect outliers using Z-score
def find_outliers_zscore(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value has |z-score| > 3.

    The column is coerced to float first (unparseable entries become NaN).
    NaNs are excluded from the mean and standard deviation but kept in place
    in the score array, so the boolean mask always has one entry per row of
    ``df``; rows with a missing value are never reported as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    column_name : str
        Column holding the values to score.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` flagged as outliers (possibly empty).
    """
    values = pd.to_numeric(df[column_name], errors='coerce').to_numpy(dtype=float)
    # nan_policy='omit' ignores NaNs when computing mean/std and propagates
    # them to the output, which keeps the scores aligned with df's rows.
    scores = pd.Series(zscore(values, nan_policy='omit'))
    # NaN scores compare as False, so missing rows drop out of the mask.
    is_outlier = scores.abs().gt(3).to_numpy()
    return df[is_outlier]

# Report coerced-missing values and Z-score outliers for each of the three series
for df, name in zip([Federal_funds, One_year_treasury_bill, Ten_year_treasury_constant_maturity],
                    ['Federal Funds', 'One Year Treasury Bill', 'Ten Year Treasury']):

    # assign() returns a new frame with VALUE replaced outright, so the column
    # takes the numeric dtype produced by to_numeric. Writing through
    # df.loc[:, 'VALUE'] instead sets values in place and can leave the
    # column as object dtype on recent pandas versions.
    df = df.assign(VALUE=pd.to_numeric(df['VALUE'], errors='coerce'))
    print(f"{name} - Missing Values in 'VALUE': {df['VALUE'].isna().sum()}")

    # Drop NaNs before detecting outliers
    df_clean = df.dropna(subset=['VALUE'])
    outliers = find_outliers_zscore(df_clean, 'VALUE')
    print(f"{name} Outliers (Z-score): {len(outliers)}")

In [15]:
# Count DATE entries that fail datetime parsing (they come back as NaT);
# the frames themselves are left unchanged.
invalid_dates_federal, invalid_dates_one_year, invalid_dates_ten_year = (
    pd.to_datetime(frame['DATE'], errors='coerce').isna().sum()
    for frame in (
        Federal_funds,
        One_year_treasury_bill,
        Ten_year_treasury_constant_maturity,
    )
)

# Report the number of unparseable dates per series
print(f"Invalid dates in Federal Funds: {invalid_dates_federal}")
print(f"Invalid dates in One Year Treasury Bill: {invalid_dates_one_year}")
print(f"Invalid dates in Ten Year Treasury Constant Maturity: {invalid_dates_ten_year}")