In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Get GPT Sentiment Index

In [None]:
# Absolute local paths of the four inputs used throughout this notebook.
gpt_35_output_csv = "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/src/debt_crisis/data/GPT_Output_Data/sentiment_data_portugal_output_v003.csv"
gpt_4_output_csv = "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/src/debt_crisis/data/GPT_Output_Data/sentiment_data_portugal_output_v005.csv"
transcripts_pkl = "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/df_transcripts_raw.pkl"
training_data_pkl = "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/gpt_sentiment_data/df_gpt_sentiment_training_dataset_cleaned.pkl"

# GPT output tables (CSV); the names follow the notebook's 3.5 / 4 naming.
data_gpt_35 = pd.read_csv(gpt_35_output_csv)
data_gpt_4 = pd.read_csv(gpt_4_output_csv)

# Pickled frames: the transcripts and the cleaned training snippets that the
# GPT outputs are merged with in the following cells.
clean_transcripts = pd.read_pickle(transcripts_pkl)
training_data = pd.read_pickle(training_data_pkl)

In [None]:
# Build the GPT-3.5 dataset.
# 1) Attach the training-set columns to every GPT output row via Snippet_ID.
full_data_35 = data_gpt_35.merge(
    training_data,
    how="left",
    on="Snippet_ID",
    validate="one_to_one",
)

# 2) Sanity check: the text sent to GPT ("Snippet") should be identical to the
#    training-set text ("Excerpt") on every row; prints a single boolean.
are_columns_equal = full_data_35["Snippet"].eq(full_data_35["Excerpt"])
all_equal = are_columns_equal.all()
print(all_equal)

# 3) Attach the transcript-level columns via Transcript_ID.
full_data_35 = full_data_35.merge(
    clean_transcripts,
    how="left",
    on="Transcript_ID",
    validate="many_to_one",
)

# 4) Keep only rows that have a prediction and report how many were removed.
before_drop = len(full_data_35)
full_data_35 = full_data_35.dropna(subset=["Prediction"])
after_drop = len(full_data_35)
observations_dropped = before_drop - after_drop
print(f"Number of observations dropped: {observations_dropped}")

In [None]:
# Build the GPT-4 dataset.
# 1) Attach the training-set columns to every GPT output row via Snippet_ID.
full_data_4 = data_gpt_4.merge(
    training_data,
    how="left",
    on="Snippet_ID",
    validate="one_to_one",
)

# 2) Sanity check: the text sent to GPT ("Snippet") should be identical to the
#    training-set text ("Excerpt") on every row; prints a single boolean.
are_columns_equal = full_data_4["Snippet"].eq(full_data_4["Excerpt"])
all_equal = are_columns_equal.all()
print(all_equal)

# 3) Attach the transcript-level columns via Transcript_ID.
full_data_4 = full_data_4.merge(
    clean_transcripts,
    how="left",
    on="Transcript_ID",
    validate="many_to_one",
)

# 4) Keep only rows that have a prediction and report how many were removed.
before_drop = len(full_data_4)
full_data_4 = full_data_4.dropna(subset=["Prediction"])
after_drop = len(full_data_4)
observations_dropped = before_drop - after_drop
print(f"Number of observations dropped: {observations_dropped}")

In [None]:
def calculate_gpt_sentiment_index(
    preprocessed_data,
    countries_under_study,
    day_window=90,
    start_date="1/1/2003",
    end_date="1/1/2023",
):
    """Compute a daily trailing-window mean of the GPT predictions.

    For every calendar day between ``start_date`` and ``end_date`` (both
    inclusive) the index is the mean of all non-missing ``Prediction`` values
    whose ``Date`` lies in ``[day - day_window days, day]`` (both ends
    inclusive). Days whose window contains no observation get ``NaN``.

    The input frame is not modified.

    Note:
        The observations are not filtered by country, so every entry of
        ``countries_under_study`` receives the same series. Pass data that
        belongs to a single country.

    Args:
        preprocessed_data (pd.DataFrame): Frame with a ``Date`` column
            (anything ``pd.to_datetime`` accepts) and a numeric
            ``Prediction`` column. Rows with a missing date or a missing
            prediction are ignored.
        countries_under_study (list): Country names; one output column is
            created per entry.
        day_window (int): Length of the trailing window in days.
        start_date: First day of the index (passed to ``pd.date_range``).
        end_date: Last day of the index (passed to ``pd.date_range``).

    Returns:
        pd.DataFrame: One row per day with the columns
            Date (datetime): Day the index value refers to.
            Sentiment_GPT_{country} (float): Trailing-window mean prediction,
                one such column for every country under study.
    """
    # Work on a private two-column copy so the caller's frame stays untouched.
    observations = preprocessed_data[["Date", "Prediction"]].copy()
    observations["Date"] = pd.to_datetime(observations["Date"])
    observations = observations.dropna(subset=["Date", "Prediction"]).sort_values(
        "Date",
    )

    observation_dates = observations["Date"].to_numpy()
    # cumulative[i] is the sum of the first i predictions in date order, so the
    # sum over any contiguous date window is a difference of two entries.
    cumulative = np.concatenate(
        ([0.0], observations["Prediction"].to_numpy(dtype=float).cumsum()),
    )

    date_range = pd.date_range(start=start_date, end=end_date)
    window_starts = date_range - pd.Timedelta(days=day_window)

    # Positions (in the sorted observations) delimiting each day's window:
    # side="left" keeps observations dated exactly at the window start,
    # side="right" keeps observations dated exactly at the window end.
    first = np.searchsorted(observation_dates, window_starts.to_numpy(), side="left")
    last = np.searchsorted(observation_dates, date_range.to_numpy(), side="right")
    counts = last - first
    sums = cumulative[last] - cumulative[first]

    # Empty windows stay NaN instead of triggering a division by zero.
    sentiment_index = np.full(len(date_range), np.nan)
    has_observations = counts > 0
    sentiment_index[has_observations] = (
        sums[has_observations] / counts[has_observations]
    )

    result_df = pd.DataFrame({"Date": date_range})
    for country in countries_under_study:
        result_df[f"Sentiment_GPT_{country}"] = sentiment_index

    return result_df

In [None]:
# Daily sentiment index from the GPT-3.5 predictions (default day_window).
gpt_sentiment_index_35 = calculate_gpt_sentiment_index(full_data_35, ["portugal"])

In [None]:
# Daily sentiment index from the GPT-4 predictions (default day_window).
gpt_sentiment_index_4 = calculate_gpt_sentiment_index(full_data_4, ["portugal"])

In [None]:
# Overlay the two GPT-based indices on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))

# (frame, legend label, marker) for each series, drawn in this order.
series_to_plot = (
    (gpt_sentiment_index_35, "Sentiment GPT Portugal", "o"),
    (gpt_sentiment_index_4, "Sentiment GPT Index 4", "x"),
)
for index_frame, legend_label, marker_style in series_to_plot:
    ax.plot(
        index_frame["Date"],
        index_frame["Sentiment_GPT_portugal"],
        label=legend_label,
        marker=marker_style,
    )

ax.set_title("Sentiment GPT Portugal Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Sentiment GPT Score")
ax.legend()
plt.show()

In [None]:
# Get Correlation
# Step 1: Build Date-indexed copies of both indices. The global frames are
# left untouched, so nothing has to be restored afterwards and the cell can
# be re-run (or fail halfway) without leaving them in a re-indexed state.
index_35_by_date = gpt_sentiment_index_35.assign(
    Date=pd.to_datetime(gpt_sentiment_index_35["Date"]),
).set_index("Date")
index_4_by_date = gpt_sentiment_index_4.assign(
    Date=pd.to_datetime(gpt_sentiment_index_4["Date"]),
).set_index("Date")

# Step 2: Align both series on 'Date' (inner join on the index); the suffixes
# distinguish the two identically named columns.
aligned_data = index_35_by_date[["Sentiment_GPT_portugal"]].merge(
    index_4_by_date[["Sentiment_GPT_portugal"]],
    left_index=True,
    right_index=True,
    suffixes=("_35", "_4"),
)

# Step 3: Correlation between the two aligned columns (off-diagonal entry of
# the 2x2 matrix). Kept as the cell's last expression so that it is displayed.
correlation = aligned_data.corr().iloc[0, 1]
correlation

# Get normal sentiment index

In [None]:
# Pickled McDonald sentiment index; later cells read its "Date" and
# "Sentiment_Index_McDonald_portugal" columns.
mcdonald_index_pkl = "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/mcdonald_sentiment_index_negative_and_positive_20_.pkl"
mcdonald_sentiment_index = pd.read_pickle(mcdonald_index_pkl)

In [None]:
# Plot the McDonald sentiment index for Portugal over time.
plt.figure(figsize=(10, 6))  # Set the figure size
plt.plot(
    mcdonald_sentiment_index["Date"],
    mcdonald_sentiment_index["Sentiment_Index_McDonald_portugal"],
    label="Sentiment McDonald Portugal",
    marker="o",
)
# Title and y-label name the series that is actually drawn (the McDonald
# index), not the GPT index.
plt.title("Sentiment McDonald Portugal Over Time")  # Title of the plot
plt.xlabel("Date")  # X-axis label
plt.ylabel("Sentiment McDonald Score")  # Y-axis label
plt.legend()
plt.show()

# Get bond yield data

In [None]:
# Pickled quarterly macro data; later cells read its "Country", "Date" and
# "10y_Maturity_Bond_Yield" columns.
quarterly_macro_pkl = "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/financial_data/Quarterly Macroeconomic Variables_cleaned.pkl"
yield_data = pd.read_pickle(quarterly_macro_pkl)

In [None]:
# Step 1: Filter for Portugal. Take an explicit copy so that the column
# assignment in step 2 writes to an independent frame rather than to a slice
# of yield_data (which would raise pandas' SettingWithCopyWarning).
portugal_data = yield_data[yield_data["Country"] == "portugal"].copy()

# Step 2: Convert 'Date' column to datetime format
portugal_data["Date"] = pd.to_datetime(portugal_data["Date"])


# Step 3: Plotting
plt.figure(figsize=(10, 6))  # Set the figure size
plt.plot(
    portugal_data["Date"],
    portugal_data["10y_Maturity_Bond_Yield"],
    label="10y Maturity Bond Yield for Portugal",
    marker="o",
)
plt.title("10y Maturity Bond Yield for Portugal Over Time")  # Title of the plot
plt.xlabel("Date")  # X-axis label
plt.ylabel("10y Maturity Bond Yield (%)")  # Y-axis label
plt.legend()  # Show legend
plt.grid(True)  # Show grid
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to not cut off labels
plt.show()  # Display the plot

# Combine all data in one plot

In [None]:
# Keep only those daily index rows whose date also occurs in the quarterly
# Portuguese yield data, so all three series share the same dates.
quarter_dates = portugal_data["Date"]

gpt_on_quarter_date = gpt_sentiment_index_4["Date"].isin(quarter_dates)
gpt_quarter_data = gpt_sentiment_index_4[gpt_on_quarter_date]

mcdonald_on_quarter_date = mcdonald_sentiment_index["Date"].isin(quarter_dates)
mcdonald_quarter_data = mcdonald_sentiment_index[mcdonald_on_quarter_date]

In [None]:
# One figure, three y-axes sharing the date axis: GPT index (left), bond yield
# (right) and McDonald index (second right axis, shifted outward).
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: quarterly GPT sentiment (line colour left at the default).
ax1.plot(
    gpt_quarter_data["Date"],
    gpt_quarter_data["Sentiment_GPT_portugal"],
    label="Sentiment GPT Portugal",
    alpha=0.9,
)
ax1.set_xlabel("Date")

# First twin axis: 10y bond yield.
ax2 = ax1.twinx()
ax2.plot(
    portugal_data["Date"],
    portugal_data["10y_Maturity_Bond_Yield"],
    label="Bond Yield",
    color="red",
)

# Second twin axis: McDonald sentiment. Its right spine is pushed 60 points
# outward so that it does not sit on top of the bond-yield axis.
ax3 = ax1.twinx()
ax3.spines["right"].set_position(("outward", 60))
ax3.plot(
    mcdonald_quarter_data["Date"],
    mcdonald_quarter_data["Sentiment_Index_McDonald_portugal"],
    label="Sentiment McDonald Portugal",
    color="green",
    alpha=0.9,
)

# Colour each y-axis label and its tick labels.
axis_styles = (
    (ax1, "Sentiment from GPT", "blue"),
    (ax2, "Bond Yield", "red"),
    (ax3, "Sentiment from Loughran and McDonald", "green"),
)
for axis, axis_label, axis_color in axis_styles:
    axis.set_ylabel(axis_label, color=axis_color)
    axis.tick_params(axis="y", labelcolor=axis_color)

# One legend per axis, each in its own corner.
legend_corners = ((ax1, "upper left"), (ax2, "lower left"), (ax3, "upper right"))
for axis, corner in legend_corners:
    axis.legend(loc=corner)
ax1.grid(True)

plt.title("Sentiment and Bond Yield Over Time for Portugal")
plt.show()

# Get Correlation Matrix

In [None]:
# Step 1: Restrict both daily indices to the dates present in the quarterly
# Portuguese yield data.
quarter_dates = portugal_data["Date"]
gpt_on_quarter_date = gpt_sentiment_index_4["Date"].isin(quarter_dates)
gpt_quarter_data = gpt_sentiment_index_4[gpt_on_quarter_date]
mcdonald_on_quarter_date = mcdonald_sentiment_index["Date"].isin(quarter_dates)
mcdonald_quarter_data = mcdonald_sentiment_index[mcdonald_on_quarter_date]

# Step 2: Inner-join the three frames on Date and give the columns of
# interest short names.
short_names = {
    "10y_Maturity_Bond_Yield": "Bond_Yield",
    "Sentiment_GPT_portugal": "GPT_Index",
    "Sentiment_Index_McDonald_portugal": "McDonald_Index",
}
merged_data = (
    portugal_data.merge(gpt_quarter_data, on="Date", how="inner")
    .merge(mcdonald_quarter_data, on="Date", how="inner")
    .rename(columns=short_names)
)

# Step 3: Pairwise correlations between the three series.
columns_of_interest = ["Bond_Yield", "GPT_Index", "McDonald_Index"]
correlation_matrix = merged_data[columns_of_interest].corr()

print(correlation_matrix)

In [None]:
# Draw the correlation matrix as an annotated heatmap on explicit axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix between Bond Yield, GPT Index, and McDonald Index")
plt.show()