In [None]:
import random
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import statsmodels.formula.api as smf
import xmltodict

# Improve Preprocessing of Transcripts

In [None]:
data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/df_transcripts_clean_step_2_negative_20_.pkl",
)

In [None]:
data.head()

In [None]:
data = pd.read_excel(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/gpt_sentiment_data/df_random_transcript_snippets.xlsx",
)

In [None]:
data.head()

In [None]:
# Strip the list-literal residue (brackets, quotes, commas) and hyphens that the
# snippets carry after being stored as stringified Python lists.
# A single vectorised replace is used instead of chained `str.replace` calls wrapped
# in a no-op `"".join(...)`; missing snippets stay missing instead of raising.
data["Snippet"] = data["Snippet"].str.replace(r"[\[\]',\-]", "", regex=True)

In [None]:
data["Snippet"]

In [None]:
"".join(data.loc[:, "Snippet"][0])

In [None]:
data = data.drop_duplicates(subset="Snippet")

In [None]:
data.to_excel(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/gpt_sentiment_data/df_random_transcript_snippets_NR.xlsx",
)

# Make Training Data for GPT

In [None]:
full_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/df_transcripts_raw.pkl",
)
full_data["Date"] = pd.to_datetime(full_data["Date"])
full_data["Transcript"] = full_data["Transcript"].str.lower()

In [None]:
country_names = pd.read_excel(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/src/debt_crisis/data/country_names/country_names.xlsx",
)

In [None]:
def create_set_with_all_country_words(country_names_file):
    """Collect every non-missing cell of the country-names table into one set.

    Args:
        country_names_file (pd.DataFrame): Table whose cells hold country words.

    Returns:
        set: The distinct, non-missing cell values across all rows and columns.

    """
    return {
        entry
        for entry in country_names_file.values.flatten()
        if pd.notna(entry)  # skip empty cells
    }

In [None]:
country_words_set = create_set_with_all_country_words(country_names)

In [None]:
row = full_data.sample(1)
transcript_id = row["Transcript_ID"].values[0]
occuring_words = country_words_set.intersection(
    set(row["Transcript"].str.split().values[0]),
)
occuring_words

In [None]:
def get_a_text_snippet_if_there_is_country_mentioned(
    full_data,
    country_words_set,
    context=50,
):
    """Sample one transcript and cut a word window around a random country word.

    Args:
        full_data (pd.DataFrame): Needs the columns "Transcript_ID" and "Transcript".
        country_words_set (set): Words that count as a country mention.
        context (int): Number of words kept before the keyword; the window ends
            ``context`` words after the keyword's position (exclusive).

    Returns:
        pd.DataFrame | None: One row with "Keyword", "Transcript_ID" and "Snippet"
        (the window as a list of words), or None if the sampled transcript contains
        none of the country words.

    """
    sampled_row = full_data.sample(1)
    transcript_id = sampled_row["Transcript_ID"].values[0]
    mentioned_words = country_words_set.intersection(
        set(sampled_row["Transcript"].str.split().values[0]),
    )

    # Nothing to extract when the transcript mentions no country word.
    if not mentioned_words:
        return None

    keyword = random.choice(list(mentioned_words))

    # Locate the first occurrence of the keyword among the whitespace-split tokens.
    tokens = sampled_row["Transcript"].values[0].split()
    position = tokens.index(keyword)

    window_start = max(0, position - context)
    window_end = min(len(tokens), position + context)

    result = pd.DataFrame(
        {
            "Keyword": [keyword],
            "Transcript_ID": [transcript_id],
            "Snippet": [tokens[window_start:window_end]],
        },
    )

    print("Occurence")

    return result

In [None]:
# The country vocabulary does not change between draws, so build it once.
country_words_set = create_set_with_all_country_words(country_names)

# Draw 100 random transcripts and keep one snippet for each draw that mentions
# a country word (draws without a mention return None and are skipped).
snippets = []
for _ in range(100):
    single_snippet = get_a_text_snippet_if_there_is_country_mentioned(
        full_data,
        country_words_set,
    )

    if single_snippet is not None:
        snippets.append(single_snippet)

# Concatenate once instead of growing the frame inside the loop;
# pd.concat raises on an empty list, hence the fallback to an empty frame.
final_output = pd.concat(snippets) if snippets else pd.DataFrame()

In [None]:
final_output.head()

In [None]:
# Check which country words occur in a randomly sampled transcript
row = full_data.sample(1)
set(row["Transcript"].str.split().values[0])
occurence = country_words_set.intersection(set(row["Transcript"].str.split().values[0]))
occurence

In [None]:
# Randomly pick a word
word = random.choice(list(occurence))

# Get the 'Transcript'
transcript = row["Transcript"].values[0]

# Split the 'Transcript' into words
words = transcript.split()

# Find the index of the word
index = words.index(word)


# Get the 50 preceding words, the keyword itself, and the 49 following words
start = max(0, index - 50)
end = min(len(words), index + 50)
snippet = words[start:end]

print(" ".join(snippet))

In [None]:
# Keep only the transcripts that mention at least one country word.
# The previous expression referenced an undefined `country_words_list` and ended in
# `... or True`, so it could never drop a row.
# Substring matching (`word in transcript`) is kept as written; note that it also
# matches inside longer words.
mentions_country = full_data["Transcript"].apply(
    lambda transcript: any(word in transcript for word in country_words_set),
)
filtered_data = full_data[mentions_country]

In [None]:
row

Plan: 

1. Check which word occurs
2. Then get context

In [None]:
def obtain_country_names(country, country_names_file):
    """Return all cell values of the row whose "name" matches ``country``.

    The comparison on the "name" column is case-insensitive. Only the first
    matching row is used.

    Args:
        country (str): Country to look up.
        country_names_file (pd.DataFrame): Table with a "name" column; the other
            columns of a row are returned alongside the name.

    Returns:
        set: The values of the first matching row, or an empty set if no row matches.

    """
    matching_rows = country_names_file.loc[
        country_names_file["name"].str.lower() == country.lower()
    ]
    if matching_rows.empty:
        return set()
    return set(matching_rows.iloc[0].values.tolist())

In [None]:
def get_country_appearance_index_from_transcript_text(transcript_words, country_names):
    """Find the positions in a tokenised transcript that hold a country name.

    Args:
        transcript_words (list): The transcript, already split into words.
        country_names (set): Country names and alternate names to look for.

    Returns:
        list: Ascending positions in ``transcript_words`` whose word is contained
        in ``country_names``; empty if there is no match.

    """
    return [
        position
        for position, token in enumerate(transcript_words)
        if token in country_names
    ]

In [None]:
def get_random_country_transcript_snippet(
    data,
    countries_under_study,
    country_names_file,
    window_size=40,
):
    """Sample one transcript and return a snippet around a mentioned country.

    Every country in ``countries_under_study`` is checked against the sampled
    transcript. Countries that are not mentioned are skipped (previously the first
    unmentioned country aborted the whole search). Among the mentioned countries one
    is picked at random, then one of its occurrences is picked at random.

    Args:
        data (pd.DataFrame): Needs the columns "Transcript_ID" and "Transcript".
        countries_under_study (iterable of str): Countries to search for.
        country_names_file (pd.DataFrame): Lookup table passed on to
            ``obtain_country_names``.
        window_size (int): Number of words kept before the hit; the window ends
            ``window_size`` words after the hit's position (exclusive).

    Returns:
        pd.DataFrame | None: One row with "Country", "Transcript_ID" and "Snippet"
        (space-joined words), or None if no country under study is mentioned.

    """
    # Randomly select a row from the data
    row = data.sample(1)
    transcript_id = row["Transcript_ID"].values[0]
    transcript_words = row["Transcript"].values[0].split()

    # Map each mentioned country to the word positions where it appears.
    mentions = {}
    for country in countries_under_study:
        country_names = obtain_country_names(country, country_names_file)
        country_indices = get_country_appearance_index_from_transcript_text(
            transcript_words,
            country_names,
        )
        if country_indices:
            mentions[country] = country_indices

    # No country under study appears in this transcript.
    if not mentions:
        return None

    # Sort before choosing so the draw is reproducible under a fixed random seed
    # even when `countries_under_study` is an unordered set.
    country = random.choice(sorted(mentions))
    index = random.choice(mentions[country])

    start = max(0, index - window_size)
    end = min(len(transcript_words), index + window_size)
    snippet = " ".join(transcript_words[start:end])

    return pd.DataFrame(
        {
            "Country": [country],
            "Transcript_ID": [transcript_id],
            "Snippet": [snippet],
        },
    )

In [None]:
# Initialize an empty DataFrame
df = pd.DataFrame()

# Loop until df has 200 rows
while len(df) < 10:
    # Execute the function
    result = get_random_country_transcript_snippet(
        full_data,
        COUNTRIES_UNDER_STUDY,
        country_names_file=country_names,
        window_size=40,
    )

    # If the result is not empty, append it to df
    if result is not None:
        df = pd.concat([df, result])
        print(len(df))

# Make Function to Return Regression Values

In [None]:
event_study_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_full_model_data_negative_and_positive_20_.pkl",
)
from debt_crisis.config import EVENT_STUDY_COUNTRIES, EVENT_STUDY_TIME_PERIOD

In [None]:
def extract_column_names_from_regression_formula(formula):
    """List the variable names on the right-hand side of a regression formula.

    The formula must contain exactly one "~". The right-hand side is split on "+",
    ``Q('...')`` and ``C(...)`` wrappers are removed, and empty terms (for example
    after a trailing "+") are dropped.

    Args:
        formula (str): Formula string such as ``"y ~ Q('a b') + C(Country)"``.

    Returns:
        list: Sorted, de-duplicated variable names.

    """
    _, right_hand_side = formula.split("~")

    wrapper_pattern = r"Q\('([^']*)'\)|C\(([^)]*)\)"
    unique_names = set()
    for raw_term in re.split(r"\+", right_hand_side.strip()):
        # Keep whichever capture group matched, i.e. the name inside Q('') or C().
        name = re.sub(wrapper_pattern, r"\1\2", raw_term).strip()
        if name:
            unique_names.add(name)

    return sorted(unique_names)

In [None]:
# Base regression formulas for the event study (controls and fixed effects only).
# Each formula deliberately ends with a trailing "+":
# run_event_study_for_given_configuration appends the " + "-joined event dummies
# directly to the string, so without the trailing "+" the last control and the first
# dummy would be fused into one term.
# The first entry includes country and date effects, the second only country effects.
EVENT_STUDY_MODELS = [
    "Q('10y_Maturity_Bond_Yield') ~ Q('Public_Debt_as_%_of_GDP')+ GDP_in_Current_Prices_Growth + Moody_Rating_PD + "
    "VIX_Daily_Close_Quarterly_Mean + Q('10y_Maturity_Bond_Yield_US') + C(Country) + C(Date) +",
    "Q('10y_Maturity_Bond_Yield') ~ Q('Public_Debt_as_%_of_GDP')+ GDP_in_Current_Prices_Growth + Moody_Rating_PD + "
    "VIX_Daily_Close_Quarterly_Mean + Q('10y_Maturity_Bond_Yield_US') + C(Country) +",
]

In [None]:
def run_event_study_for_given_configuration(
    event_study_data,
    configuraton,
    event_study_countries,
    event_study_time_period,
    standard_errors="hac-panel",
):
    """Fit the event-study OLS regression for one base formula.

    Args:
        event_study_data (pd.DataFrame): Regression dataset; needs "Country", "Date",
            the variables named in the formula and the ``Dummy_<country>_<quarter>``
            columns.
        configuraton (str): Base formula. The dummy terms are appended to it
            verbatim, so it is expected to end with a "+".
        event_study_countries (iterable of str): Countries that get quarter dummies.
        event_study_time_period (sequence): First and last quarter of the dummy range.
        standard_errors (str): Passed to ``fit`` as ``cov_type``.

    Returns:
        The fitted statsmodels results object.

    """
    # Keep complete cases for the right-hand-side variables and exclude the US rows.
    required_columns = extract_column_names_from_regression_formula(configuraton)
    sample = event_study_data.dropna(subset=required_columns)
    sample = sample.loc[sample["Country"] != "usa", :]

    # One dummy term per country and quarter of the event window.
    dummy_terms = []
    for country in event_study_countries:
        quarters = pd.period_range(
            start=event_study_time_period[0],
            end=event_study_time_period[1],
            freq="Q",
        )
        dummy_terms.extend(f"Dummy_{country}_{quarter}" for quarter in quarters)
    formula = configuraton + " + ".join(dummy_terms)

    # The HAC panel standard errors require the rows ordered by group and time.
    sample = sample.sort_values(by=["Country", "Date"])

    ols_model = smf.ols(formula=formula, data=sample)
    return ols_model.fit(
        cov_type=standard_errors,
        cov_kwds={"groups": sample["Country"], "maxlags": 2},
    )

In [None]:
trained_model = run_event_study_for_given_configuration(
    event_study_data,
    EVENT_STUDY_MODELS[0],
    EVENT_STUDY_COUNTRIES,
    EVENT_STUDY_TIME_PERIOD,
)

In [None]:
print(trained_model.params.keys().to_list())

In [None]:
def extract_parameters_for_regression_table_from_model(model, configuration):
    """Build one regression-table column from a fitted model.

    Args:
        model: Fitted results object exposing ``params`` and ``pvalues`` (pandas
            Series indexed by term name), ``nobs`` and ``rsquared``.
        configuration (str): The base formula the model was fitted with; used to
            report whether country / time fixed effects are included.

    Returns:
        pd.Series: For each control variable the coefficient formatted to two
        decimals with significance stars (*** p<0.01, ** p<0.05, * p<0.1), or an
        empty string if the term is not in the model; followed by the fixed-effects
        flags, the number of observations and R-squared.

    """
    # Significance thresholds and their markers, strictest first
    significance_levels = [0.01, 0.05, 0.1]
    stars = ["***", "**", "*"]

    # Terms to report, spelled exactly as they appear in the formulas
    parameters = [
        "Q('Public_Debt_as_%_of_GDP')",
        "GDP_in_Current_Prices_Growth",
        "Moody_Rating_PD",
        "VIX_Daily_Close_Quarterly_Mean",
        "Q('10y_Maturity_Bond_Yield_US')",
        "Q('3_Month_US_Treasury_Yield_Quarterly_Mean')",
        "Q('NASDAQ_Value_Quarterly_Mean')",
        "Q('Current_Account_in_USD')",
        "Q('Eurostat_CPI_Annualised Growth_Rate')",
    ]

    coefficients_with_stars = {}
    for param in parameters:
        # Terms that are not part of this model get an empty table cell.
        if param not in model.params:
            coefficients_with_stars[param] = ""
            continue

        p_value = model.pvalues.get(param, 1)

        # First (strictest) level the p-value passes; no marker if none.
        marker = next(
            (star for level, star in zip(significance_levels, stars) if p_value < level),
            "",
        )

        # Format every coefficient the same way, significant or not.
        coefficients_with_stars[param] = f"{model.params[param]:.2f}{marker}"

    # The formulas write fixed effects as C(Country) / C(Date); quoted variants
    # such as C('Country') are accepted as well.
    has_country_fe = re.search(r"C\(\s*['\"]?Country['\"]?\s*\)", configuration)
    has_time_fe = re.search(r"C\(\s*['\"]?Date['\"]?\s*\)", configuration)

    coefficients_with_stars.update(
        {
            "Country Fixed Effects": "Yes" if has_country_fe else "No",
            "Time Fixed Effects": "Yes" if has_time_fe else "No",
            "Number of Observations": round(model.nobs, 0),
            "R-Squared": round(model.rsquared, 2),
        },
    )

    return pd.Series(coefficients_with_stars)

In [None]:
def get_parameters_for_regression_table_for_configuration(
    event_study_data,
    configuraton,
    event_study_countries,
    event_study_time_period,
    standard_errors="hac-panel",
):
    """Fit the event study for one formula and return its regression-table column.

    Thin wrapper: runs ``run_event_study_for_given_configuration`` with the given
    arguments and passes the fitted model, together with the formula, to
    ``extract_parameters_for_regression_table_from_model``.
    """
    return extract_parameters_for_regression_table_from_model(
        run_event_study_for_given_configuration(
            event_study_data=event_study_data,
            configuraton=configuraton,
            event_study_countries=event_study_countries,
            event_study_time_period=event_study_time_period,
            standard_errors=standard_errors,
        ),
        configuraton,
    )

In [None]:
get_parameters_for_regression_table_for_configuration(
    event_study_data,
    EVENT_STUDY_MODELS[0],
    EVENT_STUDY_COUNTRIES,
    EVENT_STUDY_TIME_PERIOD,
)

In [None]:
def generate_regresssion_table_for_list_of_configurations(
    event_study_data,
    EVENT_STUDY_MODELS,
    EVENT_STUDY_COUNTRIES,
    EVENT_STUDY_TIME_PERIOD,
):
    """Assemble a regression table with one column per model formula.

    Args:
        event_study_data (pd.DataFrame): Regression dataset.
        EVENT_STUDY_MODELS (list of str): Base formulas, one per table column.
        EVENT_STUDY_COUNTRIES (iterable of str): Countries that get event dummies.
        EVENT_STUDY_TIME_PERIOD (sequence): First and last quarter of the dummy range.

    Returns:
        pd.DataFrame: Columns are named "0", "1", ... after the position of the
        formula in ``EVENT_STUDY_MODELS``.

    """
    table = pd.DataFrame()

    for position, formula in enumerate(EVENT_STUDY_MODELS):
        table[str(position)] = get_parameters_for_regression_table_for_configuration(
            event_study_data=event_study_data,
            configuraton=formula,
            event_study_countries=EVENT_STUDY_COUNTRIES,
            event_study_time_period=EVENT_STUDY_TIME_PERIOD,
        )

    return table

In [None]:
results = generate_regresssion_table_for_list_of_configurations(
    event_study_data,
    EVENT_STUDY_MODELS,
    EVENT_STUDY_COUNTRIES,
    EVENT_STUDY_TIME_PERIOD,
)

In [None]:
results.head()

In [None]:
# Formula that `trained_model` was fitted with; the fixed-effects flags below read it.
# (`configuration` was previously undefined at notebook scope.)
configuration = EVENT_STUDY_MODELS[0]

# Coefficients of the control variables; "" when a term is not part of the model
public_debt = trained_model.params.get("Q('Public_Debt_as_%_of_GDP')", "")
real_gdp_growth = trained_model.params.get("GDP_in_Current_Prices_Growth", "")
moody_rating = trained_model.params.get("Moody_Rating_PD", "")
vix = trained_model.params.get("VIX_Daily_Close_Quarterly_Mean", "")
us_bond_yield = trained_model.params.get("Q('10y_Maturity_Bond_Yield_US')", "")
three_month_us_trasury = trained_model.params.get(
    "Q('3_Month_US_Treasury_Yield_Quarterly_Mean')",
    "",
)
nasdaq_value = trained_model.params.get("Q('NASDAQ_Value_Quarterly_Mean')", "")
current_account = trained_model.params.get("Q('Current_Account_in_USD')", "")
# Term name spelled as in the formulas (the old key had a stray quote and no Q() wrapper)
consumer_price_index = trained_model.params.get(
    "Q('Eurostat_CPI_Annualised Growth_Rate')",
    "",
)
number_obserations = trained_model.nobs
r_squared = trained_model.rsquared

# The formulas write fixed effects without quotes: C(Country) / C(Date)
country_fe = "Yes" if "C(Country)" in configuration else "No"

time_fe = "Yes" if "C(Date)" in configuration else "No"

In [None]:
final_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_regression_table_data.pkl",
)

In [None]:
final_data.head()

# Make Plot with Daily Sentiment and event study coefficient data

In [None]:
event_study_coefficients = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_coefficients_data_negative_and_positive_20_.pkl",
)

In [None]:
event_study_coefficients.head()

In [None]:
mcdonald_sentiment_daily = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/mcdonald_sentiment_index_cleaned_negative_and_positive_20_.pkl",
)

In [None]:
mcdonald_sentiment_daily.head()

# Make correlation table bond spread, sentiment index

In [None]:
event_study_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_full_model_data_negative_and_positive_20_.pkl",
)

In [None]:
print(event_study_data.columns.tolist())

In [None]:
# Group the data by 'Country' and calculate the correlation of 'Bond_Yield_Spread' and 'McDonald_Sentiment_Index'
correlations = event_study_data.groupby("Country").apply(
    lambda x: x[["Bond_Yield_Spread", "McDonald_Sentiment_Index"]].corr().iloc[0, 1],
)

# Convert the Series to a DataFrame
correlations = correlations.to_frame().reset_index()
# Rename the columns
correlations.columns = ["Country", "Correlation"]

# Drop NA's and round
correlations = correlations.dropna().round(2)

# Capitalise the country names
correlations["Country"] = correlations["Country"].str.title()

In [None]:
correlations

In [None]:
sentiments = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/df_transcripts_clean_step_2_negative_and_positive_20_.pkl",
)

In [None]:
sentiments.columns

In [None]:
sentiments_greece = sentiments[
    sentiments["Sentiment_Index_McDonald_greece"] <= -0.2
].sort_values("Sentiment_Index_McDonald_greece")[
    [
        "Date",
        "Sentiment_Index_McDonald_greece",
        "Preprocessed_Transcript_Step_1",
        "Company",
    ]
]

In [None]:
sentiments_greece.head()

In [None]:
sentiment_word_count = pd.read_csv(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/sentiment_data/sentiment_word_count_clean.csv",
)

In [None]:
sns.set_style("white")

# Filter the DataFrame
filtered_data = (
    sentiment_word_count[(sentiment_word_count["Positive_Indicator"] == 1)]
    .sort_values(by="Count", ascending=False)
    .head(20)
)

# Sort the DataFrame
sorted_data = filtered_data.sort_values(by="Count", ascending=False)

# Create the plot
fig = plt.figure(figsize=(8, 7))

plt.barh(sorted_data["Word"], sorted_data["Count"], color="#3c5488")
plt.xlabel("Total Number of Occurences")
plt.yticks(fontsize=8)  # Adjust font size here

# Remove the top and right spines from plot
sns.despine()

# Show the plot
plt.show()

In [None]:
dictionary_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/sentiment_dictionary_clean.pkl",
)

In [None]:
dictionary_data["Negative_Indicator"].sum()

In [None]:
print(dictionary_data.loc[dictionary_data["Negative_Indicator"] == 1, "Word"].tolist())

In [None]:
dictionary_data["Positive_Indicator"].sum()

In [None]:
print(dictionary_data.loc[dictionary_data["Positive_Indicator"] == 1, "Word"].tolist())

In [None]:
event_study_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_full_model_data_negative_and_positive_20_.pkl",
)

In [None]:
event_study_data.columns[1:25]

In [None]:
# Create Descriptive Statistics

# Group the data by 'Country' and calculate the mean of the specified columns
average_data = event_study_data.groupby("Country")[
    [
        "Public_Debt_as_%_of_GDP",
        "10y_Maturity_Bond_Yield",
        "GDP_in_Current_Prices_Growth",
    ]
].mean()

# Calculate the most frequent 'Rating_Moody_Last_Quarter_Day' for each country
average_data["Most Frequent Rating_Moody_Last_Quarter_Day"] = event_study_data.groupby(
    "Country",
)["Rating_Moody_Last_Quarter_Day"].agg(pd.Series.mode)

# Calculate the number of observations for each country
average_data["Number of Observations"] = event_study_data.groupby("Country").size()

average_data = average_data.round(2)

# Reset the index
average_data = average_data.reset_index()

average_data["Country"] = average_data["Country"].str.title()

# Insert empty columns for breaks
average_data.insert(2, "Break1", "")
average_data.insert(5, "Break2", "")

In [None]:
average_data.tail()

In [None]:
def convert_dataframe_content_to_latex_table_body(data):
    """Render the cells of a DataFrame as the body of a LaTeX table.

    Cells of a row are converted to strings and joined with " & ". Every row,
    including the last, is terminated by a space and a LaTeX row break (two
    backslashes). No newline is inserted between rows.

    Args:
        data (pd.DataFrame): Table content; column names and index are not emitted.

    Returns:
        str: The table body as a single line of text.

    """
    row_terminator = " \\\\"

    def _row_to_cells(row):
        # One table row: all cells as text, separated by the LaTeX column separator
        return " & ".join(row.astype(str))

    table_rows = data.apply(_row_to_cells, axis=1)
    return row_terminator.join(table_rows) + row_terminator

In [None]:
# `data_string` only exists inside convert_dataframe_content_to_latex_table_body,
# so call the function to display the LaTeX body.
# NOTE(review): assumes the descriptive-statistics table `average_data` built above
# is the intended input - confirm.
convert_dataframe_content_to_latex_table_body(average_data)

In [None]:
def _make_missing_values_heatmap(data, data_name, index=None):
    """Show a heatmap of the missing-value mask of a DataFrame.

    Args:
        data (pd.DataFrame): Dataset to inspect.
        data_name (str): Appended to the plot title.
        index (str | None): Optional column to use as the row index (and thereby
            as the y-axis labels) before plotting.

    """
    frame = data if index is None else data.set_index(index)

    plt.figure(figsize=(10, 6))
    sns.heatmap(frame.isnull(), cbar=False, cmap="viridis")
    plt.title("Missing Values in Dataset " + data_name)
    plt.show()

In [None]:
event_study_coefficients = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_coefficients_data.pkl",
)

In [None]:
event_study_coefficients.head()

In [None]:
event_study_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_dataset_negative_and_positive_20_.pkl",
)

In [None]:
chile_data = event_study_data.loc[event_study_data["Country"] == "chile", :]

In [None]:
# Missing-value pattern of the Chile subset (the title previously said "Germany Data")
_make_missing_values_heatmap(
    chile_data[
        [
            "date",
            "Country",
            "Date",
            "GDP_in_USD_Current_Prices",
            "REF_AREA",
            "Eurostat_CPI_Annualised Growth_Rate",
            "Public_Debt_as_%_of_GDP",
            "Real_Quarterly_GVA_in_Domestic_Currency",
            "Current_Account_in_USD",
            "Rating_Moody_Last_Quarter_Day",
        ]
    ],
    "Chile Data",
    index="Date",
)

In [None]:
# Display the 10-year bond yield column (the closing bracket was missing)
event_study_data["10y_Maturity_Bond_Yield"]

In [None]:
event_study_data.Country.unique()

In [None]:
def plot_sentiment_index_and_bond_yield_spread_for_country(
    first_step_regression_data,
    country,
    color_scheme=None,
):
    """Plot a country's bond yield spread and sentiment index on twin y-axes.

    Args:
        first_step_regression_data (pd.DataFrame): Needs the columns "Country",
            "Date", "Bond_Yield_Spread" and "McDonald_Sentiment_Index".
        country (str): Value of the "Country" column to plot; also used
            (capitalised) in the labels and the title.
        color_scheme (list | None): Colors; element 0 is used for the spread line
            and element 1 for the sentiment line. Defaults to a five-color palette.

    Returns:
        matplotlib.figure.Figure: The created figure.

    Note:
        The function sets the global matplotlib option ``text.usetex`` to True and
        the seaborn style to "white"; both stay in effect after the call.

    """
    # Fall back to the default palette when none is given
    if color_scheme is None:
        color_scheme = ["#3c5488", "#e64b35", "#4dbbd5", "#00a087", "#f39b7f"]
    # Filter the data for the given country and order it chronologically
    country_data = first_step_regression_data[
        first_step_regression_data["Country"] == country
    ]
    country_data = country_data.sort_values("Date")

    # Set the style of the plot
    sns.set_style("white")

    # Create the plot; the left axis carries the bond yield spread
    fig, ax1 = plt.subplots(figsize=(8, 5))

    ax1.plot(
        country_data["Date"],
        country_data["Bond_Yield_Spread"],
        marker="o",
        color=color_scheme[0],
        label=f"Bond Yield Spread {country.capitalize()} ",
    )
    ax1.set_ylabel("Bond Yield Spread in Basis Points", fontsize=14)

    # Second y-axis sharing the same x-axis for the sentiment index
    ax2 = ax1.twinx()
    ax2.plot(
        country_data["Date"],
        country_data["McDonald_Sentiment_Index"],
        marker="o",
        color=color_scheme[1],
        label=f"Sentiment Index {country.capitalize()} ",
    )
    ax2.set_ylabel("Sentiment Index", fontsize=14)
    ax2.invert_yaxis()  # Invert the right y-axis

    # TODO: add a horizontal line at y=0 (not implemented)

    # Set the title and x-label via pyplot, i.e. on the current axes
    # NOTE(review): presumably the current axes is ax2 after twinx() - confirm
    plt.title(
        f"Raw Sentiment Data {country.capitalize()} with Bond Yield Spread for {country.capitalize()} ",
        fontsize=16,
    )
    plt.xlabel("Date", fontsize=14)

    # Remove only the top spine; left, bottom and right stay visible
    sns.despine(left=False, bottom=False, right=False, top=True)

    # Create one combined legend for the lines of both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc="upper right")

    # Use LaTeX style for the font (global setting, set only after the plot calls)
    plt.rc("text", usetex=True)

    # TODO: align the zero of both y-axes (not implemented)

    return fig

In [None]:
fig = plot_sentiment_index_and_bond_yield_spread_for_country(
    event_study_data,
    "portugal",
)
fig.show()

In [None]:
portugal_filter = event_study_data[event_study_data["Country"] == "portugal"]

In [None]:
portugal_filter["McDonald_Sentiment_Index"].corr(portugal_filter["Bond_Yield_Spread"])

In [None]:
# Keep the rows whose variable name is an event dummy: Dummy_<country>_<quarter>
# NOTE(review): `event_study_data` was last loaded from the event-study *dataset*
# pickle; a "Variable" column suggests the coefficients frame was intended - confirm.
pattern = r"^Dummy_\w+_\w+$"
# .copy() so that the column assignments below modify an independent frame rather
# than a slice of `event_study_data` (avoids SettingWithCopyWarning).
coefficient_data = event_study_data.loc[
    event_study_data["Variable"].str.contains(pattern, regex=True),
    :,
].copy()

# The last two "_"-separated parts of the variable name are country and quarter
coefficient_data["Date"] = pd.to_datetime(
    coefficient_data["Variable"].str.split("_").str[-1],
)
coefficient_data["Country"] = coefficient_data["Variable"].str.split("_").str[-2]

# 95% confidence band: coefficient +/- 1.96 standard errors
coefficient_data["CI_95_lower"] = (
    coefficient_data["Coefficient"] - coefficient_data["Standard Errors"] * 1.96
)
coefficient_data["CI_95_upper"] = (
    coefficient_data["Coefficient"] + coefficient_data["Standard Errors"] * 1.96
)

In [None]:
coefficient_data.head()

In [None]:
nature_color_scheme = ["#3c5488", "#e64b35", "#4dbbd5", "#00a087", "#f39b7f"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style of the plot
sns.set_style("white")

# Filter the data for the given country
greece_data = coefficient_data[coefficient_data["Country"] == "greece"]
greece_data = greece_data.sort_values("Date")

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(
    greece_data["Date"],
    greece_data["Coefficient"],
    marker="o",
    color=nature_color_scheme[0],
)

# Add a horizontal line at y=0
plt.axhline(0, color="grey", linestyle=":")

# Plot the confidence interval
plt.fill_between(
    greece_data["Date"],
    greece_data["CI_95_lower"],
    greece_data["CI_95_upper"],
    color="b",
    alpha=0.1,
)

# Set the title and labels
plt.title("Coefficients for Greece Over Time with Confidence Interval", fontsize=16)
plt.xlabel("Date", fontsize=14)
plt.ylabel("Coefficient", fontsize=14)

# Remove the legend

# Keep only the y-axis and x-axis
sns.despine(left=False, bottom=False, right=True, top=True)

# Use LaTeX style for the font
plt.rc("text", usetex=True)

# Show the plot
plt.show()

In [None]:
EVENT_STUDY_COUNTRIES = [
    "netherlands",
    "latvia",
    "austria",
    "italy",
    "finland",
    "slovenia",
    "lithuania",
    "greece",
    "portugal",
    "spain",
    "germany",
    "belgium",
    "ireland",
    "france",
]

In [None]:
" + ".join(
    [
        f"Dummy_{country}_{quarter}"
        for country in EVENT_STUDY_COUNTRIES
        for quarter in pd.period_range(start="2009Q1", end="2011Q4", freq="Q")
    ],
)

In [None]:
# Regression formula: controls followed by one dummy per country and 2009 quarter.
# The " + " after the last control is required; without it the last control and the
# first dummy are fused into a single, invalid term.
formula = (
    "Bond_Yield_Spread ~ Q('Public_Debt_as_%_of_GDP')+ GDP_in_Current_Prices_Growth + "
    "GDP_in_Current_Prices_Growth_Lead + Current_Account_in_USD + "
    "VIX_Daily_Close_Quarterly_Mean + Q('Eurostat_CPI_Annualised Growth_Rate') + "
    "NASDAQ_Daily_Close_Quarterly_Mean + Q('3_Month_US_Treasury_Yield_Quarterly_Mean') + "
    + " + ".join(
        [
            f"Dummy_{country}_{quarter}"
            for country in EVENT_STUDY_COUNTRIES
            for quarter in pd.period_range(start="2009Q1", end="2009Q4", freq="Q")
        ],
    )
)

In [None]:
pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/event_study_approach/event_study_coefficients_data.pkl",
)

In [None]:
event_study_data.head()

In [None]:
def add_quarter_columns(df, country="Greece", start="2009Q1", end="2011Q4"):
    """Add one 0/1 indicator column per quarter for the given country.

    A row gets 1 in column "<year>Q<n>" when its "Date" falls inside that quarter
    and its "Country" equals ``country``. The country comparison ignores case, so
    the default also matches lower-case labels such as "greece".

    Args:
        df (pd.DataFrame): Needs a datetime column "Date" and a string column
            "Country". The frame is modified in place.
        country (str): Country the indicators refer to.
        start (str): First quarter of the range, e.g. "2009Q1".
        end (str): Last quarter of the range (inclusive).

    Returns:
        pd.DataFrame: The same ``df`` object with the indicator columns added.

    """
    # Rows with a missing country compare as False and therefore get 0.
    is_country = df["Country"].str.lower() == country.lower()

    for quarter in pd.period_range(start=start, end=end, freq="Q"):
        in_quarter = (df["Date"] >= quarter.start_time) & (df["Date"] <= quarter.end_time)
        df[str(quarter)] = (in_quarter & is_country).astype(int)

    return df

In [None]:
quarterly_data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/step_one_regression_dataset_output_quarterly.pkl",
)

In [None]:
quarterly_data.columns

In [None]:
dictionary = pd.read_csv(
    "/Users/nicolasroever/Documents/Promotion/Debt Crisis/debt_crisis/bld/data/sentiment_data/sentiment_word_count_clean.csv",
)

In [None]:
# Plot the most frequent positive words

# Keep positive words that occur more than 600 times
filtered_data = dictionary[
    (dictionary["Positive_Indicator"] == 1) & (dictionary["Count"] > 600)
]

# Sort the DataFrame by frequency
sorted_data = filtered_data.sort_values(by="Count", ascending=False)

# Horizontal bar chart: the counts run along the x-axis, the words along the y-axis
plt.figure(figsize=(10, 6))
plt.barh(sorted_data["Word"], sorted_data["Count"])
plt.xlabel("Count")
plt.ylabel("Word")
plt.title("Word Counts for Positive Words")
plt.yticks(fontsize=8)  # Smaller font so the word labels do not overlap
plt.show()

In [None]:
# Plot the most frequent negative words

# Keep negative words that occur more than 600 times
filtered_data_2 = dictionary[
    (dictionary["Negative_Indicator"] == 1) & (dictionary["Count"] > 600)
]

# Sort the DataFrame by frequency
sorted_data_2 = filtered_data_2.sort_values(by="Count", ascending=False)

# Horizontal bar chart: the counts run along the x-axis, the words along the y-axis
plt.figure(figsize=(10, 6))
plt.barh(sorted_data_2["Word"], sorted_data_2["Count"])
plt.xlabel("Count")
plt.ylabel("Word")
plt.title("Word Counts for Negative Words")
plt.yticks(fontsize=8)  # Smaller font so the word labels do not overlap
plt.show()

In [None]:
dictionary.sort_values(by="Count", ascending=False)

In [None]:
dictionary.T

In [None]:
dictionary.columns = ["Word", "Count"]

In [None]:
COUNTRIES_UNDER_STUDY = {
    "austria",
    "belgium",
    "bulgaria",
    "croatia",
    "cyprus",
    "czechia",
    "denmark",
    "estonia",
    "finland",
    "france",
    "germany",
    "greece",
    "hungary",
    "ireland",
    "italy",
    "latvia",
    "lithuania",
    "luxembourg",
    "malta",
    "netherlands",
    "poland",
    "portugal",
    "romania",
    "slovakia",
    "slovenia",
    "spain",
    "sweden",
}

In [None]:
quarterly_data.columns

In [None]:
quartertly_countries = set(quarterly_data["Country"].unique())

In [None]:
common_countries = COUNTRIES_UNDER_STUDY.intersection(quartertly_countries)

In [None]:
common_countries

# Parameters
Find the parameters at https://data-explorer.oecd.org/vis?df[ds]=dsDisseminateFinalDMZ&df[id]=DSD_NAMAIN1%40DF_QNA_EXPENDITURE_USD&df[ag]=OECD.SDD.NAD&df[vs]=1.0&pd=%2C&dq=Q..AUS.S1..B1GQ.....V..&ly[cl]=TIME_PERIOD&to[TIME_PERIOD]=false&lo=5&lom=LASTNPERIODS 

In [None]:
url = "https://sdmx.oecd.org/public/rest/data/OECD.SDD.TPS,DSD_BOP@DF_BOP,1.0/USA..CA.B..Q.USD_EXC+XDC.N?dimensionAtObservation=AllDimensions"

# Request Data

In [None]:
# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status stops here on an HTTP error instead of parsing an error page.
r = requests.get(url, timeout=60)
r.raise_for_status()

In [None]:
dictionary_data = xmltodict.parse(r.content)

In [None]:
dictionary_data

In [None]:
dictionary_data["message:GenericData"]["message:DataSet"]["generic:Obs"]

In [None]:
observation_dictionary = dictionary_data["message:GenericData"]["message:DataSet"][
    "generic:Obs"
][0]

In [None]:
dictionary_data["message:GenericData"]["message:DataSet"]["generic:Obs"][0][
    "generic:ObsValue"
]

In [None]:
# 1. obskey
obs_key_data = observation_dictionary["generic:ObsKey"]["generic:Value"]
obs_key_dict = {d["@id"]: d["@value"] for d in obs_key_data}

In [None]:
# Obs value
obs_value_dict = observation_dictionary["generic:ObsValue"]

In [None]:
# Obs attributes
obs_attributes_data = observation_dictionary["generic:Attributes"]["generic:Value"]
obs_attributes_dict = {d["@id"]: d["@value"] for d in obs_attributes_data}

In [None]:
full_observation = {**obs_key_dict, **obs_value_dict, **obs_attributes_dict}

In [None]:
pd.DataFrame(full_observation, index=[0])

In [None]:
full_data = pd.DataFrame()

In [None]:
def _ensure_list(node):
    """Wrap a lone parsed XML element in a list (xmltodict only returns a list for repeated elements)."""
    return node if isinstance(node, list) else [node]


# Flatten every observation into one record: key dimensions, value and attributes.
records = []
for observation_dictionary in _ensure_list(
    dictionary_data["message:GenericData"]["message:DataSet"]["generic:Obs"],
):
    # 1. obskey
    obs_key_data = _ensure_list(observation_dictionary["generic:ObsKey"]["generic:Value"])
    obs_key_dict = {d["@id"]: d["@value"] for d in obs_key_data}
    # Obs value
    obs_value_dict = observation_dictionary["generic:ObsValue"]
    # ObsAttributes
    obs_attributes_data = _ensure_list(
        observation_dictionary["generic:Attributes"]["generic:Value"],
    )
    obs_attributes_dict = {d["@id"]: d["@value"] for d in obs_attributes_data}

    full_observation = {**obs_key_dict, **obs_value_dict, **obs_attributes_dict}
    records.append(full_observation)

# Build the frame once (instead of concatenating inside the loop); rows get a
# running index, and re-running the cell no longer appends duplicates.
full_data = pd.DataFrame(records)

In [None]:
full_data.head()

In [None]:
full_data["COUNTERPART_AREA"].unique()

In [None]:
# Set out everything about the request in the format specified by the OECD API
# NOTE(review): `oecd` is only assigned in a later cell (`oecd = pdmx.Request("OECD")`),
# and `pdmx` is not imported anywhere in the visible notebook, so this cell fails on a
# fresh kernel when the notebook is run top to bottom.
data = oecd.data(resource_id="DSD_NAMAIN1").to_pandas()

# Turn the result into a flat DataFrame and preview the first rows
df = pd.DataFrame(data).reset_index()
df.head()

In [None]:
df["MEASURE"]

In [None]:
# Tell pdmx we want OECD data
oecd = pdmx.Request("OECD")
# Set out everything about the request in the format specified by the OECD API
data = oecd.data(
    resource_id="PDB_LV",
    key="GBR+FRA+CAN+ITA+DEU+JPN+USA.T_GDPEMP.CPC/all?startTime=2010",
).to_pandas()

df = pd.DataFrame(data).reset_index()
df.head()

In [None]:
data.content["OECD.SDD.NAD:DSD_NAMAIN1@DF_QNA_EXPENDITURE_CAPITA(1.0)"]

In [None]:
pd.read_xml(
    "https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_QNA_EXPENDITURE_CAPITA,1.0/Q............?startPeriod=2022-Q4&dimensionAtObservation=AllDimensions",
)