# Topic Distribution 

In [None]:
from pathlib import Path

import pandas as pd
import plotly.express as px

In [None]:
# Root of the local project checkout. Every input file below lives underneath
# it, so pointing the notebook at another checkout only requires changing this
# one line.
PROJECT_ROOT = Path(
    "/Users/nicolasroever/Documents/Promotion/Debt_Crisis/debt_crisis",
)
TOPIC_DATA_DIR = (
    PROJECT_ROOT / "src" / "debt_crisis" / "data" / "llm_reasoning_with_topics"
)
BUILD_DATA_DIR = PROJECT_ROOT / "bld" / "data"

# Main table analysed in this notebook.
data = pd.read_csv(TOPIC_DATA_DIR / "llm_reasoning_with_topics.csv")

# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted build of this project.
clean_transcripts = pd.read_pickle(BUILD_DATA_DIR / "df_transcripts_raw.pkl")

# Only the two identifier columns are kept; they link a snippet to its
# transcript.
clean_snippets_with_date = pd.read_pickle(
    BUILD_DATA_DIR
    / "gpt_sentiment_data"
    / "df_gpt_sentiment_training_dataset_cleaned.pkl",
)[["Transcript_ID", "Snippet_ID"]]

# Semicolon-separated table of topic words.
topic_words = pd.read_csv(TOPIC_DATA_DIR / "topic_words.csv", sep=";")

In [None]:
# Restrict the topic/word table to the 40-topic model; its rows are later
# joined to `data` through the "Topic_40" column.
N_TOPICS = 40
# Number of leading words kept as a short, readable topic label.
N_LABEL_WORDS = 3

# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the unfiltered table (pandas' SettingWithCopyWarning).
topic_words = topic_words.loc[topic_words["Number of Topics"] == N_TOPICS].copy()

# Shorten each whitespace-separated word list to its first few words.
topic_words["Top Words"] = topic_words["Top Words"].apply(
    lambda words: " ".join(words.split()[:N_LABEL_WORDS]),
)

In [None]:
# Attach the transcript identifier to each row of `data` via "Snippet_ID".
# Left join: every row of `data` is kept, matched or not.
# validate="one_to_one" makes pandas raise if "Snippet_ID" is duplicated on
# either side of the join.
data = pd.merge(
    data,
    clean_snippets_with_date,
    on="Snippet_ID",
    how="left",
    validate="one_to_one",
)

In [None]:
# Bring in the transcript-level columns, matched on "Transcript_ID".
# Left join: rows of `data` without a matching transcript are kept.
# validate="many_to_one" makes pandas raise if "Transcript_ID" is not unique
# in `clean_transcripts`.
data = pd.merge(
    data,
    clean_transcripts,
    on="Transcript_ID",
    how="left",
    validate="many_to_one",
)

In [None]:
# Look up the topic-word columns for each row: the row's "Topic_40" value is
# matched against "Topic Index" in `topic_words`.
# validate="many_to_one" makes pandas raise if "Topic Index" is not unique in
# `topic_words`.
data = pd.merge(
    data,
    topic_words,
    left_on="Topic_40",
    right_on="Topic Index",
    how="left",
    validate="many_to_one",
)

In [None]:
# For every row, store how many rows share its "Topic Index" value.
# transform("count") broadcasts the per-group count back onto each row.
topic_index_by_topic = data.groupby("Topic Index")["Topic Index"]
data["Total_Occurrence_of_Topic"] = topic_index_by_topic.transform("count")

In [None]:
# Preview the first five rows of the merged table.
data.head(n=5)

In [None]:
# Rows are kept only if their topic occurs more than this many times.
MIN_TOPIC_OCCURRENCES = 1000

# .copy() detaches the filtered rows from the unfiltered frame, so columns
# assigned to `data` afterwards do not write into a slice of it (pandas'
# SettingWithCopyWarning).
data = data.loc[data["Total_Occurrence_of_Topic"] > MIN_TOPIC_OCCURRENCES].copy()

In [None]:
# Count the non-missing "Topic Index" entries for each "Top Words" label and
# return them as a two-column table: the label and its count.
occurrences_per_label = data.groupby("Top Words")["Topic Index"].count()
df_top_words_occurrence = occurrences_per_label.reset_index(
    name="Total Occurrence of Topic",
)

In [None]:
# Horizontal bar chart: one bar per "Top Words" label, bar length = number of
# occurrences of that topic.
label_col = "Top Words"
count_col = "Total Occurrence of Topic"

fig = px.bar(
    df_top_words_occurrence,
    x=count_col,
    y=label_col,
    orientation="h",
    title="Topic 40 Analysis",
    labels={label_col: "Top Words", count_col: "Total Occurrence of Topic"},
)

# Figure-level settings: axis titles, untilted y tick labels, the gap between
# bars, and a taller canvas so the labels have room.
layout_options = {
    "yaxis_tickangle": 0,
    "yaxis_title": "Top Words",
    "xaxis_title": "Total Occurrence of Topic",
    "bargap": 0.2,
    "height": 800,
}
fig.update_layout(**layout_options)

# Bar appearance: light-blue fill, dark-blue outline, partly transparent.
bar_style = {
    "marker_color": "rgb(158,202,225)",
    "marker_line_color": "rgb(8,48,107)",
    "marker_line_width": 1.5,
    "opacity": 0.6,
}
fig.update_traces(**bar_style)

# Order the y-axis categories by their bar totals.
fig.update_yaxes(categoryorder="total descending")

# Render the figure.
fig.show()

In [None]:
# Add up the "Prediction" values for each "Top Words" label and return them as
# a two-column table: the label and its summed prediction.
prediction_sum_per_label = data.groupby("Top Words")["Prediction"].sum()
df_topic_scores = prediction_sum_per_label.reset_index(name="Sum of Prediction")

In [None]:
# Preview the first five rows of the per-topic prediction sums.
df_topic_scores.head(n=5)

In [None]:
# Horizontal bar chart: one bar per "Top Words" label, bar length = summed
# "Prediction" of that topic.
label_col = "Top Words"
score_col = "Sum of Prediction"

fig = px.bar(
    df_topic_scores,
    x=score_col,
    y=label_col,
    orientation="h",
    title="Topic 40 Analysis",
    labels={label_col: "Top Words", score_col: "Sum of Prediction"},
)

# Figure-level settings: axis titles, untilted y tick labels, the gap between
# bars, and a taller canvas so the labels have room.
layout_options = {
    "yaxis_tickangle": 0,
    "yaxis_title": "Top Words",
    "xaxis_title": "Total Sum of GPT Scores",
    "bargap": 0.2,
    "height": 800,
}
fig.update_layout(**layout_options)

# Bar appearance: light-blue fill, dark-blue outline, partly transparent.
bar_style = {
    "marker_color": "rgb(158,202,225)",
    "marker_line_color": "rgb(8,48,107)",
    "marker_line_width": 1.5,
    "opacity": 0.6,
}
fig.update_traces(**bar_style)

# Order the y-axis categories by their bar totals.
fig.update_yaxes(categoryorder="total descending")

# Render the figure.
fig.show()

In [None]:
# Topics need more than this many rows to be included in the time series.
MIN_OCCURRENCES_FOR_TIME_SERIES = 4500

# Parse "Date" as datetimes; values that cannot be parsed become NaT
# (errors="coerce").
parsed_dates = pd.to_datetime(data["Date"], errors="coerce")

# assign() returns a new frame, so this cell never writes into a filtered
# slice of an earlier frame. "Year_Quarter" is the quarter as a string
# (e.g. "2011Q3"); rows with a NaT date get the literal string "NaT".
data = data.assign(
    Date=parsed_dates,
    Year_Quarter=parsed_dates.dt.to_period("Q").astype(str),
)

# Keep frequent topics only, and drop rows whose date could not be parsed:
# otherwise they would be aggregated into a bogus "NaT" quarter and show up
# as an extra point on the time axis.
is_frequent_topic = data["Total_Occurrence_of_Topic"] > MIN_OCCURRENCES_FOR_TIME_SERIES
has_valid_date = data["Date"].notna()
data_filter = data[is_frequent_topic & has_valid_date]

# Sum "Prediction" per topic label and quarter; the result has the columns
# "Top Words", "Year_Quarter" and "Sum of Prediction".
df_quarterly_sum = (
    data_filter.groupby(["Top Words", "Year_Quarter"])["Prediction"]
    .sum()
    .reset_index(name="Sum of Prediction")
)

In [None]:
# Line chart of the quarterly prediction sums: one line per "Top Words" label,
# quarters on the x-axis.
axis_labels = {
    "Year_Quarter": "Quarter",
    "Sum of Prediction": "Sum of Predictions",
}
chart_title = (
    "Sum of Predictions per Top Words Over Time (only topics with over 4500 occurences)"
)

fig = px.line(
    df_quarterly_sum,
    x="Year_Quarter",
    y="Sum of Prediction",
    color="Top Words",
    title=chart_title,
    labels=axis_labels,
)

# Render the figure.
fig.show()