<a href="https://colab.research.google.com/github/StephenJudeD/-NYTimes-Fox-News-Climate-Change-Topic-Modelling-BERTopic/blob/main/bertopic_nytimes_fox_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modelling using BERTopic & Climate Articles for NYTIMES & FOX NEWS

In [1]:
#pip install dataprep

In [66]:
pip install bertopic

In [3]:
from google.colab import drive  # Library for Google Drive integration

# Mount your Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
import pandas as pd

In [5]:
with open('/content/drive/MyDrive/climate_articles/data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Create DataFrame
df_data = pd.DataFrame(data)

In [6]:
# Load 'timeline.json'
with open('/content/drive/MyDrive/climate_articles/timeline.json', 'r') as f:
    timeline_data = json.load(f)

# Convert to a DataFrame
df_timeline = pd.DataFrame(timeline_data)

## Cleansing & preprocessing

In [7]:
#from dataprep.eda import create_report
# Generate an Exploratory Data Analysis (EDA) report with DataPrep
#create_report(df_data).show()

In [8]:
# Assuming df is your DataFrame
num_rows = df_data.shape[0]

print("Number of rows in the DataFrame:", num_rows)

Number of rows in the DataFrame: 4786


In [9]:
# Filter True duplicates
potential_duplicates_df = df_data[df_data['isDuplicate'] == True]

# Check for duplicate URLs
duplicate_urls = potential_duplicates_df[potential_duplicates_df.duplicated(subset=['url'], keep=False)]['url']

if not duplicate_urls.empty:
    print("Duplicate urls with True")
    print(duplicate_urls)
else:
    print("No Duplicate urls with True")

No Duplicate urls with True


In [10]:
# Find and print duplicate URLs
duplicate_urls = df_data[df_data.duplicated(subset=['url'], keep=False)]['url']

num_duplicates = duplicate_urls.size

print("Number of duplicate URLs:", num_duplicates)

Number of duplicate URLs: 39


In [11]:
# Delete duplicate rows, keeping the first occurrence
df_data = df_data.drop_duplicates(subset=['url'], keep='first')

In [12]:
import ast

# Delete duplicate rows, keeping the first occurrence
df_data = df_data.drop_duplicates(subset=['url'], keep='first')

# Filter for English
df_data = df_data[df_data['lang'] == 'eng']

# Create a new column
df_data['sourceUri'] = df_data['url'].apply(lambda x: 'nytimes.com' if 'nytimes.com' in x else 'foxnews.com')

# Merge title and body columns
df_data['text'] = df_data['title'] + " " + df_data['body']

In [13]:
# Assuming df is your DataFrame
num_rows = df_data.shape[0]

print("Number of rows in the DataFrame:", num_rows)

Number of rows in the DataFrame: 4607


In [14]:
# DataFrame with only the specified columns
df_data = df_data[['date', 'title', 'body' ,'text', 'sourceUri', 'concepts']]

## climate change more as a proportion of total output over time

In [15]:
# Terms related to climate change
# 'label': {'eng': 'Climate change'}}
climate_terms = ['climate change']

# Function to check if a concept label contains climate-related terms
def is_climate_related(concepts):
    for concept in concepts:
        label = concept.get('label', {}).get('eng', '').lower()
        for term in climate_terms:
            if term in label:
                return True
    return False

# Applying the function to filter climate change related articles
climate_articles = df_data[df_data['concepts'].apply(is_climate_related)]

# Count the number of climate change-related articles
climate_articles_count_data = climate_articles.shape[0]

# Total number of articles from the timeline data
total_articles_timeline = df_timeline['count'].sum()

print("Number of climate change-related articles in the data:", climate_articles_count_data)
print("Total number of articles in the timeline:", total_articles_timeline)

Number of climate change-related articles in the data: 2882
Total number of articles in the timeline: 113578


In [16]:
df_data.head()

Unnamed: 0,date,title,body,text,sourceUri,concepts
0,2023-12-27,"Hawaii woman, 78, becomes 100th victim of Laha...","Despite financial and emotional challenges, Ra...","Hawaii woman, 78, becomes 100th victim of Laha...",foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/Vulnera...
1,2023-12-27,Voters demand more out of Vice President Harri...,"By entering your email, you are agreeing to Fo...",Voters demand more out of Vice President Harri...,foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/Kamala_...
2,2023-12-27,Innovative Storytelling From 2023,"The Times is a newspaper, but it's not only a ...",Innovative Storytelling From 2023 The Times is...,nytimes.com,[{'uri': 'http://en.wikipedia.org/wiki/Ezra_Kl...
3,2023-12-27,Biden anti-consumer crusade targets 4 more typ...,Fox News Flash top headlines are here. Check o...,Biden anti-consumer crusade targets 4 more typ...,foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/U.S._Co...
4,2023-12-26,A Natural Gas Project Is Biden's Next Big Clim...,"David Gelles reported from New York City, Clif...",A Natural Gas Project Is Biden's Next Big Clim...,nytimes.com,[{'uri': 'http://en.wikipedia.org/wiki/Louisia...


In [17]:
df_data_copy = df_data

In [18]:
df_timeline.head()

Unnamed: 0,date,count,sourceUri
0,2024-01-01,34,nytimes.com
1,2024-01-02,30,nytimes.com
2,2024-01-03,67,nytimes.com
3,2024-01-04,74,nytimes.com
4,2024-01-05,40,nytimes.com


In [19]:
df_data_copy['Climate_Articles'] = df_data_copy['concepts'].apply(is_climate_related)

# Group by date and sourceUri and sum the counts of climate change-related articles
climate_articles_count_timeline = df_data_copy.groupby(['date', 'sourceUri'])['Climate_Articles'].sum().reset_index()

# Merge with the timeline DataFrame based on date and sourceUri
df_timeline_with_climate = pd.merge(df_timeline, climate_articles_count_timeline, on=['date', 'sourceUri'], how='left')

print(df_timeline_with_climate)

           date  count    sourceUri  Climate_Articles
0    2024-01-01     34  nytimes.com               NaN
1    2024-01-02     30  nytimes.com               0.0
2    2024-01-03     67  nytimes.com               2.0
3    2024-01-04     74  nytimes.com               2.0
4    2024-01-05     40  nytimes.com               1.0
..          ...    ...          ...               ...
726  2023-12-24     80  foxnews.com               1.0
727  2023-12-25     63  foxnews.com               NaN
728  2023-12-26     96  foxnews.com               NaN
729  2023-12-27    145  foxnews.com               3.0
730  2023-12-28     23  foxnews.com               NaN

[731 rows x 4 columns]


In [20]:
# Calculcations
total_climate_articles_by_outlet = df_timeline_with_climate.groupby('sourceUri')['Climate_Articles'].sum()
total_articles_by_outlet = df_timeline_with_climate.groupby('sourceUri')['count'].sum()
proportions = total_climate_articles_by_outlet / total_articles_by_outlet * 100

# Print the results
print("climate change Articles to total Articles by source:")
for outlet, proportion in proportions.items():
    print(f"{outlet}: {proportion:.2f}%")

climate change Articles to total Articles by source:
foxnews.com: 1.93%
nytimes.com: 3.44%


In [64]:
import plotly.express as px

# Plotly interactive Bar Chart

# Create a DataFrame for proportions
proportions_df = pd.DataFrame({
    'Outlet': proportions.index,
    'Proportion': proportions.values
})

# Define the colors for each outlet
colors = {'foxnews.com': 'red', 'nytimes.com': 'blue'}

# Create a simple bar chart with custom styling
fig = px.bar(proportions_df, x='Outlet', y='Proportion',
             title='Bar Plot Showing Climate Change Articles by Outlet',
             labels={'Proportion': 'Percentage (%)'},
             width=1250,
             color='Outlet',  # Specify the color parameter
             color_discrete_map=colors,  # Map each outlet to its respective color
             opacity=0.6  # Set the opacity
             )

fig.show()

In [65]:
# Visualize the topics over time for fox docs
fig.write_html("/content/drive/MyDrive/github_bert/climate_bar.html")

In [22]:
df_timeline_with_climate.head()

Unnamed: 0,date,count,sourceUri,Climate_Articles
0,2024-01-01,34,nytimes.com,
1,2024-01-02,30,nytimes.com,0.0
2,2024-01-03,67,nytimes.com,2.0
3,2024-01-04,74,nytimes.com,2.0
4,2024-01-05,40,nytimes.com,1.0


## Extreme Weather Articles Over Time

In [23]:
# Terms related to extreme weather
# Reusing Code from above to filter for extreme weather category
extreme_weather_terms = ['joe biden']

# Function to check if a concept label contains extreme weather-related terms
def is_extreme_weather_related(concepts):
    for concept in concepts:
        label = concept.get('label', {}).get('eng', '').lower()
        for term in extreme_weather_terms:
            if term in label:
                return True
    return False

# Applying the function to filter extreme weather related articles
extreme_weather_articles = df_data[df_data['concepts'].apply(is_extreme_weather_related)]

# Count the number of extreme weather-related articles
extreme_weather_articles_count_data = extreme_weather_articles.shape[0]

# Total number of articles from the timeline data
total_articles_timeline = df_timeline['count'].sum()

print("Number of extreme weather-related articles in the data:", extreme_weather_articles_count_data)
print("Total number of articles in the timeline:", total_articles_timeline)


Number of extreme weather-related articles in the data: 1723
Total number of articles in the timeline: 113578


In [24]:
import pandas as pd
import plotly.express as px

# Convert 'date' column to datetime type
df_timeline_with_extreme_weather = extreme_weather_articles.copy()
df_timeline_with_extreme_weather['date'] = pd.to_datetime(df_timeline_with_extreme_weather['date'])

# Group by outlet and week, and sum the number of extreme weather articles
df_weekly_extreme_weather = df_timeline_with_extreme_weather.groupby(['sourceUri', pd.Grouper(key='date', freq='W')])['Climate_Articles'].sum().reset_index()

# Line plot
fig = px.line(df_weekly_extreme_weather, x='date', y='Climate_Articles', color='sourceUri',
              title='Number of Extreme Weather Articles Over Time (Weekly)',
              labels={'date': 'Date', 'Climate_Articles': 'Number of Articles', 'sourceUri': 'Outlet'})

# Set custom tick values for x-axis to display every week
fig.update_xaxes(
    tickmode='linear',
    dtick='M1',  # Set tick interval to every month
    tickformat='%b\n%Y',  # Display month and year
    showgrid=True,
    gridcolor='lightgray',
    gridwidth=1
)

# Apply style
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=9, label="9m", step="month", stepmode="backward"),
            dict(step="all")
        ])
    ),
    tickformat="%b\n%Y",
)

# Customize line colors and style
fig.update_traces(line=dict(width=3)) # Keep lines a bit thicker

# Choose a color palette from Color Hunt: https://colorhunt.co/
color_palette = ['#88C0D0', '#E58E26', '#4DB6AC']

fig.for_each_trace(
    lambda trace: trace.update(line=dict(color=color_palette[fig.data.index(trace)]))
)

# Set plot title and label fonts
fig.update_layout(
    title_font=dict(size=22, color='black', family='Arial'),
    legend_font=dict(size=14, color='black', family='Comic Sans'),
    legend_title_font=dict(size=16, color='black'),
    plot_bgcolor='rgba(250, 250, 250, 0.95)'
)

fig.show()


In [62]:
import pandas as pd
import plotly.express as px

# Convert 'date' column to datetime type
df_timeline_with_extreme_weather = extreme_weather_articles.copy()
df_timeline_with_extreme_weather['date'] = pd.to_datetime(df_timeline_with_extreme_weather['date'])

# Group by outlet and week, and sum the number of extreme weather articles
df_weekly_extreme_weather = df_timeline_with_extreme_weather.groupby(['sourceUri', pd.Grouper(key='date', freq='W')])['Climate_Articles'].sum().reset_index()

# Line plot
fig = px.line(df_weekly_extreme_weather, x='date', y='Climate_Articles', color='sourceUri',
              title='Number of Extreme Weather Articles Over Time (Weekly)',
              labels={'date': 'Date', 'Climate_Articles': 'Number of Articles', 'sourceUri': 'Outlet'},
              template='plotly_dark')  # Dark theme for a modern look


# Set custom tick values for x-axis to display every week
fig.update_xaxes(
    tickmode='linear',
    dtick='M1',  # Set tick interval to every month
    tickformat='%b\n%Y',  # Display month and year
    showgrid=True,
    gridcolor='rgba(255, 255, 255, 0.05)',  # Lighter grid color
    gridwidth=1,
    rangeslider_visible=True  # Ensure the range slider is visible
)

# Apply style
fig.update_xaxes(
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=9, label="9m", step="month", stepmode="backward"),
            dict(step="all")
        ]),
        bgcolor='rgba(0, 0, 0, 0.5)',  # Semi-transparent black background
        bordercolor='rgba(255, 255, 255, 0.5)',  # Semi-transparent white border
        font=dict(color='white')  # White font color
    ),
    tickformat="%b\n%Y",
)

# Smoother lines
fig.update_traces(line=dict(width=4, smoothing=0.5))  # Increase line width and apply some smoothing

# Choose a color palette from ColorBrewer: https://colorbrewer2.org/
color_palette = px.colors.qualitative.Dark24

fig.for_each_trace(
    lambda trace: trace.update(line=dict(color=color_palette[fig.data.index(trace) % len(color_palette)]))  # Apply color palette
)

# Set plot title and label fonts
fig.update_layout(
    title_font=dict(size=28, color='white', family='Arial'),  # Larger title font and white color
    legend_font=dict(size=16, color='white', family='Open Sans'),  # Larger legend font and white color
    legend_title_font=dict(size=18, color='white'),  # Larger legend title font and white color
    plot_bgcolor='rgba(0, 0, 0, 0)',  # Transparent plot background
    paper_bgcolor='rgba(0, 0, 0, 0)'  # Transparent paper background
)

fig.show()

In [63]:
# Visualize the topics over time for fox docs
fig.write_html("/content/drive/MyDrive/github_bert/extreme_weather.html")

## Topic Modelling overtime with BERTopic



https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html#example # Key example for topics overtime

https://www.sbert.net/docs/pretrained_models.html # sentence trasnformer choices


https://maartengr.github.io/BERTopic/algorithm/algorithm.html # berTopic

https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html # guide to embeddings

https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_topics.html # inbuilt visualisations



In [26]:
df_data.head()

Unnamed: 0,date,title,body,text,sourceUri,concepts,Climate_Articles
0,2023-12-27,"Hawaii woman, 78, becomes 100th victim of Laha...","Despite financial and emotional challenges, Ra...","Hawaii woman, 78, becomes 100th victim of Laha...",foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/Vulnera...,True
1,2023-12-27,Voters demand more out of Vice President Harri...,"By entering your email, you are agreeing to Fo...",Voters demand more out of Vice President Harri...,foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/Kamala_...,True
2,2023-12-27,Innovative Storytelling From 2023,"The Times is a newspaper, but it's not only a ...",Innovative Storytelling From 2023 The Times is...,nytimes.com,[{'uri': 'http://en.wikipedia.org/wiki/Ezra_Kl...,False
3,2023-12-27,Biden anti-consumer crusade targets 4 more typ...,Fox News Flash top headlines are here. Check o...,Biden anti-consumer crusade targets 4 more typ...,foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/U.S._Co...,True
4,2023-12-26,A Natural Gas Project Is Biden's Next Big Clim...,"David Gelles reported from New York City, Clif...",A Natural Gas Project Is Biden's Next Big Clim...,nytimes.com,[{'uri': 'http://en.wikipedia.org/wiki/Louisia...,True


In [27]:
# Print the first element of the concepts column
print(df_data['concepts'].iloc[0])

[{'uri': 'http://en.wikipedia.org/wiki/Vulnerable_species', 'type': 'wiki', 'score': 5, 'label': {'eng': 'Vulnerable species'}}, {'uri': 'http://en.wikipedia.org/wiki/Wildfire', 'type': 'wiki', 'score': 5, 'label': {'eng': 'Wildfire'}}, {'uri': 'http://en.wikipedia.org/wiki/Cough', 'type': 'wiki', 'score': 5, 'label': {'eng': 'Cough'}}, {'uri': 'http://en.wikipedia.org/wiki/Maui', 'type': 'loc', 'score': 5, 'label': {'eng': 'Maui'}, 'location': {'type': 'place', 'label': {'eng': 'Maui'}, 'country': {'type': 'country', 'label': {'eng': 'United States'}}}}, {'uri': 'http://en.wikipedia.org/wiki/Lahaina,_Hawaii', 'type': 'loc', 'score': 5, 'label': {'eng': 'Lahaina, Hawaii'}, 'location': {'type': 'place', 'label': {'eng': 'Lahaina, Hawaii'}, 'country': {'type': 'country', 'label': {'eng': 'United States'}}}}, {'uri': 'http://en.wikipedia.org/wiki/Calico', 'type': 'wiki', 'score': 4, 'label': {'eng': 'Calico'}}, {'uri': 'http://en.wikipedia.org/wiki/Tropical_cyclone', 'type': 'wiki', 'scor

In [28]:
# Terms related to climate change (case-insensitive)
climate_terms = ['climate change']


def is_climate_related(concepts):
    for concept in concepts:
        if concept.get('score') in [4, 5]:  # Check for score 4 or 5
            label = concept.get('label', {}).get('eng', '').lower()
            for term in climate_terms:
                if term in label:
                    return True  # Return True immediately if a match is found
    return False  # Return False if no match is found


def create_climate_article_column(df):
    df['Climate_Article_4_or_5'] = df['concepts'].apply(is_climate_related)
    return df


# Apply the function to create a new column
df_data = create_climate_article_column(df_data.copy())  # Operate on a copy

# Print the first 5 rows of the dataframe (including the new column)
print(df_data.head())

         date                                              title  \
0  2023-12-27  Hawaii woman, 78, becomes 100th victim of Laha...   
1  2023-12-27  Voters demand more out of Vice President Harri...   
2  2023-12-27                  Innovative Storytelling From 2023   
3  2023-12-27  Biden anti-consumer crusade targets 4 more typ...   
4  2023-12-26  A Natural Gas Project Is Biden's Next Big Clim...   

                                                body  \
0  Despite financial and emotional challenges, Ra...   
1  By entering your email, you are agreeing to Fo...   
2  The Times is a newspaper, but it's not only a ...   
3  Fox News Flash top headlines are here. Check o...   
4  David Gelles reported from New York City, Clif...   

                                                text    sourceUri  \
0  Hawaii woman, 78, becomes 100th victim of Laha...  foxnews.com   
1  Voters demand more out of Vice President Harri...  foxnews.com   
2  Innovative Storytelling From 2023 The Times 

In [29]:
# Drop the misspelled column (optional)
df_data.head()

Unnamed: 0,date,title,body,text,sourceUri,concepts,Climate_Articles,Climate_Article_4_or_5
0,2023-12-27,"Hawaii woman, 78, becomes 100th victim of Laha...","Despite financial and emotional challenges, Ra...","Hawaii woman, 78, becomes 100th victim of Laha...",foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/Vulnera...,True,False
1,2023-12-27,Voters demand more out of Vice President Harri...,"By entering your email, you are agreeing to Fo...",Voters demand more out of Vice President Harri...,foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/Kamala_...,True,False
2,2023-12-27,Innovative Storytelling From 2023,"The Times is a newspaper, but it's not only a ...",Innovative Storytelling From 2023 The Times is...,nytimes.com,[{'uri': 'http://en.wikipedia.org/wiki/Ezra_Kl...,False,False
3,2023-12-27,Biden anti-consumer crusade targets 4 more typ...,Fox News Flash top headlines are here. Check o...,Biden anti-consumer crusade targets 4 more typ...,foxnews.com,[{'uri': 'http://en.wikipedia.org/wiki/U.S._Co...,True,True
4,2023-12-26,A Natural Gas Project Is Biden's Next Big Clim...,"David Gelles reported from New York City, Clif...",A Natural Gas Project Is Biden's Next Big Clim...,nytimes.com,[{'uri': 'http://en.wikipedia.org/wiki/Louisia...,True,False


In [30]:
# Filter the data for climate-related articles based on the 'climate_articles' column
climate_articles = df_data[df_data['Climate_Article_4_or_5'] == True]

# Count the number of climate-related articles
num_climate_articles = len(climate_articles)
print("Number of climate-related articles with a Score of 4 or 5 (Intensity):", num_climate_articles)

# Display the distribution of climate-related articles by sourceUri
climate_articles_distribution = climate_articles['sourceUri'].value_counts()
print("\nClimate-related articles distribution by sourceUri:\n", climate_articles_distribution)

# Filter NYTimes documents related to climate
nytimes_docs = climate_articles[climate_articles['sourceUri'] == 'nytimes.com'][['date', 'text']]

# Filter FoxNews documents related to climate
foxnews_docs = climate_articles[climate_articles['sourceUri'] == 'foxnews.com'][['date', 'text']]


Number of climate-related articles with a Score of 4 or 5 (Intensity): 769

Climate-related articles distribution by sourceUri:
 sourceUri
nytimes.com    477
foxnews.com    292
Name: count, dtype: int64


## Bertopic Overtime

In [32]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import pandas as pd

# Initialize SentenceTransformer model
sentence_model = SentenceTransformer("all-distilroberta-v1")

# Define English stop words
stop_words = 'english'

# Preprocess documents using CountVectorizer with stop words removal
count_vectorizer = CountVectorizer(stop_words=stop_words)
nytimes_vectorized = count_vectorizer.fit_transform(nytimes_docs['text'])
foxnews_vectorized = count_vectorizer.transform(foxnews_docs['text'])

# Convert sparse matrix to list of strings
nytimes_texts = [' '.join(row.split()) for row in nytimes_docs['text']]
foxnews_texts = [' '.join(row.split()) for row in foxnews_docs['text']]

# Create and train a BERTopic model for NYTimes documents
nytimes_model = BERTopic(calculate_probabilities=True, verbose=True, embedding_model=sentence_model)
nytimes_topics, _ = nytimes_model.fit_transform(nytimes_texts)

# Update topics to remove stop words
nytimes_model.update_topics(nytimes_texts, vectorizer_model=count_vectorizer)

# Generate topic representations over time for NYTimes documents (without top_n_topics)
nytimes_topics_over_time = nytimes_model.topics_over_time(
    nytimes_texts,  # Pass the list of strings instead of the DataFrame
    nytimes_docs['date'].tolist(),
    nr_bins=20
)

# Visualize ALL the topics over time for NYTimes documents
nytimes_model.visualize_topics_over_time(nytimes_topics_over_time)

# Create and train a BERTopic model for FoxNews documents
foxnews_model = BERTopic(calculate_probabilities=True, verbose=True, embedding_model=sentence_model)
foxnews_topics, _ = foxnews_model.fit_transform(foxnews_texts)

# Update topics to remove stop words
foxnews_model.update_topics(foxnews_texts, vectorizer_model=count_vectorizer)

# Generate topic representations over time for FoxNews documents (without top_n_topics)
foxnews_topics_over_time = foxnews_model.topics_over_time(
    foxnews_texts,  # Pass the list of strings instead of the DataFrame
    foxnews_docs['date'].tolist(),
    nr_bins=20
)

# Visualize ALL the topics over time for FoxNews documents
foxnews_model.visualize_topics_over_time(foxnews_topics_over_time)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2024-05-04 11:18:23,693 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

2024-05-04 11:18:36,232 - BERTopic - Embedding - Completed ✓
2024-05-04 11:18:36,240 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-04 11:18:46,706 - BERTopic - Dimensionality - Completed ✓
2024-05-04 11:18:46,708 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-04 11:18:46,749 - BERTopic - Cluster - Completed ✓
2024-05-04 11:18:46,755 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-04 11:18:47,539 - BERTopic - Representation - Completed ✓
20it [00:01, 12.21it/s]
2024-05-04 11:18:50,573 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-05-04 11:18:55,831 - BERTopic - Embedding - Completed ✓
2024-05-04 11:18:55,832 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-04 11:18:59,556 - BERTopic - Dimensionality - Completed ✓
2024-05-04 11:18:59,557 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-04 11:18:59,579 - BERTopic - Cluster - Completed ✓
2024-05-04 11:18:59,584 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-04 11:18:59,905 - BERTopic - Representation - Completed ✓
20it [00:00, 25.86it/s]


## Topics Over Time & Intertopic Distribution

In [47]:
def visualize_and_display_topics_over_time(model_name, model_object, top_n_topics=20, output_file=None):
  """
  Visualizes topics over time for a given model, displays the plot with a title, and optionally saves it as HTML.

  Args:
      model_name (str): Name of the model (e.g., "nytimes_model", "foxnews_model").
      model_object (object): The model object containing the `visualize_topics_over_time()` method (or similar).
      top_n_topics (int, optional): Number of top topics to display. Defaults to 20.
      output_file (str, optional): Path to save the visualization as HTML. Defaults to None (no saving).
  """
  # Generate the visualization
  fig = model_object.visualize_topics_over_time(nytimes_topics_over_time if model_name == "nytimes_model" else foxnews_topics_over_time, top_n_topics=top_n_topics)

  # Set informative title
  title = f"{model_name.title()} - Topics Over Time (Top {top_n_topics})"
  fig.update_layout(title=title)

  # Display the plot
  fig.show()

  # Save as HTML (optional)
  if output_file:
    fig.write_html(output_file)

def visualize_and_display_topics(model_name, model_object, output_file=None):
  """
  Visualizes topics for a given model, displays the plot with a title, and optionally saves it as HTML.

  Args:
      model_name (str): Name of the model (e.g., "nytimes_model", "foxnews_model").
      model_object (object): The model object containing the `visualize_topics()` method.
      output_file (str, optional): Path to save the visualization as HTML. Defaults to None (no saving).
  """
  # Generate the visualization
  fig = model_object.visualize_topics()

  # Set informative title
  title = f"{model_name.title()} - Topic Modeling"
  fig.update_layout(title=title)

  # Display the plot
  fig.show()

  # Save as HTML (optional)
  if output_file:
    fig.write_html(output_file)

In [48]:
# Visualize topics over time for NYT model with title and optional saving
visualize_and_display_topics_over_time("nytimes_model", nytimes_model, top_n_topics=20, output_file="/content/drive/MyDrive/github_bert/nytimes_dtm.html")

In [49]:
# Visualize topics over time for Fox News model with title and optional saving
visualize_and_display_topics_over_time("foxnews_model", foxnews_model, top_n_topics=20, output_file="/content/drive/MyDrive/github_bert/foxnews_dtm.html")

In [50]:
# Visualize topics for NYT model with title and optional saving
visualize_and_display_topics("nytimes_model", nytimes_model, output_file="/content/drive/MyDrive/github_bert/nytimes_topics.html")

In [51]:
# Visualize topics for Fox News model with title and optional saving
visualize_and_display_topics("foxnews_model", foxnews_model, output_file="/content/drive/MyDrive/github_bert/foxnews_topics.html")

## Heatmap & Bar Chart

In [52]:
def visualize_and_display(model_name, model_object, plot_type, output_file=None):
  """
  Visualizes a plot (heatmap or barchart) for a given model, displays it with a title, and optionally saves it as HTML.

  Args:
      model_name (str): Name of the model (e.g., "nytimes_model", "foxnews_model").
      model_object (object): The model object containing the visualization methods.
      plot_type (str): Type of plot to visualize ("heatmap" or "barchart").
      output_file (str, optional): Path to save the visualization as HTML. Defaults to None (no saving).
  """
  # Generate the visualization
  if plot_type == "heatmap":
    fig = model_object.visualize_heatmap()
  elif plot_type == "barchart":
    fig = model_object.visualize_barchart()
  else:
    raise ValueError(f"Invalid plot type: {plot_type}. Supported types: 'heatmap', 'barchart'")

  # Set informative title
  title = f"{model_name.title()} - {plot_type.title()}"
  fig.update_layout(title=title)

  # Display the plot
  fig.show()

  # Save as HTML (optional)
  if output_file:
    fig.write_html(output_file)

In [53]:
# Visualize heatmap for NYT model with title and optional saving
visualize_and_display("nytimes_model", nytimes_model, plot_type="heatmap", output_file="/content/drive/MyDrive/github_bert/nytimes_heatmap.html")

In [54]:
# Visualize heatmap for Fox News model with title and optional saving
visualize_and_display("foxnews_model", foxnews_model, plot_type="heatmap", output_file="/content/drive/MyDrive/github_bert/foxnews_heatmap.html")

In [55]:
# Visualize barchart for NYT model with title and optional saving
visualize_and_display("nytimes_model", nytimes_model, plot_type="barchart", output_file="/content/drive/MyDrive/github_bert/nytimes_barchart.html")

In [56]:
# Visualize barchart for Fox News model with title and optional saving
visualize_and_display("foxnews_model", foxnews_model, plot_type="barchart", output_file="/content/drive/MyDrive/github_bert/foxnews_barchart.html")

## Display Documents behind the Data

In [57]:
def visualize_and_display_documents(model_name, model_object, documents, title, output_file=None):
  """
  Visualizes documents using a model's visualization method, displays it with a title, and optionally saves it as HTML.

  Args:
      model_name (str): Name of the model (e.g., "nytimes_model", "foxnews_model").
      model_object (object): The model object containing the visualization method.
      documents (list): List of documents to visualize.
      title (str): Title for the visualization.
      output_file (str, optional): Path to save the visualization as HTML. Defaults to None (no saving).
  """
  # Generate the visualization
  fig = model_object.visualize_documents(documents)

  # Set informative title
  fig.update_layout(title=title)

  # Display the plot
  fig.show()

  # Save as HTML (optional)
  if output_file:
    fig.write_html(output_file)

In [58]:
# Visualize NYT documents with title and optional saving
visualize_and_display_documents("nytimes_model", nytimes_model, nytimes_texts, title="New York Times Document Visualization", output_file="/content/drive/MyDrive/github_bert/nytimes_dtm_documents.html")

In [59]:
# Visualize Fox News documents with title and optional saving
visualize_and_display_documents("foxnews_model", foxnews_model, foxnews_texts, title="Fox News Document Visualization", output_file="/content/drive/MyDrive/github_bert/foxnews_dtm_documents.html")