In [1]:
from dash import dcc, html
from dash.dependencies import Input, Output
from wordcloud import WordCloud

import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
import base64
import dash
import io

In [2]:
df = pd.read_csv("../../Datasets/bbc_news_preprocessed_UMAP.csv")
docs = list(df['docs'])

In [3]:
def createUMAP(dataFrame):
    """Build the document scatter plot with one trace per topic.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame providing the columns ``x``, ``y``, ``topics``, ``topicName``
        and ``title`` (the only columns read here). ``x``/``y`` are used as
        plot coordinates -- presumably a 2-D UMAP projection, going by the
        function name; confirm against the preprocessing step.

    Returns
    -------
    plotly.graph_objects.Figure
        A figure holding one ``Scattergl`` trace per distinct ``topics``
        value, a text annotation at the mean ``x``/``y`` of every
        ``topicName``, and hidden axes.
    """
    fig = go.Figure()

    uniqueTopics = np.sort(dataFrame['topics'].unique())

    # Centroids must come from the frame that was passed in; reading the
    # module-level ``df`` here would mislabel any other frame.
    centroids = dataFrame[['x', 'y', 'topicName']].groupby('topicName').mean()

    for rank, topic in enumerate(uniqueTopics):
        topicData = dataFrame[dataFrame['topics'] == topic]
        # Colour is derived from the topic's position in sorted order, so the
        # same set of topics always yields the same palette.
        color = f"rgba({rank * 30 % 255}, {(rank * 60 + 100) % 255}, {(rank * 90 + 150) % 255}, 0.6)"
        fig.add_trace(go.Scattergl(
            x=topicData['x'],
            y=topicData['y'],
            mode='markers',
            hovertext=topicData['title'],
            hoverinfo="text",
            # Legend entry uses the name found on the topic's first row.
            name=topicData['topicName'].iloc[0],
            showlegend=True,
            marker=dict(color=color)
        ))

    # Label each topic at the mean position of its documents.
    for centroid in centroids.itertuples():
        fig.add_annotation(
            x=centroid.x, y=centroid.y,
            text=centroid.Index,
            showarrow=False,
            font=dict(size=10, color='black')
        )

    fig.update_layout(
        template="simple_white",
        title={
            'text': "Documents and Topics",
            'x': 0.5,
            'font': {'size': 15}
        },
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    return fig

In [12]:
def createHistogram(data, title):
    """Return a red histogram of ``shortPubDate`` values taken from ``data``.

    Parameters
    ----------
    data : DataFrame-like
        Passed straight to ``px.histogram``; must expose ``shortPubDate``.
    title : str
        Figure title, centred horizontally.

    Returns
    -------
    The figure produced by ``px.histogram`` after layout and hover styling.
    """
    axisLabels = {
        "shortPubDate": "Publication Date (Month)",
        "amount": "Document Count"
    }
    hoverLines = (
        "Publication Date: %{x}",
        "Document Count: %{y}",
    )

    histogram = px.histogram(
        data,
        x="shortPubDate",
        title=title,
        labels=axisLabels,
        color_discrete_sequence=['red']
    )

    # Axis titles are set explicitly here in addition to the labels above.
    layoutOptions = dict(
        template="simple_white",
        xaxis_title="Publication Date (Month)",
        yaxis_title="Document Count",
        font=dict(size=10, color='black'),
        title_x=0.5,
    )
    histogram.update_layout(**layoutOptions)

    # One hover line per entry, separated by an HTML line break.
    histogram.update_traces(hovertemplate="<br>".join(hoverLines))

    return histogram

In [5]:
def createWordCloud(topicName):
    """Generate a word cloud from every document of the named topic.

    The topic name is resolved to its ``topics`` id via the first matching
    row of the module-level ``df``; all rows carrying that id contribute
    their ``docs`` text.

    Parameters
    ----------
    topicName : str
        A value present in ``df['topicName']``.

    Returns
    -------
    wordcloud.WordCloud
        A 650x325 white-background cloud generated from the joined documents.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given ``topicName``.
    """
    topicNumber = df[df['topicName'] == topicName]['topics'].iat[0]
    topicDocs = df.loc[df['topics'] == topicNumber, 'docs']

    # ``df`` is loaded with read_csv, which turns empty cells into NaN; a NaN
    # in the sequence would make str.join raise TypeError, so skip missing
    # documents and coerce the rest to text.
    textDocuments = '\n'.join(topicDocs.dropna().astype(str))

    return WordCloud(background_color='white', width=650, height=325).generate(textDocuments)

In [6]:
def encodeImage(image_file):
    """Return the contents of ``image_file`` as a base64 PNG data URI.

    The file is read in binary mode and its bytes are embedded unchanged;
    the URI is always labelled ``image/png``.
    """
    with open(image_file, mode='rb') as handle:
        rawBytes = handle.read()
    payload = base64.b64encode(rawBytes).decode()
    return f"data:image/png;base64,{payload}"

In [14]:
# The scatter plot is built once at start-up and reused for every page load.
umap_fig = createUMAP(df)
app = dash.Dash(__name__)
# NOTE(review): not referenced by any code visible in this notebook -- confirm before removing.
currentTopic = ''

# Placeholder shown in the word-cloud slot until a point has been clicked.
defaultImageSrc = encodeImage('../../Images/BBCNewsLogo.png')

# Layout: scatter plot on the left (60 %), word cloud above histogram on the
# right (40 %). Style keys are camelCase throughout, and lengths carry an
# explicit unit (a bare '12' string is not a valid CSS font size).
app.layout = html.Div([
    html.Div([
        dcc.Graph(id='umap-graph', figure=umap_fig, style={'flexBasis': '60%', 'height': '100%'}),
        html.Div([
            html.Div([
                html.P(id='wc-title', style={'textAlign': 'center', 'fontFamily': 'Arial', 'fontSize': '12px', 'color': 'black', 'fontWeight': 'normal'}),
                html.Img(id='image_wc', style={'height': 'calc(100% - 40px)', 'width': '100%'})
            ], style={'height': '50%', 'display': 'flex', 'flexDirection': 'column', 'alignItems': 'center'}),
            dcc.Graph(id='histogram', style={'height': '50%'})

        ], style={'flexBasis': '40%', 'height': '100%', 'display': 'flex', 'flexDirection': 'column'})
    ], style={'display': 'flex', 'flexDirection': 'row', 'width': '100vw', 'height': '100vh'})
])

@app.callback(
    [Output('histogram', 'figure'),
     Output('wc-title', 'children'),
     Output('image_wc', 'src')],
    [Input('umap-graph', 'clickData')]
)
def displayInfo(clickData):
    """Refresh the histogram, word-cloud title and word-cloud image.

    Parameters
    ----------
    clickData : dict or None
        ``clickData`` of the ``umap-graph`` component; ``None`` until a point
        has been clicked.

    Returns
    -------
    tuple
        ``(histogram figure, word-cloud title, image src)``. With no click the
        all-topics histogram, an empty title and the default image are
        returned. For a click that cannot be resolved to a document the three
        outputs are left untouched via ``dash.no_update``.
    """
    if clickData is None:
        # Sorted like the per-topic branch so both views receive their rows
        # in the same shortPubDate order.
        histogram = createHistogram(
            df.sort_values(by='shortPubDate'),
            "Document Count by Month - All Topics"
        )
        return histogram, "", defaultImageSrc

    # The clicked document is identified by its hover text (the title).
    points = clickData.get('points') or []
    clickTitle = points[0].get('hovertext') if points else None
    matchingTopics = df.loc[df['title'] == clickTitle, 'topicName']
    if matchingTopics.empty:
        # Nothing to look up: keep whatever is currently displayed.
        return dash.no_update, dash.no_update, dash.no_update

    # If the same title appears under several topics, the first match wins.
    topicName = matchingTopics.iat[0]
    filtered_df = df[df['topicName'] == topicName]

    histogram = createHistogram(
        filtered_df.sort_values(by='shortPubDate'),
        f"Document Count by Month - {topicName}"
    )

    # Render the cloud to an in-memory PNG and embed it as a data URI.
    wc_image = io.BytesIO()
    createWordCloud(topicName).to_image().save(wc_image, format='PNG')
    wcSrc = 'data:image/png;base64,{}'.format(base64.b64encode(wc_image.getvalue()).decode())
    wcTitle = f"Word Cloud - {topicName}"

    return histogram, wcTitle, wcSrc

if __name__ == '__main__':
    # Dash.run supersedes Dash.run_server; prefer it when the installed Dash
    # provides it and fall back to run_server on older releases.
    _startServer = app.run if hasattr(app, 'run') else app.run_server
    _startServer(debug=False)