### Objective

In this notebook, we build a simple demo that showcases the concept of a role-playing dual-chatbot system for digesting ABB Review articles.

### 1. Import libraries

In [16]:
from chatbot import JournalistBot, AuthorBot
from embedding_engine import Embedder
from topic_classifier import TopicClassifier
import utilities
from pdf2image import convert_from_path
import PyPDF2
import os

import ipywidgets as widgets
from IPython.display import display, HTML, IFrame

We consider four issues of ABB Review.

In [2]:
# Issue label (as shown in the UI) -> PDF filename inside the ./papers folder.
# Built from ordered pairs so the label order is explicit.
filenames = dict([
    ('ABB Review 2022-03', 'ABB Review_03_2022_layout complete_EN_72-200dpi.pdf'),
    ('ABB Review 2023-01', 'ABB Review_01_2023_layout complete_EN_72-300dpi.pdf'),
    ('ABB Review 2023-02', 'ABB Review_02_2023_layout complete_EN_72-300dpi.pdf'),
    ('ABB Review 2023-03', 'ABB Review_03_2023_layout complete_EN_300dpi.pdf'),
])

### 2. User journey

We instantiate an empty dictionary to store user selections.

In [3]:
# Shared record of the user's choices; the widget callbacks below fill in
# "selected_journal" and "selected_article". Re-running this cell resets it.
user_journey = dict()

#### 2.1 User selects journal issue 

In [4]:
# Offer exactly the issues that have a PDF registered in `filenames`, so the
# option list cannot drift out of sync with the label -> filename mapping.
journal_issues = list(filenames)

# Radio buttons for picking an issue; value=None means nothing is pre-selected
issue_buttons = widgets.RadioButtons(
    options=journal_issues,
    value=None,
    description='',
    disabled=False
)

# Output area that shows the cover page of the selected issue
output = widgets.Output()

# Display UI elements: a bold heading, then the buttons next to the cover image
display(HTML('<span style="font-weight:bold; font-size:18px;">Journal Issues:</span>'))
display(widgets.HBox([issue_buttons, output]))

def show_journal_cover(journal_name):
    """Render the cover (first page) of the given issue into the `output` widget.

    Args:
        journal_name: Issue label; must be a key of `filenames`.
    """
    cover_pdf = os.path.join("./papers", filenames[journal_name])

    # Rasterise only page 1; size=(300, None) requests a width of 300 with the
    # height left unspecified.
    rendered_pages = convert_from_path(
        cover_pdf,
        first_page=1,
        last_page=1,
        size=(300, None),
    )

    # Drop whatever cover was shown before, then show the new one
    output.clear_output()
    with output:
        display(rendered_pages[0])

def on_value_change(change):
    """Record the newly selected issue and show its cover.

    Args:
        change: Change notification from the radio buttons; its 'new' entry
            holds the selected issue label.
    """
    chosen_label = change['new']

    # Ignore anything that is not a known issue label
    if chosen_label not in filenames:
        return

    user_journey["selected_journal"] = filenames[chosen_label]
    show_journal_cover(chosen_label)

# React whenever the radio buttons' value changes
issue_buttons.observe(on_value_change, names='value')

HBox(children=(RadioButtons(options=('ABB Review 2022-03', 'ABB Review 2023-01', 'ABB Review 2023-02', 'ABB Re…

#### 2.2 User selects article

We first extract articles in the user-selected issue.

In [5]:
# The issue is chosen interactively in section 2.1. Fail with an actionable
# message instead of a bare KeyError when this cell runs before a selection
# has been made (e.g. on "Restart & Run All").
if "selected_journal" not in user_journey:
    raise RuntimeError(
        "No journal issue selected yet: pick one in section 2.1, then re-run this cell."
    )

# Articles of the selected issue; the cells below read each entry's
# 'start_page', 'title', 'subtitle' and 'length'.
articles = utilities.extract_articles(user_journey["selected_journal"])
print(articles)

[{'start_page': 5, 'title': 'Editorial', 'subtitle': '', 'length': 5}, {'start_page': 10, 'title': 'For greater results', 'subtitle': 'High Speed Alignment – visual servoing technology for ultra- high precision assembly', 'length': 8}, {'start_page': 18, 'title': 'The right moves', 'subtitle': 'Software that optimizes robot performance', 'length': 6}, {'start_page': 24, 'title': 'The DCS of tomorrow', 'subtitle': 'Envisioning the future of process automation', 'length': 6}, {'start_page': 30, 'title': 'Safe cyber space', 'subtitle': 'ABB Ability™ Cyber Security Workplace', 'length': 6}, {'start_page': 36, 'title': 'The virtues of  virtualization', 'subtitle': 'Virtual protection and control for medium-  voltage substations', 'length': 6}, {'start_page': 42, 'title': 'Health monitor', 'subtitle': 'Better service decisions with ABB Ability™ Smart Master', 'length': 4}, {'start_page': 46, 'title': 'Access is everything', 'subtitle': 'Opening the door to a world of information', 'length': 

We then present the list of articles for the user to choose from.

In [6]:
# One grouped widget (title button + subtitle label) per article
article_widgets = []

def toggle_handler(change, article, button):
    """Store the article whose button was switched on and switch the others off.

    Args:
        change: Change notification from a toggle button; 'new' is its new value.
        article: The article entry tied to `button`.
        button: The toggle button that emitted the change.
    """
    # Switching a button off (by the user or by the loop below) is ignored
    if not change['new']:
        return

    user_journey["selected_article"] = article

    # Keep the selection exclusive: release every button except the active one
    for candidate in all_buttons:
        if candidate != button:
            candidate.value = False

# Every toggle button created below; toggle_handler walks this list to keep
# the selection exclusive
all_buttons = []

for article in articles:
    # Full-width-ish button carrying the article title
    title_button = widgets.ToggleButton(
        description=article['title'],
        tooltip=article['title'],
        value=False,
        button_style='',
        layout=widgets.Layout(width='90%'),
    )

    # Subtitle under the button, indented and followed by a bottom margin
    subtitle_label = widgets.Label(
        layout=widgets.Layout(margin='0 0 20px 25px'),
        value=article['subtitle'],
    )

    # Default arguments freeze this iteration's article and button for the callback
    def _on_toggle(change, article=article, button=title_button):
        toggle_handler(change, article, button)

    title_button.observe(_on_toggle, names='value')
    all_buttons.append(title_button)

    # Stack button and subtitle vertically and collect the pair
    group = widgets.VBox([title_button, subtitle_label])
    article_widgets.append(group)

# Arrange the groups in two equally wide columns
two_column_layout = widgets.Layout(grid_template_columns="repeat(2, 50%)")
grid = widgets.GridBox(children=article_widgets, layout=two_column_layout)

# Heading shown above the grid
title_label = widgets.HTML(
    value='<span style="font-size: 18px; font-weight: bold;">List of Articles:</span>',
    layout=widgets.Layout(margin='0 0 10px 0'),
)

display(title_label, grid)

HTML(value='<span style="font-size: 18px; font-weight: bold;">List of Articles:</span>', layout=Layout(margin=…

GridBox(children=(VBox(children=(ToggleButton(value=False, description='Editorial', layout=Layout(width='90%')…

We can also present the original PDF to the user.

In [31]:
def split_pdf(input_pdf_path, output_pdf_path, start_page, end_page):
    """Extract an inclusive, 1-based range of pages from a PDF into a new PDF.

    Args:
        input_pdf_path: Path of the source PDF.
        output_pdf_path: Path the extracted pages are written to.
        start_page: First page to copy (1-based).
        end_page: Last page to copy (1-based, inclusive).

    Raises:
        ValueError: If `start_page` is smaller than 1 or `end_page` is smaller
            than `start_page`.
        IndexError: If `end_page` lies beyond the last page of the source PDF.
    """
    # Validate before touching any file. A start page below 1 would turn into
    # a negative index and silently copy pages from the END of the document,
    # and a reversed range would silently write an empty PDF.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")
    if end_page < start_page:
        raise ValueError(
            f"end_page ({end_page}) must not be smaller than start_page ({start_page})"
        )

    with open(input_pdf_path, 'rb') as source_pdf:
        pdf_reader = PyPDF2.PdfReader(source_pdf)

        # Report an overshooting range clearly, before anything is written
        total_pages = len(pdf_reader.pages)
        if end_page > total_pages:
            raise IndexError(
                f"end_page ({end_page}) exceeds the number of pages "
                f"in the source PDF ({total_pages})"
            )

        pdf_writer = PyPDF2.PdfWriter()
        for page_num in range(start_page, end_page + 1):
            # Page numbers are 1-based, the reader's page list is 0-based
            pdf_writer.add_page(pdf_reader.pages[page_num - 1])

        # Write while the source file is still open, since the copied pages
        # may still read from it
        with open(output_pdf_path, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)

In [34]:
# Source issue and the selected article's page span (inclusive on both ends,
# which is how split_pdf interprets its range)
source_pdf_path = "./papers/" + user_journey['selected_journal']
chosen_article = user_journey['selected_article']
article_first_page = chosen_article['start_page']
article_last_page = article_first_page + chosen_article['length'] - 1

# Copy just the article's pages into a temporary PDF
split_pdf(
    input_pdf_path=source_pdf_path,
    output_pdf_path="./papers/temp.pdf",
    start_page=article_first_page,
    end_page=article_last_page,
)

# Embed the temporary PDF in the notebook
display(IFrame("./papers/temp.pdf", width=1000, height=600))

#### 2.3 Topic classification

Once the article is selected, we can run a topic classification to determine the relevant topics.

In [8]:
# One row per focal point: (topic label, short description, target audience).
# The topic labels are what gets handed to the topic classifier.
_focal_point_rows = [
    ('Tech and product insights',
     'Spotlight on new tech and product',
     'R&D engineers'),
    ('Market dynamics',
     'Explore the market implications',
     'marketing professionals'),
    ('Operational transformation',
     'Insights on optimized processes and operations',
     'operational experts & managers'),
    ('Sustainability initiatives',
     "ABB's contributions to environmental sustainability",
     'sustainability officers'),
    ('Customer experience',
     "Dive into the end-user benefits and experiences",
     'customers'),
    ('Industry challenges and opportunities',
     "Peering into hurdles and growth areas",
     'business developers'),
    ('Strategic collaborations',
     "Highlighting strategic partnerships",
     'partnership managers'),
    ('Strategy innovation',
     "Unpacking ABB's approaches to business strategies",
     'executives'),
    ('General overview',
     "A holistic breakdown of the article's key themes",
     'general public'),
]

# topic label -> {'description': ..., 'target audience': ...}
focal_points = {
    topic: {'description': description, 'target audience': audience}
    for topic, description, audience in _focal_point_rows
}

In [9]:
# Pull the selected issue and the article's inclusive page span from the user journey
issue_name = user_journey['selected_journal']
selected_article = user_journey['selected_article']
start_page = selected_article['start_page']
end_page = start_page + selected_article['length'] - 1

# Set up the classifier for that issue and page span
clf = TopicClassifier(issue=issue_name, page_num=[start_page, end_page])

# Classify the article against every focal-point topic label
relevant_topics = clf.classifier(topic_list=list(focal_points), verbose=True)
print(relevant_topics)

Processing 1/7th docs.
Processing 3/7th docs.
Processing 5/7th docs.
Processing 7/7th docs.


In [10]:
# Show the classifier result (assigned to `relevant_topics` in the previous cell) on its own
print(relevant_topics)

{'Sustainability initiatives': 6, 'Industry challenges and opportunities': 5, 'Tech and product insights': 4, 'Strategic collaborations': 3, 'Strategy innovation': 3, 'Market dynamics': 2, 'Operational transformation': 0, 'Customer experience': 0}
