In [1]:
import os, sys, logging, time, requests
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from nltk.corpus import stopwords
import nltk

# Downloading NLTK stopwords
# Runs at module import time; must precede the stopwords.words() call below,
# which reads the corpus fetched here.
nltk.download('stopwords')

# GLOBAL CONFIGURATION
# English stopwords from NLTK, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))
# Extra lowercase terms that are filtered out alongside the NLTK stopwords
# (tokens are lowercased before being compared against this set).
extra_stopwords = {"via", "vyopta"}

# Logger Configuration
class LoggerConfig:
    """Configure-once logging setup shared by the whole process.

    The first construction stores the log file path and level and calls
    ``logging.basicConfig`` with a file handler and a stream handler.
    Every later construction returns that same instance and ignores the
    arguments it was given.
    """

    _instance = None

    def __new__(cls, log_file_path='logs/app.log', log_level=logging.INFO):
        """Return the shared instance, creating and configuring it if needed.

        Args:
            log_file_path: File the log is written to (opened in ``'w'`` mode).
            log_level: Level passed to ``logging.basicConfig``.
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            instance.log_file_path = log_file_path
            instance.log_level = log_level
            instance.configure_logging()
            # Publish the instance only after configuration succeeded, so a
            # failed setup is retried instead of leaving a half-built
            # singleton behind.
            cls._instance = instance
        return cls._instance

    def configure_logging(self):
        """Create the log directory if required and call ``basicConfig``."""
        log_dir = os.path.dirname(self.log_file_path)
        # dirname() is '' for a bare filename such as 'app.log', and
        # os.makedirs('') raises, so only create a directory when one is named.
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        logging.basicConfig(
            level=self.log_level,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(self.log_file_path, 'w', 'utf-8'),
                logging.StreamHandler()
            ]
        )

    @staticmethod
    def from_environment(default_path='logs/app.log'):
        """Build the configuration from ``LOG_LEVEL`` and ``LOG_FILE``.

        Args:
            default_path: Log file used when ``LOG_FILE`` is not set.

        Returns:
            The shared ``LoggerConfig`` instance.
        """
        level_name = os.getenv('LOG_LEVEL', 'INFO').upper()
        log_file = os.getenv('LOG_FILE', default_path)
        level = getattr(logging, level_name, logging.INFO)
        # The lookup can hit non-level attributes (e.g. the string
        # logging.BASIC_FORMAT); fall back to INFO for anything not numeric.
        if not isinstance(level, int):
            level = logging.INFO
        return LoggerConfig(log_file, level)

# Zendesk API Connector
class ZendeskAPIConnector:
    """Paginated reader for Zendesk Help Center and community endpoints.

    Attributes set in ``__init__``:
        base_url: API root with any trailing slash removed.
        api_token: Sent as a ``Bearer`` token when non-empty.
        api_call_count: Number of pages that were fetched and parsed
            successfully.
        page_size: Value of the ``per_page`` query parameter.
        retry_limit: Consecutive request failures tolerated per resource.
        request_timeout: Seconds allowed for each HTTP request.
        checkpoints: Maps an endpoint to the next page URL still to fetch.
    """

    def __init__(self, base_url='https://support.vyopta.com/api/v2', api_token=''):
        self.base_url = base_url.rstrip('/')
        self.api_token = api_token
        self.api_call_count = 0
        self.page_size = 100
        self.retry_limit = 5
        # Without a timeout a stalled connection would block the script forever.
        self.request_timeout = 30
        self.checkpoints = {}
        LoggerConfig.from_environment()
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _retry_after_seconds(response, default=60):
        """Return the ``Retry-After`` delay in whole seconds.

        Falls back to ``default`` when the header is missing or is not an
        integer (the header may also carry an HTTP date), and never returns
        a negative value.
        """
        try:
            return max(0, int(response.headers.get("Retry-After", default)))
        except (TypeError, ValueError):
            return default

    def _fetch_resource(self, endpoint, resource_description):
        """Collect every item from a paginated endpoint.

        Args:
            endpoint: Path appended to ``base_url`` (e.g. ``/community/topics.json``).
            resource_description: Human-readable name used in log messages.

        Returns:
            A list with the items found under the first of the keys
            ``sections``, ``articles``, ``topics`` or ``posts`` on each page.
            Fetching stops early when a page has none of these keys or when
            ``retry_limit`` consecutive failures have been retried.

        The next page URL is stored in ``checkpoints[endpoint]`` after each
        page. Once an endpoint is exhausted the stored value is ``None``, so
        calling this method again for the same endpoint returns ``[]``.
        """
        results = []
        page_url = self.checkpoints.get(endpoint, f"{self.base_url}{endpoint}?per_page={self.page_size}")
        retries = 0
        headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}

        while page_url:
            try:
                response = requests.get(page_url, headers=headers, timeout=self.request_timeout)

                if response.status_code == 429:
                    wait_time = self._retry_after_seconds(response)
                    self.logger.info(f"Rate limit reached. Waiting for {wait_time} seconds.")
                    time.sleep(wait_time)
                    continue

                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # decode error is not a RequestException subclass.
                self.logger.error(f"Error fetching {resource_description}: {e}")
                if retries < self.retry_limit:
                    retries += 1
                    # Exponential backoff, capped at two minutes.
                    time.sleep(min(2 ** retries, 120))
                else:
                    self.logger.error(f"Max retries exceeded for {resource_description}.")
                    page_url = None
                    retries = 0
                continue

            for key in ['sections', 'articles', 'topics', 'posts']:
                if key in data:
                    results.extend(data[key])
                    page_url = data.get('next_page', None)
                    self.checkpoints[endpoint] = page_url
                    break
            else:
                self.logger.info(f"No expected data keys found in the response for {resource_description}.")
                break

            self.api_call_count += 1
            retries = 0

        return results

    def fetch_data(self):
        """Fetch help-center articles plus the posts of every community topic.

        Returns:
            One list containing the articles followed by the posts of each
            topic, in the order the topics were returned.
        """
        self.logger.info("Fetching data from Zendesk API.")
        documents = self._fetch_resource('/help_center/en-us/articles.json', "public documentation")
        topics = self._fetch_resource('/community/topics.json', "community topics")
        # Copy so that extending the combined list does not mutate ``documents``.
        all_documents = list(documents)

        for topic in topics:
            topic_id = topic.get("id")
            topic_posts = self._fetch_resource(f'/community/topics/{topic_id}/posts.json', f"posts for topic {topic_id}")
            all_documents.extend(topic_posts)

        return all_documents

# Document Processor with Advanced NLP
class DocumentProcessor:
    """Turn document titles into sorted permuted-index (KWIC) entries.

    Titles are tokenized with a transformer NER pipeline, scored with a
    sentiment pipeline (logged only), and every occurrence of each keyword
    is recorded together with the text before and after it.
    """

    def __init__(self, script_dir):
        """Load the NER and sentiment pipelines.

        Args:
            script_dir: Directory used to build the default log file path.
        """
        self.script_dir = script_dir
        # The default must be a file, not the directory itself; a bare
        # directory cannot be opened by the file log handler.
        LoggerConfig.from_environment(default_path=os.path.join(script_dir, 'docs_permuted_index.log'))
        self.logger = logging.getLogger(self.__class__.__name__)

        # Initialize transformer models
        self.tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        self.sentiment_analyzer = pipeline("sentiment-analysis")
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.tokenizer)

    def advanced_tokenize(self, text):
        """Return the NER pipeline's words for ``text`` after merging and filtering.

        Consecutive words matching the fragments ``ana``/``lyt``/``lytics``
        (case-insensitive) are concatenated into one token. Purely numeric
        tokens, NLTK stopwords and ``extra_stopwords`` are removed.
        """
        fragments = ("ana", "lyt", "lytics")
        tokens = []
        merged_word = ""

        for entity in self.ner_pipeline(text):
            word = entity['word']

            # Accumulate parts of a split word
            if word.lower() in fragments:
                merged_word += word
                continue

            # A non-fragment ends the current run: flush the merged word and
            # keep the word that ended it (it must not be dropped).
            if merged_word:
                tokens.append(merged_word)
                merged_word = ""
            tokens.append(word)

        # Append any remaining merged word
        if merged_word:
            tokens.append(merged_word)

        # Filter out numeric tokens, stopwords, and extra stopwords
        return [
            token for token in tokens
            if not token.isdigit()
            and token.lower() not in stop_words
            and token.lower() not in extra_stopwords
        ]

    def sentiment_analysis(self, text):
        """Return ``(label, score)`` of the first sentiment result for ``text``."""
        sentiment = self.sentiment_analyzer(text)[0]
        return sentiment['label'], sentiment['score']

    def simple_search(self, text, pattern):
        """Return start indices of non-overlapping, case-insensitive matches.

        An empty ``pattern`` yields ``[]``; searching for it would otherwise
        never advance.
        """
        if not pattern:
            return []

        # Lowercase once instead of on every loop iteration.
        haystack = text.lower()
        needle = pattern.lower()
        positions = []
        start = haystack.find(needle)
        while start != -1:
            positions.append(start)
            start = haystack.find(needle, start + len(needle))
        return positions

    def process_documents(self, documents):
        """Build the permuted index for ``documents``.

        Args:
            documents: Iterable of dicts read via the keys ``title``, ``id``
                and ``html_url``.

        Returns:
            ``(entries, document_index)`` where each entry is
            ``(keyword_lower, part_before, keyword, part_after, doc_id)``,
            sorted by keyword and then by the text before it, and
            ``document_index`` maps a document id to its title and URL.
            Returns ``([], {})`` when ``documents`` is empty.
        """
        if not documents:
            self.logger.info("No documents to process.")
            return [], {}

        document_index = {}
        permuted_titles_with_ids = []

        for doc in documents:
            # ``or ''`` also covers a title that is present but None.
            title = (doc.get('title') or '').strip()
            doc_id = doc.get('id')
            url = doc.get('html_url', '#')
            document_index[doc_id] = {'title': title, 'url': url}

            try:
                # Tokenization using NER
                tokens = self.advanced_tokenize(title)
                # Sentiment analysis
                sentiment_label, sentiment_score = self.sentiment_analysis(title)
                self.logger.info(f"Sentiment for '{title}': {sentiment_label} ({sentiment_score:.2f})")
            except Exception as e:
                self.logger.error(f"Tokenization or Sentiment Analysis failed: {str(e)}")
                continue

            # The search is case-insensitive, so de-duplicate keywords by their
            # lowercase form; otherwise "Data" and "data" would each emit the
            # same rows.
            seen = set()
            for word in tokens:
                word_lower = word.lower()
                if word_lower in seen or word_lower in stop_words or word_lower in extra_stopwords:
                    continue
                seen.add(word_lower)

                for start in self.simple_search(title, word):
                    end = start + len(word)
                    part_before = title[:start].strip()
                    part_after = title[end:].strip()
                    permuted_titles_with_ids.append((word_lower, part_before, word, part_after, doc_id))

        # Numeric keywords sort first (by value), then the rest alphabetically.
        permuted_titles_with_ids.sort(key=lambda entry: (0, int(entry[0])) if entry[0].isdigit() else (1, entry[0].lower(), entry[1]))

        self.logger.info("Document processing complete with advanced NLP.")
        return permuted_titles_with_ids, document_index

# HTML Generator Class
class HTMLGenerator:
    """Render permuted-index entries as a Bootstrap-styled HTML table."""

    def __init__(self, script_dir):
        """Store the output directory and set up logging.

        Args:
            script_dir: Directory the HTML file is written to.
        """
        self.script_dir = script_dir
        # The default must be a file, not the directory itself; a bare
        # directory cannot be opened by the file log handler.
        LoggerConfig.from_environment(default_path=os.path.join(script_dir, 'docs_permuted_index.log'))
        self.logger = logging.getLogger(self.__class__.__name__)

    def generate_part_html(self, permuted_titles, index):
        """Return the ``<tr>`` rows for a slice of permuted entries.

        Args:
            permuted_titles: Iterable of
                ``(keyword, part_before, word, part_after, doc_id)`` tuples.
            index: Maps ``doc_id`` to a dict with ``title`` and ``url``;
                missing ids fall back to ``'No title'`` and ``'#'``.

        All interpolated values are HTML-escaped (including quotes), because
        titles and URLs come from an external API.
        """
        import html  # local import keeps this block self-contained

        def esc(value):
            return html.escape(str(value), quote=True)

        fallback = {'title': 'No title', 'url': '#'}
        rows = []
        for keyword, part_before, _, part_after, doc_id in permuted_titles:
            article_details = index.get(doc_id, fallback)
            rows.append(
                f"<tr><td class='text-right'>{esc(part_before)}</td>"
                f"<td class='keyword text-center'>{esc(keyword)}</td>"
                f"<td class='text-left'>{esc(part_after)}</td>"
                f"<td class='text-left'><a href='{esc(article_details['url'])}' target='_blank'>"
                f"{esc(article_details['title'])}</a></td></tr>\n"
            )
        return "".join(rows)

    def start_html(self):
        """Return the document head and the opening of the table body."""
        return """
                <!DOCTYPE html>
                <html lang="en">
                <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Permuted Index of Vyopta Documentation</title>
                <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
                </head>
                <body>
                <div class="container mt-4">
                <h2 class="text-center mb-4">Permuted Index of Vyopta Documentation</h2>
                <table class="table table-bordered">
                <thead>
                <tr><th>Part before</th><th>Keyword</th><th>Part after</th><th>Link</th></tr>
                </thead>
                <tbody>
                """

    def end_html(self):
        """Return the closing tags matching ``start_html``."""
        return "</tbody></table></div></body></html>"

    def generate_and_save_html(self, permuted_titles_with_ids, index):
        """Write ``docs_permuted_index.html`` into ``script_dir``.

        An existing file is removed first. If the file cannot be written the
        error is logged and the process exits with status 1.
        """
        filename = 'docs_permuted_index.html'
        path = os.path.join(self.script_dir, filename)
        if os.path.exists(path):
            os.remove(path)
            self.logger.info(f"Existing HTML file cleared at {path}")

        # Render in slices of 100 entries and join once at the end.
        chunk_size = 100
        pieces = [self.start_html()]
        for offset in range(0, len(permuted_titles_with_ids), chunk_size):
            part = permuted_titles_with_ids[offset:offset + chunk_size]
            pieces.append(self.generate_part_html(part, index))
        pieces.append(self.end_html())
        html_content = "".join(pieces)

        try:
            with open(path, 'w', encoding='utf-8') as file:
                file.write(html_content)
            self.logger.info(f"HTML document saved at {path}")
        except IOError as e:
            self.logger.error(f"Failed to save HTML document: {str(e)}")
            sys.exit(1)

# Main Script
def main():
    """Run the pipeline: fetch from Zendesk, build the index, write the HTML.

    Logging is configured with a default log file in the current working
    directory. The function returns early (after logging) when no documents
    are retrieved or no index entries are produced. Any exception raised
    inside the pipeline is logged with its traceback and the process exits
    with status 1.
    """
    working_dir = os.getcwd()
    log_path = os.path.join(working_dir, 'docs_permuted_index.log')

    LoggerConfig.from_environment(default_path=log_path)
    logger = logging.getLogger(__name__)

    try:
        logger.info("Starting script execution...")

        # Step 1: pull articles and community posts.
        connector = ZendeskAPIConnector()
        fetched_docs = connector.fetch_data()
        logger.info(f"Total API calls made: {connector.api_call_count}")

        if not fetched_docs:
            logger.info("No documents retrieved from Zendesk. Exiting script.")
            return

        # Step 2: derive the permuted entries and the id -> title/url index.
        processor = DocumentProcessor(working_dir)
        entries, doc_index = processor.process_documents(fetched_docs)

        if not entries:
            logger.info("No permuted titles were generated. Exiting script.")
            return

        # Step 3: render and save the HTML page.
        writer = HTMLGenerator(working_dir)
        writer.generate_and_save_html(entries, doc_index)

        logger.info("SCRIPT EXECUTION ENDED SUCCESSFULLY!")
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    main()





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-10-22 14:29:59,118 - INFO - Starting script execution...
2024-10-22 14:29:59,119 - INFO - Fetching data from Zendesk API.
2024-10-22 14:30:11,061 - INFO - Total API calls made: 13
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

2024-10-22 14:30:24,491 - INFO - Sentiment for 'Export Reports to Collector': NEGATIVE (0.78)
2024-10-22 14:30:24,658 - INFO - Sentiment for 'Office365 (O365) Integration': POSITIVE (0.95)
2024-10-22 14:30:24,825 - INFO - Sentiment for 'Microsoft Exchange Integration for Workspace Insights': POSITIVE (0.99)
2024-10-22 14:30:25,001 - INFO - Sentiment for 'Upgrading your data collector that is using a AD service account to monitor infrastructure': NEGATIVE (1.00)
2024-10-22 14:30:25,175 - INFO - Sentiment for 'How to download the latest version of the Vyopta Data Collector': NEGATIVE (0.91)
2024-10-22 14:30:25,420 - INFO - Sentiment for 'Log4J2 CVE-2021-44228 Vulnerability - UPDATED 12/21/2021': NEGATIVE (0.97)
2024-10-22 14:30:25,627 - INFO - Sentiment for 'Vyopta Collector 4.8.1 / 4.8.2 / 4.8.3 - Release Notes': NEGATIVE (0.83)
2024-10-22 14:30:25,818 - INFO - Sentiment for 'Identifying meetings with the most participants and/or the longest durations in dashboard': NEGATIVE (0.99)
2024

2024-10-22 14:30:36,499 - INFO - Sentiment for 'Interpreting call data shown in the infrastructure Tab in Tech Insights Monitoring': NEGATIVE (0.98)
2024-10-22 14:30:36,686 - INFO - Sentiment for 'Validate button missing in Vyopta Admin portal (collector version 4 and above)': NEGATIVE (1.00)
2024-10-22 14:30:36,867 - INFO - Sentiment for 'How to export list of managed endpoints from Tech Insights Analytics': POSITIVE (0.98)
2024-10-22 14:30:37,034 - INFO - Sentiment for 'Enable Automatic Upgrades of the Vyopta Collector': POSITIVE (0.99)
2024-10-22 14:30:37,229 - INFO - Sentiment for 'Trending and Metrics for Percentage/Rate-based Data in Tech Insights Analytics': NEGATIVE (0.94)
2024-10-22 14:30:37,384 - INFO - Sentiment for 'Vyopta for UCaaS': POSITIVE (0.83)
2024-10-22 14:30:37,563 - INFO - Sentiment for 'Poly RealPresence Resource Manager (RPRM) Integration': NEGATIVE (0.73)
2024-10-22 14:30:37,735 - INFO - Sentiment for 'Polycom RealPresence Collaboration Server (RMX) Integration

2024-10-22 14:30:48,150 - INFO - Sentiment for 'Same-day Call and Meeting Reporting in Tech Insights Monitoring Using Improved Panel Filters': NEGATIVE (0.97)
2024-10-22 14:30:48,309 - INFO - Sentiment for 'Cisco Call Manager (CUCM) Setup': NEGATIVE (0.97)
2024-10-22 14:30:48,486 - INFO - Sentiment for 'How to resolve error - The HTTP Feedback for slot X can not be delivered': NEGATIVE (1.00)
2024-10-22 14:30:48,671 - INFO - Sentiment for 'Can we have a second instance of Vyopta Data Collector as backup?': NEGATIVE (1.00)
2024-10-22 14:30:48,848 - INFO - Sentiment for 'Cisco Telepresence Management Suite (TMS) Integration': POSITIVE (0.90)
2024-10-22 14:30:49,002 - INFO - Sentiment for 'Pexip Infinity Integration': POSITIVE (0.95)
2024-10-22 14:30:49,175 - INFO - Sentiment for 'Can I Use Vyopta to Monitor my Entire  Infrastructure?': NEGATIVE (1.00)
2024-10-22 14:30:49,363 - INFO - Sentiment for 'Tech Insights Analytics and Rolled Back Tech Insights Monitoring do not Match. Why?': NEGA

2024-10-22 14:31:00,250 - INFO - Sentiment for 'CUCM - unknown error: No files present in CDR Repository': NEGATIVE (1.00)
2024-10-22 14:31:00,419 - INFO - Sentiment for 'How Vyopta Calculates Endpoint Utilisation': POSITIVE (1.00)
2024-10-22 14:31:00,578 - INFO - Sentiment for 'Running the data collector using a service account': NEGATIVE (0.99)
2024-10-22 14:31:00,758 - INFO - Sentiment for 'What does Vyopta mean when it says a system is "down?"': NEGATIVE (1.00)
2024-10-22 14:31:00,938 - INFO - Sentiment for 'I want to know when my systems are not sending data to Vyopta': NEGATIVE (0.99)
2024-10-22 14:31:01,106 - INFO - Sentiment for 'How Can I Access the Vyopta Knowledge Base?': NEGATIVE (1.00)
2024-10-22 14:31:01,290 - INFO - Sentiment for 'Vyopta Tech Insights Data Collector - 3.6.0.938': NEGATIVE (0.94)
2024-10-22 14:31:01,472 - INFO - Sentiment for 'Vyopta Tech Insights Data Collector - Version 3.6.0.959': NEGATIVE (0.97)
2024-10-22 14:31:01,644 - INFO - Sentiment for 'An Appro

2024-10-22 14:31:11,854 - INFO - Sentiment for 'Tech Insights Analytics Executive Dashboard': POSITIVE (0.94)
2024-10-22 14:31:12,010 - INFO - Sentiment for 'Infrastructure Network Requirements': POSITIVE (0.54)
2024-10-22 14:31:12,179 - INFO - Sentiment for 'Tech Insights Analytics Tagging Dashboard': POSITIVE (0.77)
2024-10-22 14:31:12,337 - INFO - Sentiment for 'Endpoint Monitoring Port Requirements': NEGATIVE (0.97)
2024-10-22 14:31:12,502 - INFO - Sentiment for 'Login to the Vyopta Admin Portal': NEGATIVE (0.99)
2024-10-22 14:31:12,668 - INFO - Sentiment for 'Tech Insights Analytics Issues Dashboard': NEGATIVE (0.98)
2024-10-22 14:31:12,819 - INFO - Sentiment for 'Infrastructure List': POSITIVE (0.55)
2024-10-22 14:31:12,990 - INFO - Sentiment for 'Tech Insights Analytics UCaas Overall Dashboard': POSITIVE (0.55)
2024-10-22 14:31:13,152 - INFO - Sentiment for 'VSI Optimizing Spaces Dashboard': POSITIVE (0.95)
2024-10-22 14:31:13,312 - INFO - Sentiment for 'Installing the Vyopta Co