# Install Packages

In [1]:
!pip install PyMuPDF pandas spacy azure-ai-textanalytics azure-core
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.2 MB/s eta 0:00:11
     --- ------------------------------------ 1.0/12.8 MB 1.5 MB/s eta 0:00:08
     ---- ----------------------------------- 1.3/12.8 MB 1.6 MB/s eta 0:00:08
     ----- ---------------------------------- 1.8/12.8 MB 1.8 MB/s eta 0:00:07
     ------- -------------------------------- 2.4/12.8 MB 1.9 MB/s eta 0:00:06
     --------- ------------------------------ 2.9/12.8 MB 2.0 MB/s eta 0:00:05
     ----------- ---------------------------- 3.7/12.8 MB 2.2 MB/s eta 0:00:05
     ------------- -------------------------- 4.2/12.8 

# Imports

In [17]:
import fitz  # PyMuPDF - used to open and interact with PDF files for reading, extracting, and processing content.
import pandas as pd  # For creating and manipulating DataFrames, useful for handling structured data in tabular form.
import spacy  # NLP library used for tokenization, named entity recognition, and other linguistic features like parsing.
import re  # Regular expressions for text preprocessing, used for searching, cleaning, and manipulating strings.
import unicodedata  # For handling Unicode normalization, useful for dealing with special characters in text processing.
import os  # Provides functions for interacting with the operating system, such as file and directory manipulation.
import json  # For working with JSON data, useful for reading and writing structured data in JSON format.
import typing  # For adding type hints to your code, improving code readability and type safety.
import random  # Used for generating random numbers or selecting random elements, useful for tasks like random sampling.
from azure.core.credentials import AzureKeyCredential  # Provides a secure way to manage and pass API keys for authentication.
from azure.ai.textanalytics import TextAnalyticsClient  # Azure Text Analytics client, used for processing text via Azure Cognitive Services (e.g., detecting PII, sentiment analysis, etc.).

# Load Specialized NLP Library

In [18]:
# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Set Pandas Display Option

In [19]:
# Set pandas options to enable word wrapping
pd.set_option('display.max_colwidth', None)  # This ensures that the entire content of each column is displayed
pd.set_option('display.width', 1000)  # This increases the width of the display to prevent truncation

### Function to pre process text and clean up strings

In [20]:
def preprocess_text(text):
    """
    Preprocess the input text to make it cleaner and more uniform.

    Steps:
    1. Normalize the text to remove special characters.
    2. Remove newline characters and replace them with a space.
    3. Remove carriage returns and replace them with a space.
    4. Remove extra spaces, tabs, and control characters.
    5. Remove non-ASCII characters to avoid issues with special characters.
    6. Optionally, handle specific punctuation or unwanted characters.

    Parameters:
    text (str): The input text to preprocess.

    Returns:
    str: The cleaned and normalized text.
    """

    # Normalize text to remove special characters and accents
    text = unicodedata.normalize("NFKD", text)

    # Remove newline characters, carriage returns, and tabs, and replace with a single space
    text = re.sub(r"[\n\r\t]+", " ", text)

    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)

    # Remove non-ASCII characters (optional, depends on the use case)
    text = text.encode("ascii", "ignore").decode("ascii")

    # Remove control characters (e.g., '\x0c')
    text = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", text)

    # Optionally, remove or replace unwanted punctuation (e.g., dashes, quotes)
    text = re.sub(r"[“”‘’]", '"', text)  # Replace fancy quotes with standard quotes
    text = re.sub(r"[—–]", "-", text)  # Replace fancy dashes with a standard dash
    text = re.sub(r"[\[\]{}<>]", "", text)  # Remove brackets or angle brackets

    # Trim leading and trailing whitespace
    text = text.strip()

    return text

### Function to break pages into chunks

In [21]:
def chunk_sentences(text, chunk_size):
    """
    Splits the input text into chunks, where each chunk contains a specified number of sentences.

    Args:
        text (str): The input text to be chunked.
        chunk_size (int): The number of sentences per chunk.

    Returns:
        list: A list of text chunks, where each chunk is a string containing 'chunk_size' sentences.
    """
    # Process the text with spaCy to detect sentences
    doc = nlp(text)
    chunks = []
    current_chunk = []

    # Loop through each sentence and build chunks based on sentence count
    for i, sentence in enumerate(doc.sents):
        current_chunk.append(sentence.text)

        # Once the chunk contains the desired number of sentences, store it
        if len(current_chunk) == chunk_size:
            chunks.append(" ".join(current_chunk).strip())
            current_chunk = []  # Start a new chunk

    # If any sentences are left, they form the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk).strip())

    return chunks

### Function to convert pdf to dataframe

In [22]:
def pdf_to_dataframe(pdf_path, chunk_size, max_pages=None):
    """
    Converts a PDF file into a pandas DataFrame, where each row contains chunks of text along with statistics for each page and chunk.

    Args:
        pdf_path (str): Path to the PDF file to be processed.
        chunk_size (int): Number of sentences per chunk in the output.
        max_pages (int, optional): Maximum number of pages to process. If None, all pages will be processed.

    Returns:
        pd.DataFrame: A DataFrame where each row corresponds to a chunk of text from the PDF, along with the following columns:
                      - 'page_number': Page number of the chunk.
                      - 'chunk': Text content of the chunk.
                      - 'chunk_char_count': Number of characters in the chunk.
                      - 'chunk_token_count': Number of tokens (words) in the chunk.
                      - 'page_word_count': Total number of words on the page.
                      - 'page_token_count': Total number of tokens (words) on the page.
    """
    # Open the PDF file
    doc = fitz.open(pdf_path)
    rows = []
    
    # Set the maximum number of pages to process
    num_pages = len(doc) if max_pages is None else min(max_pages, len(doc))

    # Loop through each page up to the max_pages limit
    for page_num in range(num_pages):
        page = doc.load_page(page_num)
        text = page.get_text("text")  # Extract the text from the page

        # Preprocess the text
        text = preprocess_text(text)

        # Create chunks of the text based on sentence count
        chunks = chunk_sentences(text, chunk_size)

        # Calculate page statistics
        page_word_count = len(text.split())
        page_token_count = len(nlp(text))

        # Record each chunk with its statistics
        for chunk in chunks:
            chunk_char_count = len(chunk)
            chunk_token_count = len(nlp(chunk))
            rows.append(
                {
                    "page_number": page_num + 1,
                    "chunk": chunk,
                    "chunk_char_count": chunk_char_count,
                    "chunk_token_count": chunk_token_count,
                    "page_word_count": page_word_count,
                    "page_token_count": page_token_count,
                }
            )

    # Convert to DataFrame
    df = pd.DataFrame(rows)

    return df

# Create the dataframe

In [24]:
input_file = r"Book-2Designing-data-intensive-applications.pdf"
df = pdf_to_dataframe(input_file, chunk_size=10, max_pages=50)

In [25]:
df

Unnamed: 0,page_number,chunk,chunk_char_count,chunk_token_count,page_word_count,page_token_count
0,1,"Martin Kleppmann Designing Data-Intensive Applications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS",120,18,14,18
1,3,"Martin Kleppmann Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing",200,28,24,28
2,4,"978-1-449-37332-0 LSI Designing Data-Intensive Applications by Martin Kleppmann Copyright 2017 Martin Kleppmann. All rights reserved. Printed in the United States of America. Published by OReilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://oreilly.com/safari). For more information, contact our corporate/insti tutional sales department: 800-998-9938 or corporate@oreilly.com. Editors: Ann Spencer and Marie Beaugureau Indexer: Ellen Troutman-Zaig Production Editor: Kristen Brown Interior Designer: David Futato Copyeditor: Rachel Head Cover Designer: Karen Montgomery Proofreader: Amanda Kersey Illustrator: Rebecca Demarest March 2017:",805,144,250,313
3,4,"First Edition Revision History for the First Edition 2017-03-01: First Release See http://oreilly.com/catalog/errata.csp?isbn=9781449373320 for release details. The OReilly logo is a registered trademark of OReilly Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of OReilly Media, Inc. While the publisher and the author have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the publisher and the author disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such licenses and/or rights.",1022,169,250,313
4,5,"Technology is a powerful force in our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for good: to make underrepresented peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good.",403,76,64,76
...,...,...,...,...,...,...
84,49,"CHAPTER 2 Data Models and Query Languages The limits of my language mean the limits of my world. Ludwig Wittgenstein, Tractatus Logico-Philosophicus (1922) Data models are perhaps the most important part of developing software, because they have such a profound effect: not only on how the software is written, but also on how we think about the problem that we are solving. Most applications are built by layering one data model on top of another. For each layer, the key question is: how is it represented in terms of the next-lower layer? For example: 1. As an application developer, you look at the real world (in which there are peo ple, organizations, goods, actions, money flows, sensors, etc.) and model it in terms of objects or data structures, and APIs that manipulate those data struc tures. Those structures are often specific to your application. 2. When you want to store those data structures, you express them in terms of a general-purpose data model, such as JSON or XML documents, tables in a rela tional database, or a graph model.",1051,218,295,358
85,49,"3. The engineers who built your database software decided on a way of representing that JSON/XML/relational/graph data in terms of bytes in memory, on disk, or on a network. The representation may allow the data to be queried, searched, manipulated, and processed in various ways. 4. On yet lower levels, hardware engineers have figured out how to represent bytes in terms of electrical currents, pulses of light, magnetic fields, and more. In a complex application there may be more intermediary levels, such as APIs built upon APIs, but the basic idea is still the same: each layer hides the complexity of the layers below it by providing a clean data model. These abstractions allow different 27",698,140,295,358
86,50,"groups of peoplefor example, the engineers at the database vendor and the applica tion developers using their databaseto work together effectively. There are many different kinds of data models, and every data model embodies assumptions about how it is going to be used. Some kinds of usage are easy and some are not supported; some operations are fast and some perform badly; some data transformations feel natural and some are awkward. It can take a lot of effort to master just one data model (think how many books there are on relational data modeling). Building software is hard enough, even when work ing with just one data model and without worrying about its inner workings. But since the data model has such a profound effect on what the software above it can and cant do, its important to choose one that is appropriate to the application. In this chapter we will look at a range of general-purpose data models for data stor age and querying (point 2 in the preceding list). In particular, we will compare the relational model, the document model, and a few graph-based data models. We will also look at various query languages and compare their use cases. In Chapter 3 we will discuss how storage engines work; that is, how these data models are actually implemented (point 3 in the list).",1300,257,467,537
87,50,"Relational Model Versus Document Model The best-known data model today is probably that of SQL, based on the relational model proposed by Edgar Codd in 1970 1: data is organized into relations (called tables in SQL), where each relation is an unordered collection of tuples (rows in SQL). The relational model was a theoretical proposal, and many people at the time doubted whether it could be implemented efficiently. However, by the mid-1980s, relational database management systems (RDBMSes) and SQL had become the tools of choice for most people who needed to store and query data with some kind of reg ular structure. The dominance of relational databases has lasted around 2530 years an eternity in computing history. The roots of relational databases lie in business data processing, which was performed on mainframe computers in the 1960s and 70s. The use cases appear mundane from todays perspective: typically transaction processing (entering sales or banking trans actions, airline reservations, stock-keeping in warehouses) and batch processing (cus tomer invoicing, payroll, reporting). Other databases at that time forced application developers to think a lot about the internal representation of the data in the database. The goal of the relational model was to hide that implementation detail behind a cleaner interface. Over the years, there have been many competing approaches to data storage and querying.",1424,256,467,537


In [27]:
df['chunk'][50]



### Function to get authentication information (endpoint+api key) from credentials.json

In [28]:
def get_authentication_client(credentials_file="credentials.json"):
    # Load credentials from the JSON file
    with open(credentials_file, "r") as file:
        credentials = json.load(file)

    # Access the endpoint and API key
    endpoint = credentials["endpoint"]
    api_key = credentials["api_key"]

    return endpoint, api_key

### Function to create client object

In [29]:
def get_document_analysis_client(endpoint, api_key):
    # Create and return the DocumentAnalysisClient object
    return TextAnalyticsClient(
        endpoint=endpoint, credential=AzureKeyCredential(api_key)
    )

# Test Key Phrase Extraction

In [30]:
# Get the endpoint and API key
endpoint, api_key = get_authentication_client()

# Get the client object using the endpoint and API key
text_analytics_client = get_document_analysis_client(endpoint, api_key)

# List of articles to analyze
articles = [
    """
    Washington, D.C. Autumn in DC is a uniquely beautiful season. The leaves fall from the trees
    in a city chock-full of forests, leaving yellow leaves on the ground and a clearer view of the
    blue sky above...
    """,
    """
    Redmond, WA. In the past few days, Microsoft has decided to further postpone the start date of
    its United States workers, due to the pandemic that rages with no end in sight...
    """,
    """
    Redmond, WA. Employees at Microsoft can be excited about the new coffee shop that will open on campus
    once workers no longer have to work remotely...
    """
]

# List to store articles that mention Microsoft
articles_that_mention_microsoft = []

# Extract key phrases from articles
result = text_analytics_client.extract_key_phrases(articles)
for idx, doc in enumerate(result):
    if not doc.is_error:
        # Print key phrases for each article
        print("Key phrases in article #{}: {}".format(
            idx + 1,
            ", ".join(doc.key_phrases)
        ))

        # Check if 'Microsoft' is mentioned in the key phrases
        if "Microsoft" in doc.key_phrases:
            articles_that_mention_microsoft.append(str(idx + 1))

# Print the list of articles that mention Microsoft
print(
    "The articles that mention Microsoft are articles number: {}. Those are the ones I'm interested in reading.".format(
        ", ".join(articles_that_mention_microsoft)
    )
)

Key phrases in article #1: D.C. Autumn, beautiful season, clearer view, blue sky, yellow leaves, Washington, DC, trees, city, forests, ground
Key phrases in article #2: United States workers, start date, Redmond, WA, past, days, Microsoft, pandemic, end, sight
Key phrases in article #3: new coffee shop, Redmond, WA, Employees, Microsoft, campus, workers
The articles that mention Microsoft are articles number: 2, 3. Those are the ones I'm interested in reading.


### Function to get key phrases for pandas .apply

In [129]:
def extract_key_phrases_from_text(text, client):
    """
    Extracts key phrases from a given text using the provided Azure Text Analytics client.
    
    Args:
        text (str): The input text from which key phrases will be extracted.
        client: The initialized Azure Text Analytics client object.
        
    Returns:
        list: A list of key phrases extracted from the text.
    """
    response = client.extract_key_phrases([text])
    if not response[0].is_error:
        return response[0].key_phrases
    return []

In [130]:
df['key_phrases'] = df['chunk'].apply(lambda x: extract_key_phrases_from_text(x, text_analytics_client))

In [131]:
df

Unnamed: 0,page_number,chunk,chunk_char_count,chunk_token_count,page_word_count,page_token_count,key_phrases
0,1,"Martin Kleppmann Designing Data-Intensive Applications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS",120,18,14,18,"[Martin Kleppmann, Data-Intensive Applications, BIG IDEAS, MAINTAINABLE SYSTEMS]"
1,3,"Martin Kleppmann Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing",200,28,24,28,"[Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing, Martin Kleppmann, Data-Intensive Applications, Big Ideas, Reliable, Scalable]"
2,4,"978-1-449-37332-0 LSI Designing Data-Intensive Applications by Martin Kleppmann Copyright 2017 Martin Kleppmann. All rights reserved. Printed in the United States of America. Published by OReilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://oreilly.com/safari). For more information, contact our corporate/insti tutional sales department: 800-998-9938 or corporate@oreilly.com. Editors: Ann Spencer and Marie Beaugureau Indexer: Ellen Troutman-Zaig Production Editor: Kristen Brown Interior Designer: David Futato Copyeditor: Rachel Head Cover Designer: Karen Montgomery Proofreader: Amanda Kersey Illustrator: Rebecca Demarest March 2017:",805,144,250,313,"[LSI Designing Data-Intensive Applications, Ellen Troutman-Zaig Production Editor, Kristen Brown Interior Designer, Rachel Head Cover Designer, corporate/insti tutional sales department, sales promotional use, 1005 Gravenstein Highway North, Marie Beaugureau Indexer, David Futato Copyeditor, Karen Montgomery Proofreader, Amanda Kersey Illustrator, Martin Kleppmann, United States, OReilly Media, OReilly books, educational, business, Online editions, most titles, Ann Spencer, Rebecca Demarest, rights, America, Inc., Sebastopol, safari, information, Editors]"
3,4,"First Edition Revision History for the First Edition 2017-03-01: First Release See http://oreilly.com/catalog/errata.csp?isbn=9781449373320 for release details. The OReilly logo is a registered trademark of OReilly Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of OReilly Media, Inc. While the publisher and the author have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the publisher and the author disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such licenses and/or rights.",1022,169,250,313,"[First Edition Revision History, related trade dress, good faith efforts, The OReilly logo, open source licenses, intellectual property rights, First Release, release details, registered trademark, OReilly Media, Data-Intensive Applications, cover image, code samples, other technology, catalog, errata, trademarks, publisher, author, information, instructions, work, responsibility, errors, omissions, limitation, damages, use, reliance, risk, others]"
4,5,"Technology is a powerful force in our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for good: to make underrepresented peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good.",403,76,64,76,"[unfair power structures, powerful force, human rights, vested interests, Technology, society, Data, software, communication, good, underrepresented, peoples, voices, opportunities, everyone, disasters, book]"
5,7,"Computing is pop culture. ... Pop culture holds a disdain for history. Pop culture is all about identity and feeling like youre participating. It has nothing to do with cooperation, the past or the futureits living in the present. I think the same is true of most people who write code for money. They have no idea where their culture came from. Alan Kay, in interview with Dr Dobbs Journal (2012)",397,82,71,82,"[Dr Dobbs Journal, most people, Alan Kay, pop culture, Computing, disdain, history, identity, cooperation, past, futureits, present, code, money, idea, interview]"
6,9,"Table of Contents Preface. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xiii Part I. Foundations of Data Systems 1. Reliable, Scalable, and Maintainable Applications. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 Thinking About Data Systems 4 Reliability 6 Hardware Faults 7 Software Errors 8 Human Errors 9 How Important Is Reliability? 10 Scalability 10 Describing Load 11 Describing Performance 13 Approaches for Coping with Load 17 Maintainability 18 Operability: Making Life Easy for Operations 19 Simplicity: Managing Complexity 20 Evolvability: Making Change Easy 21",682,179,248,270,"[Contents Preface, Part I., Data Systems, Maintainable Applications, Hardware Faults, 7 Software Errors, Human Errors, Managing Complexity, Table, xiii, Foundations, Reliable, Scalable, Reliability, 10 Scalability, Load, Performance, 13 Approaches, 17 Maintainability, Operability, Life, Operations, Simplicity, Evolvability, Change]"
7,9,Summary 22 2. Data Models and Query Languages. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 27 Relational Model Versus Document Model 28 The Birth of NoSQL 29 The Object-Relational Mismatch 29 Many-to-One and Many-to-Many Relationships 33 Are Document Databases Repeating History? 36 vii,326,91,248,270,"[27 Relational Model Versus, Document Model, Data Models, Query Languages, Object-Relational Mismatch, Many Relationships, Document Databases, Summary, Birth, NoSQL, One, History, 36 vii]"
8,10,Relational Versus Document Databases Today 38 Query Languages for Data 42 Declarative Queries on the Web 44 MapReduce Querying 46 Graph-Like Data Models 49 Property Graphs 50 The Cypher Query Language 52 Graph Queries in SQL 53 Triple-Stores and SPARQL 55 The Foundation: Datalog 60 Summary 63 3. Storage and Retrieval. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69 Data Structures That Power Your Database 70 Hash Indexes 72 SSTables and LSM-Trees 76 B-Trees 79 Comparing B-Trees and LSM-Trees 83 Other Indexing Structures 85 Transaction Processing or Analytics? 90 Data Warehousing 91 Stars and Snowflakes: Schemas for Analytics 93 Column-Oriented Storage 95 Column Compression 97 Sort Order in Column Storage 99 Writing to Column-Oriented Storage 101 Aggregation: Data Cubes and Materialized Views 101 Summary 103 4. Encoding and Evolution. .,908,200,277,308,"[Relational Versus Document Databases, The Cypher Query Language, Other Indexing Structures, 46 Graph-Like Data Models, Column-Oriented Storage 101 Aggregation, 38 Query Languages, The Foundation, 69 Data Structures, Declarative Queries, 44 MapReduce Querying, 49 Property Graphs, 52 Graph Queries, 70 Hash Indexes, Transaction Processing, 90 Data Warehousing, Column Compression, Sort Order, Column Storage, Data Cubes, Materialized Views, Web, SQL, Triple-Stores, SPARQL, Datalog, Summary, Retrieval, 72 SSTables, LSM-Trees, B-Trees, Analytics, 91 Stars, Snowflakes, Schemas, Encoding, Evolution]"
9,10,". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 111 Formats for Encoding Data 112 Language-Specific Formats 113 JSON, XML, and Binary Variants 114 Thrift and Protocol Buffers 117 Avro 122 The Merits of Schemas 127 Modes of Dataflow 128 Dataflow Through Databases 129 Dataflow Through Services: REST and RPC 131 Message-Passing Dataflow 136 Summary 139 viii | Table of Contents",426,108,277,308,"[Data 112 Language-Specific Formats, Binary Variants, Protocol Buffers, Message-Passing Dataflow, 111 Formats, 129 Dataflow, JSON, XML, Thrift, Avro, Merits, Schemas, 127 Modes, Databases, Services, REST, RPC, Summary, Table, Contents]"


# Test Entity Recognition

In [132]:
# Get the endpoint and API key
endpoint, api_key = get_authentication_client()

# Get the client object using the endpoint and API key
text_analytics_client = get_document_analysis_client(endpoint, api_key)

# List of reviews to process
reviews = [
    """I work for Foo Company, and we hired Contoso for our annual founding ceremony. The food
    was amazing and we all can't say enough good words about the quality and the level of service.""",
    """We at the Foo Company re-hired Contoso after all of our past successes with the company.
    Though the food was still great, I feel there has been a quality drop since their last time
    catering for us. Is anyone else running into the same problem?""",
    """Bar Company is over the moon about the service we received from Contoso, the best sliders ever!!!!"""
]

# Recognize named entities (such as organizations) in the reviews
result = text_analytics_client.recognize_entities(reviews)

# Filter out reviews with errors
result = [review for review in result if not review.is_error]

# Dictionary to map organizations to the reviews mentioning them
organization_to_reviews: typing.Dict[str, typing.List[str]] = {}

# Loop through recognized entities and map organizations to reviews
for idx, review in enumerate(result):
    for entity in review.entities:
        print(f"Entity '{entity.text}' has category '{entity.category}'")
        if entity.category == 'Organization':
            organization_to_reviews.setdefault(entity.text, [])
            organization_to_reviews[entity.text].append(reviews[idx])

# Print out the reviews grouped by organization
for organization, org_reviews in organization_to_reviews.items():
    print(
        "\n\nOrganization '{}' has left us the following review(s): {}".format(
            organization, "\n\n".join(org_reviews)
        )
    )

Entity 'Foo Company' has category 'Organization'
Entity 'Contoso' has category 'Person'
Entity 'annual' has category 'DateTime'
Entity 'founding ceremony' has category 'Event'
Entity 'food' has category 'Product'
Entity 'Foo Company' has category 'Organization'
Entity 'Contoso' has category 'Person'
Entity 'food' has category 'Product'
Entity 'Bar Company' has category 'Organization'
Entity 'service' has category 'Skill'
Entity 'Contoso' has category 'Person'
Entity 'sliders' has category 'Product'


Organization 'Foo Company' has left us the following review(s): I work for Foo Company, and we hired Contoso for our annual founding ceremony. The food
    was amazing and we all can't say enough good words about the quality and the level of service.

We at the Foo Company re-hired Contoso after all of our past successes with the company.
    Though the food was still great, I feel there has been a quality drop since their last time
    catering for us. Is anyone else running into the same

### Function to get named entities for pandas .apply

In [133]:
# Function to extract all detected entities from a single review
def extract_entities_from_text(text, client):
    """
    Extracts all detected entities from a given review text using the Azure Text Analytics client.

    Args:
        text (str): The input review text.
        client: The initialized Azure Text Analytics client.

    Returns:
        list: A list of tuples, where each tuple contains the entity text and its category.
    """
    response = client.recognize_entities([text])
    entities = []
    
    if not response[0].is_error:
        for entity in response[0].entities:
            entities.append((entity.text, entity.category))
    
    return entities

In [134]:
# Apply the function to each review in the DataFrame
df['entities'] = df['chunk'].apply(lambda x: extract_entities_from_text(x, text_analytics_client))

In [135]:
df

Unnamed: 0,page_number,chunk,chunk_char_count,chunk_token_count,page_word_count,page_token_count,key_phrases,entities
0,1,"Martin Kleppmann Designing Data-Intensive Applications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS",120,18,14,18,"[Martin Kleppmann, Data-Intensive Applications, BIG IDEAS, MAINTAINABLE SYSTEMS]","[(Martin Kleppmann, Person), (-Intensive, Skill), (Applications, Skill), (RELIABLE, Skill)]"
1,3,"Martin Kleppmann Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing",200,28,24,28,"[Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing, Martin Kleppmann, Data-Intensive Applications, Big Ideas, Reliable, Scalable]","[(Martin Kleppmann, Person), (Scalable, Skill), (Boston Farnham Sebastopol, Location), (Tokyo, Location), (Beijing, Location), (Boston Farnham, Location), (Farnham Sebastopol, Location), (Tokyo, Location), (Beijing, Location)]"
2,4,"978-1-449-37332-0 LSI Designing Data-Intensive Applications by Martin Kleppmann Copyright 2017 Martin Kleppmann. All rights reserved. Printed in the United States of America. Published by OReilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://oreilly.com/safari). For more information, contact our corporate/insti tutional sales department: 800-998-9938 or corporate@oreilly.com. Editors: Ann Spencer and Marie Beaugureau Indexer: Ellen Troutman-Zaig Production Editor: Kristen Brown Interior Designer: David Futato Copyeditor: Rachel Head Cover Designer: Karen Montgomery Proofreader: Amanda Kersey Illustrator: Rebecca Demarest March 2017:",805,144,250,313,"[LSI Designing Data-Intensive Applications, Ellen Troutman-Zaig Production Editor, Kristen Brown Interior Designer, Rachel Head Cover Designer, corporate/insti tutional sales department, sales promotional use, 1005 Gravenstein Highway North, Marie Beaugureau Indexer, David Futato Copyeditor, Karen Montgomery Proofreader, Amanda Kersey Illustrator, Martin Kleppmann, United States, OReilly Media, OReilly books, educational, business, Online editions, most titles, Ann Spencer, Rebecca Demarest, rights, America, Inc., Sebastopol, safari, information, Editors]","[(978-1-449, PhoneNumber), (37332, Quantity), (0, Quantity), (Designing Data-Intensive Applications, Product), (Martin Kleppmann, Person), (2017, Quantity), (Martin Kleppmann, Person), (United States of America, Location), (OReilly Media, Inc., Organization), (1005 Gravenstein Highway North, Sebastopol, CA 95472, Address), (OReilly books, Product), (educational, Skill), (business, Skill), (sales, Skill), (Online, Skill), (http://oreilly.com/safari)., URL), (sales, Skill), (800-998-9938, PhoneNumber), (corporate@oreilly.com, Email), (Editors, PersonType), (Ann Spencer, Person), (Marie Beaugureau, Person), (Ellen, Person), (-Zaig, Organization), (Kristen Brown, Person), (Interior Designer, PersonType), (David Futato, Person), (Copyeditor, PersonType), (Rachel Head, Person), (Cover Designer, PersonType), (Karen Montgomery, Person), (Proofreader, PersonType), (Amanda Kersey, Person), (Illustrator, PersonType), (Rebecca Demarest, Person), (March 2017:, DateTime)]"
3,4,"First Edition Revision History for the First Edition 2017-03-01: First Release See http://oreilly.com/catalog/errata.csp?isbn=9781449373320 for release details. The OReilly logo is a registered trademark of OReilly Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of OReilly Media, Inc. While the publisher and the author have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the publisher and the author disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such licenses and/or rights.",1022,169,250,313,"[First Edition Revision History, related trade dress, good faith efforts, The OReilly logo, open source licenses, intellectual property rights, First Release, release details, registered trademark, OReilly Media, Data-Intensive Applications, cover image, code samples, other technology, catalog, errata, trademarks, publisher, author, information, instructions, work, responsibility, errors, omissions, limitation, damages, use, reliance, risk, others]","[(First, Quantity), (First, Quantity), (2017-03-01, DateTime), (First, Quantity), (http://oreilly.com/catalog/errata.csp?isbn=9781449373320, URL), (OReilly, Organization), (OReilly Media, Inc., Organization), (Designing, Skill), (OReilly Media, Inc., Organization), (publisher, PersonType), (author, PersonType), (publisher, PersonType), (author, PersonType), (code samples, Skill), (technology, Skill), (open source, Skill)]"
4,5,"Technology is a powerful force in our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for good: to make underrepresented peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good.",403,76,64,76,"[unfair power structures, powerful force, human rights, vested interests, Technology, society, Data, software, communication, good, underrepresented, peoples, voices, opportunities, everyone, disasters, book]","[(Technology, Skill), (Data, Skill), (software, Skill), (communication, Skill), (underrepresented peoples, PersonType), (disasters, Event), (book, Product)]"
5,7,"Computing is pop culture. ... Pop culture holds a disdain for history. Pop culture is all about identity and feeling like youre participating. It has nothing to do with cooperation, the past or the futureits living in the present. I think the same is true of most people who write code for money. They have no idea where their culture came from. Alan Kay, in interview with Dr Dobbs Journal (2012)",397,82,71,82,"[Dr Dobbs Journal, most people, Alan Kay, pop culture, Computing, disdain, history, identity, cooperation, past, futureits, present, code, money, idea, interview]","[(Computing, Skill), (Pop, Skill), (cooperation, Skill), (people, PersonType), (Alan Kay, Person), (interview, Event), (Dr Dobbs Journal, Organization), ((2012), DateTime)]"
6,9,"Table of Contents Preface. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xiii Part I. Foundations of Data Systems 1. Reliable, Scalable, and Maintainable Applications. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 Thinking About Data Systems 4 Reliability 6 Hardware Faults 7 Software Errors 8 Human Errors 9 How Important Is Reliability? 10 Scalability 10 Describing Load 11 Describing Performance 13 Approaches for Coping with Load 17 Maintainability 18 Operability: Making Life Easy for Operations 19 Simplicity: Managing Complexity 20 Evolvability: Making Change Easy 21",682,179,248,270,"[Contents Preface, Part I., Data Systems, Maintainable Applications, Hardware Faults, 7 Software Errors, Human Errors, Managing Complexity, Table, xiii, Foundations, Reliable, Scalable, Reliability, 10 Scalability, Load, Performance, 13 Approaches, 17 Maintainability, Operability, Life, Operations, Simplicity, Evolvability, Change]","[(1, Quantity), (Reliable, Skill), (Scalable, Skill), (Maintainable Applications., Skill), (3, Quantity), (Data Systems, Skill), (4, Quantity), (Reliability, Skill), (6, Quantity), (Hardware, Skill), (7, Quantity), (Software, Skill), (8, Quantity), (9, Quantity), (Reliability, Skill), (10, Quantity), (10, Quantity), (11, Quantity), (13, Quantity), (17, Quantity), (Maintainability, Skill), (18, Quantity), (19, Quantity), (20, Quantity), (21, Quantity)]"
7,9,Summary 22 2. Data Models and Query Languages. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 27 Relational Model Versus Document Model 28 The Birth of NoSQL 29 The Object-Relational Mismatch 29 Many-to-One and Many-to-Many Relationships 33 Are Document Databases Repeating History? 36 vii,326,91,248,270,"[27 Relational Model Versus, Document Model, Data Models, Query Languages, Object-Relational Mismatch, Many Relationships, Document Databases, Summary, Birth, NoSQL, One, History, 36 vii]","[(22, Quantity), (2, Quantity), (27, Quantity), (28, Quantity), (NoSQL, Skill), (29, Quantity), (29, Quantity), (One, Quantity), (33, Quantity), (36, Quantity)]"
8,10,Relational Versus Document Databases Today 38 Query Languages for Data 42 Declarative Queries on the Web 44 MapReduce Querying 46 Graph-Like Data Models 49 Property Graphs 50 The Cypher Query Language 52 Graph Queries in SQL 53 Triple-Stores and SPARQL 55 The Foundation: Datalog 60 Summary 63 3. Storage and Retrieval. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69 Data Structures That Power Your Database 70 Hash Indexes 72 SSTables and LSM-Trees 76 B-Trees 79 Comparing B-Trees and LSM-Trees 83 Other Indexing Structures 85 Transaction Processing or Analytics? 90 Data Warehousing 91 Stars and Snowflakes: Schemas for Analytics 93 Column-Oriented Storage 95 Column Compression 97 Sort Order in Column Storage 99 Writing to Column-Oriented Storage 101 Aggregation: Data Cubes and Materialized Views 101 Summary 103 4. Encoding and Evolution. .,908,200,277,308,"[Relational Versus Document Databases, The Cypher Query Language, Other Indexing Structures, 46 Graph-Like Data Models, Column-Oriented Storage 101 Aggregation, 38 Query Languages, The Foundation, 69 Data Structures, Declarative Queries, 44 MapReduce Querying, 49 Property Graphs, 52 Graph Queries, 70 Hash Indexes, Transaction Processing, 90 Data Warehousing, Column Compression, Sort Order, Column Storage, Data Cubes, Materialized Views, Web, SQL, Triple-Stores, SPARQL, Datalog, Summary, Retrieval, 72 SSTables, LSM-Trees, B-Trees, Analytics, 91 Stars, Snowflakes, Schemas, Encoding, Evolution]","[(Databases, Skill), (Today, DateTime), (38, Quantity), (42, Quantity), (44, Quantity), (46, Quantity), (49, Quantity), (50, Quantity), (52, Quantity), (SQL, Skill), (53, Quantity), (55, Quantity), (60, Quantity), (63, Quantity), (3, Quantity), (Retrieval, Skill), (69, Quantity), (Database, Skill), (70, Quantity), (Hash Indexes, Skill), (72, Quantity), (76 B, Quantity), (79, Quantity), (83, Quantity), (85, Quantity), (Transaction Processing, Skill), (Analytics, Skill), (90, Quantity), (91, Quantity), (93, Quantity), (95, Quantity), (97, Quantity), (99, Quantity), (101, Quantity), (101, Quantity), (103, Quantity), (4, Quantity), (Encoding, Skill), (Evolution, Skill)]"
9,10,". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 111 Formats for Encoding Data 112 Language-Specific Formats 113 JSON, XML, and Binary Variants 114 Thrift and Protocol Buffers 117 Avro 122 The Merits of Schemas 127 Modes of Dataflow 128 Dataflow Through Databases 129 Dataflow Through Services: REST and RPC 131 Message-Passing Dataflow 136 Summary 139 viii | Table of Contents",426,108,277,308,"[Data 112 Language-Specific Formats, Binary Variants, Protocol Buffers, Message-Passing Dataflow, 111 Formats, 129 Dataflow, JSON, XML, Thrift, Avro, Merits, Schemas, 127 Modes, Databases, Services, REST, RPC, Summary, Table, Contents]","[(111, Quantity), (112, Quantity), (113, Quantity), (JSON, Skill), (XML, Skill), (114, Quantity), (117, Quantity), (122, Quantity), (Merits, Skill), (Schemas, Skill), (127, Quantity), (Dataflow, Skill), (128, Quantity), (Dataflow, Skill), (129, Quantity), (Dataflow, Skill), (Services, Skill), (RPC, Skill), (131, Quantity), (Message-Passing Dataflow, Skill), (136, Quantity), (139, Quantity)]"


# Test PII Recognition

In [136]:
# Function to get the endpoint and API key for authentication
endpoint, api_key = get_authentication_client()

# Get the TextAnalyticsClient object using the endpoint and API key
text_analytics_client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(api_key))

# Document(s) to analyze for Personally Identifiable Information (PII)
documents = [
    """Parker Doe has repaid all of their loans as of 2020-04-25.
    Their SSN is 859-98-0987. To contact them, use their phone number
    555-555-5555. They are originally from Brazil and have Brazilian CPF number 998.214.865-68"""
]

# Analyze the document(s) to recognize PII entities
result = text_analytics_client.recognize_pii_entities(documents)

# Filter out any documents that encountered an error during analysis
docs = [doc for doc in result if not doc.is_error]

# Compare the original document with the redacted version and display all redacted entities
print("Let's compare the original document with the redacted document, and list all redacted entities.")
for idx, doc in enumerate(docs):
    print(f"Original Document: {documents[idx]}")
    print(f"Redacted Document: {doc.redacted_text}")
    for entity in doc.entities:
        print(f"...Entity '{entity.text}' with category '{entity.category}' got redacted.")

# Confirmation message for expected redactions
print("All of the expected information has been redacted!")

# Now extracting SSN information with a confidence score threshold
print("Extracting SSN information with a confidence score of 60% or higher.")
social_security_numbers = []

for doc in docs:
    for entity in doc.entities:
        # Extract SSNs if the entity is a US Social Security Number and confidence score is above 60%
        if entity.category == 'USSocialSecurityNumber' and entity.confidence_score >= 0.6:
            social_security_numbers.append(entity.text)

# Output the extracted SSNs
if social_security_numbers:
    print(f"Extracted SSNs: {', '.join(social_security_numbers)}")
else:
    print("No SSNs were extracted with sufficient confidence.")

Let's compare the original document with the redacted document, and list all redacted entities.
Original Document: Parker Doe has repaid all of their loans as of 2020-04-25.
    Their SSN is 859-98-0987. To contact them, use their phone number
    555-555-5555. They are originally from Brazil and have Brazilian CPF number 998.214.865-68
Redacted Document: ********** has repaid all of their loans as of **********.
    Their SSN is ***********. To contact them, use their phone number
    ************. They are originally from Brazil and have Brazilian CPF number 998.214.865-68
...Entity 'Parker Doe' with category 'Organization' got redacted.
...Entity '2020-04-25' with category 'DateTime' got redacted.
...Entity '859-98-0987' with category 'USSocialSecurityNumber' got redacted.
...Entity '555-555-5555' with category 'PhoneNumber' got redacted.
All of the expected information has been redacted!
Extracting SSN information with a confidence score of 60% or higher.
Extracted SSNs: 859-98-098

### Functions to Inject SSN and Phone Numbers into chunks

In [137]:
# Function to generate a valid SSN in the format XXX-XX-XXXX
def generate_ssn():
    part1 = random.randint(100, 999)
    part2 = random.randint(10, 99)
    part3 = random.randint(1000, 9999)
    return f"SSN: {part1}-{part2}-{part3}"

# Function to generate a valid phone number in the format XXX-XXX-XXXX
def generate_phone_number():
    part1 = random.randint(100, 999)
    part2 = random.randint(100, 999)
    part3 = random.randint(1000, 9999)
    return f"Phone: {part1}-{part2}-{part3}"

# Function to inject SSN and phone number at random positions in the text
def inject_ssn_and_phone(text):
    ssn = generate_ssn()  # Generate a random SSN
    phone_number = generate_phone_number()  # Generate a random phone number
    
    # Find all the spaces in the text
    space_indices = [i for i, char in enumerate(text) if char == ' ']
    
    # If there are fewer than 2 spaces, inject at the end
    if len(space_indices) < 2:
        return text + f" {ssn} {phone_number}"
    
    # Select two different random spaces for injecting SSN and phone number
    random_spaces = random.sample(space_indices, 2)
    
    # Sort the selected space indices to avoid overlap or out-of-order injection
    random_spaces.sort()
    
    # Inject SSN after the first random space
    first_insertion = text[:random_spaces[0] + 1] + ssn + " " + text[random_spaces[0] + 1:]
    
    # Adjust the second insertion index due to the added SSN length
    adjusted_index = random_spaces[1] + len(ssn) + 2  # Adding 2 for the space after SSN
    
    # Inject phone number after the second random space
    final_text = first_insertion[:adjusted_index + 1] + phone_number + " " + first_insertion[adjusted_index + 1:]
    
    return final_text
# Apply the function to inject SSN and phone number into the "chunk" column
df['chunk_with_pii'] = df['chunk'].apply(inject_ssn_and_phone)

In [138]:
df

Unnamed: 0,page_number,chunk,chunk_char_count,chunk_token_count,page_word_count,page_token_count,key_phrases,entities,chunk_with_pii
0,1,"Martin Kleppmann Designing Data-Intensive Applications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS",120,18,14,18,"[Martin Kleppmann, Data-Intensive Applications, BIG IDEAS, MAINTAINABLE SYSTEMS]","[(Martin Kleppmann, Person), (-Intensive, Skill), (Applications, Skill), (RELIABLE, Skill)]","Martin Kleppmann Designing SSN: 566-46-5206 Data-Intensive APhone: 874-448-8234 pplications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS"
1,3,"Martin Kleppmann Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing",200,28,24,28,"[Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing, Martin Kleppmann, Data-Intensive Applications, Big Ideas, Reliable, Scalable]","[(Martin Kleppmann, Person), (Scalable, Skill), (Boston Farnham Sebastopol, Location), (Tokyo, Location), (Beijing, Location), (Boston Farnham, Location), (Farnham Sebastopol, Location), (Tokyo, Location), (Beijing, Location)]","Martin Kleppmann Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable SSN: 100-64-2089 Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol TPhone: 546-168-6637 okyo Beijing"
2,4,"978-1-449-37332-0 LSI Designing Data-Intensive Applications by Martin Kleppmann Copyright 2017 Martin Kleppmann. All rights reserved. Printed in the United States of America. Published by OReilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://oreilly.com/safari). For more information, contact our corporate/insti tutional sales department: 800-998-9938 or corporate@oreilly.com. Editors: Ann Spencer and Marie Beaugureau Indexer: Ellen Troutman-Zaig Production Editor: Kristen Brown Interior Designer: David Futato Copyeditor: Rachel Head Cover Designer: Karen Montgomery Proofreader: Amanda Kersey Illustrator: Rebecca Demarest March 2017:",805,144,250,313,"[LSI Designing Data-Intensive Applications, Ellen Troutman-Zaig Production Editor, Kristen Brown Interior Designer, Rachel Head Cover Designer, corporate/insti tutional sales department, sales promotional use, 1005 Gravenstein Highway North, Marie Beaugureau Indexer, David Futato Copyeditor, Karen Montgomery Proofreader, Amanda Kersey Illustrator, Martin Kleppmann, United States, OReilly Media, OReilly books, educational, business, Online editions, most titles, Ann Spencer, Rebecca Demarest, rights, America, Inc., Sebastopol, safari, information, Editors]","[(978-1-449, PhoneNumber), (37332, Quantity), (0, Quantity), (Designing Data-Intensive Applications, Product), (Martin Kleppmann, Person), (2017, Quantity), (Martin Kleppmann, Person), (United States of America, Location), (OReilly Media, Inc., Organization), (1005 Gravenstein Highway North, Sebastopol, CA 95472, Address), (OReilly books, Product), (educational, Skill), (business, Skill), (sales, Skill), (Online, Skill), (http://oreilly.com/safari)., URL), (sales, Skill), (800-998-9938, PhoneNumber), (corporate@oreilly.com, Email), (Editors, PersonType), (Ann Spencer, Person), (Marie Beaugureau, Person), (Ellen, Person), (-Zaig, Organization), (Kristen Brown, Person), (Interior Designer, PersonType), (David Futato, Person), (Copyeditor, PersonType), (Rachel Head, Person), (Cover Designer, PersonType), (Karen Montgomery, Person), (Proofreader, PersonType), (Amanda Kersey, Person), (Illustrator, PersonType), (Rebecca Demarest, Person), (March 2017:, DateTime)]","978-1-449-37332-0 LSI Designing Data-Intensive Applications by Martin Kleppmann Copyright 2017 Martin Kleppmann. All rights reserved. Printed in the United States of America. Published by OReilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://oreilly.com/safari). For more information, contact our corporate/insti tutional sales department: 800-998-9938 or corporate@oreilly.com. Editors: Ann Spencer and Marie Beaugureau Indexer: Ellen Troutman-Zaig Production Editor: Kristen Brown Interior Designer: David Futato Copyeditor: SSN: 808-97-9604 Rachel Head CPhone: 922-442-5848 over Designer: Karen Montgomery Proofreader: Amanda Kersey Illustrator: Rebecca Demarest March 2017:"
3,4,"First Edition Revision History for the First Edition 2017-03-01: First Release See http://oreilly.com/catalog/errata.csp?isbn=9781449373320 for release details. The OReilly logo is a registered trademark of OReilly Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of OReilly Media, Inc. While the publisher and the author have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the publisher and the author disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such licenses and/or rights.",1022,169,250,313,"[First Edition Revision History, related trade dress, good faith efforts, The OReilly logo, open source licenses, intellectual property rights, First Release, release details, registered trademark, OReilly Media, Data-Intensive Applications, cover image, code samples, other technology, catalog, errata, trademarks, publisher, author, information, instructions, work, responsibility, errors, omissions, limitation, damages, use, reliance, risk, others]","[(First, Quantity), (First, Quantity), (2017-03-01, DateTime), (First, Quantity), (http://oreilly.com/catalog/errata.csp?isbn=9781449373320, URL), (OReilly, Organization), (OReilly Media, Inc., Organization), (Designing, Skill), (OReilly Media, Inc., Organization), (publisher, PersonType), (author, PersonType), (publisher, PersonType), (author, PersonType), (code samples, Skill), (technology, Skill), (open source, Skill)]","First Edition Revision History for the First Edition 2017-03-01: First Release See http://oreilly.com/catalog/errata.csp?isbn=9781449373320 for release details. The OReilly logo is a SSN: 385-53-9999 registered trademark of OReilly Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of OReilly Media, Inc. While the publisher and the author have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the publisher and the author disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such lPhone: 999-777-5531 icenses and/or rights."
4,5,"Technology is a powerful force in our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for good: to make underrepresented peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good.",403,76,64,76,"[unfair power structures, powerful force, human rights, vested interests, Technology, society, Data, software, communication, good, underrepresented, peoples, voices, opportunities, everyone, disasters, book]","[(Technology, Skill), (Data, Skill), (software, Skill), (communication, Skill), (underrepresented peoples, PersonType), (disasters, Event), (book, Product)]","Technology is a powerful force in SSN: 383-61-1112 our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for gPhone: 432-160-6668 ood: to make underrepresented peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good."
5,7,"Computing is pop culture. ... Pop culture holds a disdain for history. Pop culture is all about identity and feeling like youre participating. It has nothing to do with cooperation, the past or the futureits living in the present. I think the same is true of most people who write code for money. They have no idea where their culture came from. Alan Kay, in interview with Dr Dobbs Journal (2012)",397,82,71,82,"[Dr Dobbs Journal, most people, Alan Kay, pop culture, Computing, disdain, history, identity, cooperation, past, futureits, present, code, money, idea, interview]","[(Computing, Skill), (Pop, Skill), (cooperation, Skill), (people, PersonType), (Alan Kay, Person), (interview, Event), (Dr Dobbs Journal, Organization), ((2012), DateTime)]","Computing is pop culture. ... Pop culture holds a disdain for history. SSN: 116-19-8428 Pop culture is all about identity and feeling like youre participating. It has nothing to do with cPhone: 774-878-6012 ooperation, the past or the futureits living in the present. I think the same is true of most people who write code for money. They have no idea where their culture came from. Alan Kay, in interview with Dr Dobbs Journal (2012)"
6,9,"Table of Contents Preface. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xiii Part I. Foundations of Data Systems 1. Reliable, Scalable, and Maintainable Applications. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 Thinking About Data Systems 4 Reliability 6 Hardware Faults 7 Software Errors 8 Human Errors 9 How Important Is Reliability? 10 Scalability 10 Describing Load 11 Describing Performance 13 Approaches for Coping with Load 17 Maintainability 18 Operability: Making Life Easy for Operations 19 Simplicity: Managing Complexity 20 Evolvability: Making Change Easy 21",682,179,248,270,"[Contents Preface, Part I., Data Systems, Maintainable Applications, Hardware Faults, 7 Software Errors, Human Errors, Managing Complexity, Table, xiii, Foundations, Reliable, Scalable, Reliability, 10 Scalability, Load, Performance, 13 Approaches, 17 Maintainability, Operability, Life, Operations, Simplicity, Evolvability, Change]","[(1, Quantity), (Reliable, Skill), (Scalable, Skill), (Maintainable Applications., Skill), (3, Quantity), (Data Systems, Skill), (4, Quantity), (Reliability, Skill), (6, Quantity), (Hardware, Skill), (7, Quantity), (Software, Skill), (8, Quantity), (9, Quantity), (Reliability, Skill), (10, Quantity), (10, Quantity), (11, Quantity), (13, Quantity), (17, Quantity), (Maintainability, Skill), (18, Quantity), (19, Quantity), (20, Quantity), (21, Quantity)]","Table of Contents Preface. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . SSN: 417-80-2948 . . . . . . . . . . . . . . . . . . .Phone: 352-877-1727 . . . . . . . . . . . . . . . . . . xiii Part I. Foundations of Data Systems 1. Reliable, Scalable, and Maintainable Applications. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 Thinking About Data Systems 4 Reliability 6 Hardware Faults 7 Software Errors 8 Human Errors 9 How Important Is Reliability? 10 Scalability 10 Describing Load 11 Describing Performance 13 Approaches for Coping with Load 17 Maintainability 18 Operability: Making Life Easy for Operations 19 Simplicity: Managing Complexity 20 Evolvability: Making Change Easy 21"
7,9,Summary 22 2. Data Models and Query Languages. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 27 Relational Model Versus Document Model 28 The Birth of NoSQL 29 The Object-Relational Mismatch 29 Many-to-One and Many-to-Many Relationships 33 Are Document Databases Repeating History? 36 vii,326,91,248,270,"[27 Relational Model Versus, Document Model, Data Models, Query Languages, Object-Relational Mismatch, Many Relationships, Document Databases, Summary, Birth, NoSQL, One, History, 36 vii]","[(22, Quantity), (2, Quantity), (27, Quantity), (28, Quantity), (NoSQL, Skill), (29, Quantity), (29, Quantity), (One, Quantity), (33, Quantity), (36, Quantity)]",Summary 22 2. Data Models and Query Languages. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 27 Relational Model Versus Document Model 28 The Birth of NoSQL 29 SSN: 667-90-6165 The Object-Relational Mismatch 29 Many-to-One and Many-to-Many Relationships 33 Are Document Databases RPhone: 771-106-1949 epeating History? 36 vii
8,10,Relational Versus Document Databases Today 38 Query Languages for Data 42 Declarative Queries on the Web 44 MapReduce Querying 46 Graph-Like Data Models 49 Property Graphs 50 The Cypher Query Language 52 Graph Queries in SQL 53 Triple-Stores and SPARQL 55 The Foundation: Datalog 60 Summary 63 3. Storage and Retrieval. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69 Data Structures That Power Your Database 70 Hash Indexes 72 SSTables and LSM-Trees 76 B-Trees 79 Comparing B-Trees and LSM-Trees 83 Other Indexing Structures 85 Transaction Processing or Analytics? 90 Data Warehousing 91 Stars and Snowflakes: Schemas for Analytics 93 Column-Oriented Storage 95 Column Compression 97 Sort Order in Column Storage 99 Writing to Column-Oriented Storage 101 Aggregation: Data Cubes and Materialized Views 101 Summary 103 4. Encoding and Evolution. .,908,200,277,308,"[Relational Versus Document Databases, The Cypher Query Language, Other Indexing Structures, 46 Graph-Like Data Models, Column-Oriented Storage 101 Aggregation, 38 Query Languages, The Foundation, 69 Data Structures, Declarative Queries, 44 MapReduce Querying, 49 Property Graphs, 52 Graph Queries, 70 Hash Indexes, Transaction Processing, 90 Data Warehousing, Column Compression, Sort Order, Column Storage, Data Cubes, Materialized Views, Web, SQL, Triple-Stores, SPARQL, Datalog, Summary, Retrieval, 72 SSTables, LSM-Trees, B-Trees, Analytics, 91 Stars, Snowflakes, Schemas, Encoding, Evolution]","[(Databases, Skill), (Today, DateTime), (38, Quantity), (42, Quantity), (44, Quantity), (46, Quantity), (49, Quantity), (50, Quantity), (52, Quantity), (SQL, Skill), (53, Quantity), (55, Quantity), (60, Quantity), (63, Quantity), (3, Quantity), (Retrieval, Skill), (69, Quantity), (Database, Skill), (70, Quantity), (Hash Indexes, Skill), (72, Quantity), (76 B, Quantity), (79, Quantity), (83, Quantity), (85, Quantity), (Transaction Processing, Skill), (Analytics, Skill), (90, Quantity), (91, Quantity), (93, Quantity), (95, Quantity), (97, Quantity), (99, Quantity), (101, Quantity), (101, Quantity), (103, Quantity), (4, Quantity), (Encoding, Skill), (Evolution, Skill)]",Relational Versus Document Databases Today 38 Query Languages for Data 42 Declarative Queries on the Web 44 MapReduce Querying 46 Graph-Like Data Models 49 Property Graphs 50 The Cypher Query Language 52 Graph Queries in SQL 53 Triple-Stores and SPARQL 55 The Foundation: Datalog 60 Summary 63 3. Storage and Retrieval. . SSN: 420-21-7683 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69 Data Structures That Power Your Database 70 Hash Indexes 72 SSTables and LSM-Trees 76 B-Trees 79 Comparing B-Trees and LSM-Trees 83 Other Indexing Structures 85 Transaction Processing or Analytics? 90 Data Warehousing 91 Stars and Snowflakes: Schemas fPhone: 842-504-7544 or Analytics 93 Column-Oriented Storage 95 Column Compression 97 Sort Order in Column Storage 99 Writing to Column-Oriented Storage 101 Aggregation: Data Cubes and Materialized Views 101 Summary 103 4. Encoding and Evolution. .
9,10,". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 111 Formats for Encoding Data 112 Language-Specific Formats 113 JSON, XML, and Binary Variants 114 Thrift and Protocol Buffers 117 Avro 122 The Merits of Schemas 127 Modes of Dataflow 128 Dataflow Through Databases 129 Dataflow Through Services: REST and RPC 131 Message-Passing Dataflow 136 Summary 139 viii | Table of Contents",426,108,277,308,"[Data 112 Language-Specific Formats, Binary Variants, Protocol Buffers, Message-Passing Dataflow, 111 Formats, 129 Dataflow, JSON, XML, Thrift, Avro, Merits, Schemas, 127 Modes, Databases, Services, REST, RPC, Summary, Table, Contents]","[(111, Quantity), (112, Quantity), (113, Quantity), (JSON, Skill), (XML, Skill), (114, Quantity), (117, Quantity), (122, Quantity), (Merits, Skill), (Schemas, Skill), (127, Quantity), (Dataflow, Skill), (128, Quantity), (Dataflow, Skill), (129, Quantity), (Dataflow, Skill), (Services, Skill), (RPC, Skill), (131, Quantity), (Message-Passing Dataflow, Skill), (136, Quantity), (139, Quantity)]",". . . . . . . . . . . . . . . . . . . . . SSN: 344-49-7440 . . . . . . . . . . . . . . . . . . . . . . . . . . . . 111 Formats for Encoding Data 112 Language-Specific FPhone: 946-540-5872 ormats 113 JSON, XML, and Binary Variants 114 Thrift and Protocol Buffers 117 Avro 122 The Merits of Schemas 127 Modes of Dataflow 128 Dataflow Through Databases 129 Dataflow Through Services: REST and RPC 131 Message-Passing Dataflow 136 Summary 139 viii | Table of Contents"


### Functions to detect PII and redact the PII 

In [71]:
# Function to show all detected PII
def detect_pii(text, client):
    try:
        # Analyze the document(s) to recognize PII entities
        result = client.recognize_pii_entities([text])
        
        # Filter out any documents that encountered an error during analysis
        docs = [doc for doc in result if not doc.is_error]
        
        # List to store all detected PII entities
        detected_pii = []
        
        # Iterate through each detected PII entity in the document
        if docs:
            for entity in docs[0].entities:
                detected_pii.append({
                    'Entity': entity.text,
                    'Category': entity.category,
                    'Confidence': entity.confidence_score
                })
        return detected_pii
    
    except Exception as e:
        print(f"Error detecting PII: {e}")
        return []

# Function to redact PII based on a confidence threshold
def redact_pii_with_threshold(text, client, threshold=0.6):
    try:
        # Analyze the document(s) to recognize PII entities
        result = client.recognize_pii_entities([text])
        
        # Filter out any documents that encountered an error during analysis
        docs = [doc for doc in result if not doc.is_error]
        
        # If there is a valid document, apply redaction based on confidence threshold
        if docs:
            redacted_text = text  # Default to original text
            for entity in docs[0].entities:
                if entity.confidence_score >= threshold:
                    # Replace PII in the text with asterisks
                    redacted_text = redacted_text.replace(entity.text, '*' * len(entity.text))
            return redacted_text
        else:
            return text  # If there was an error, return the original text
    
    except Exception as e:
        print(f"Error redacting PII: {e}")
        return text  # Return original text in case of any failure

In [139]:
# Detect PII and store the results
df['detected_pii'] = df['chunk_with_pii'].apply(lambda x: detect_pii(x, text_analytics_client))

# Redact PII with a confidence threshold of 0.6
df['redacted_chunk'] = df['chunk_with_pii'].apply(lambda x: redact_pii_with_threshold(x, text_analytics_client, threshold=0.6))

In [140]:
df

Unnamed: 0,page_number,chunk,chunk_char_count,chunk_token_count,page_word_count,page_token_count,key_phrases,entities,chunk_with_pii,detected_pii,redacted_chunk
0,1,"Martin Kleppmann Designing Data-Intensive Applications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS",120,18,14,18,"[Martin Kleppmann, Data-Intensive Applications, BIG IDEAS, MAINTAINABLE SYSTEMS]","[(Martin Kleppmann, Person), (-Intensive, Skill), (Applications, Skill), (RELIABLE, Skill)]","Martin Kleppmann Designing SSN: 566-46-5206 Data-Intensive APhone: 874-448-8234 pplications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS","[{'Entity': 'Martin Kleppmann', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': '566-46-5206', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': '874-448-8234', 'Category': 'PhoneNumber', 'Confidence': 0.8}]","**************** Designing SSN: *********** Data-Intensive APhone: ************ pplications THE BIG IDEAS BEHIND RELIABLE, SCALABLE, AND MAINTAINABLE SYSTEMS"
1,3,"Martin Kleppmann Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing",200,28,24,28,"[Maintainable Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing, Martin Kleppmann, Data-Intensive Applications, Big Ideas, Reliable, Scalable]","[(Martin Kleppmann, Person), (Scalable, Skill), (Boston Farnham Sebastopol, Location), (Tokyo, Location), (Beijing, Location), (Boston Farnham, Location), (Farnham Sebastopol, Location), (Tokyo, Location), (Beijing, Location)]","Martin Kleppmann Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable SSN: 100-64-2089 Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol TPhone: 546-168-6637 okyo Beijing","[{'Entity': 'Martin Kleppmann', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': '100-64-2089', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': 'Farnham Sebastopol', 'Category': 'Address', 'Confidence': 0.8}, {'Entity': 'Farnham Sebastopol', 'Category': 'Address', 'Confidence': 0.78}, {'Entity': '546-168-6637', 'Category': 'PhoneNumber', 'Confidence': 0.8}, {'Entity': 'okyo', 'Category': 'Address', 'Confidence': 0.62}]","**************** Designing Data-Intensive Applications The Big Ideas Behind Reliable, Scalable, and Maintainable SSN: *********** Systems Boston ****************** T**** Beijing Boston ****************** TPhone: ************ **** Beijing"
2,4,"978-1-449-37332-0 LSI Designing Data-Intensive Applications by Martin Kleppmann Copyright 2017 Martin Kleppmann. All rights reserved. Printed in the United States of America. Published by OReilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://oreilly.com/safari). For more information, contact our corporate/insti tutional sales department: 800-998-9938 or corporate@oreilly.com. Editors: Ann Spencer and Marie Beaugureau Indexer: Ellen Troutman-Zaig Production Editor: Kristen Brown Interior Designer: David Futato Copyeditor: Rachel Head Cover Designer: Karen Montgomery Proofreader: Amanda Kersey Illustrator: Rebecca Demarest March 2017:",805,144,250,313,"[LSI Designing Data-Intensive Applications, Ellen Troutman-Zaig Production Editor, Kristen Brown Interior Designer, Rachel Head Cover Designer, corporate/insti tutional sales department, sales promotional use, 1005 Gravenstein Highway North, Marie Beaugureau Indexer, David Futato Copyeditor, Karen Montgomery Proofreader, Amanda Kersey Illustrator, Martin Kleppmann, United States, OReilly Media, OReilly books, educational, business, Online editions, most titles, Ann Spencer, Rebecca Demarest, rights, America, Inc., Sebastopol, safari, information, Editors]","[(978-1-449, PhoneNumber), (37332, Quantity), (0, Quantity), (Designing Data-Intensive Applications, Product), (Martin Kleppmann, Person), (2017, Quantity), (Martin Kleppmann, Person), (United States of America, Location), (OReilly Media, Inc., Organization), (1005 Gravenstein Highway North, Sebastopol, CA 95472, Address), (OReilly books, Product), (educational, Skill), (business, Skill), (sales, Skill), (Online, Skill), (http://oreilly.com/safari)., URL), (sales, Skill), (800-998-9938, PhoneNumber), (corporate@oreilly.com, Email), (Editors, PersonType), (Ann Spencer, Person), (Marie Beaugureau, Person), (Ellen, Person), (-Zaig, Organization), (Kristen Brown, Person), (Interior Designer, PersonType), (David Futato, Person), (Copyeditor, PersonType), (Rachel Head, Person), (Cover Designer, PersonType), (Karen Montgomery, Person), (Proofreader, PersonType), (Amanda Kersey, Person), (Illustrator, PersonType), (Rebecca Demarest, Person), (March 2017:, DateTime)]","978-1-449-37332-0 LSI Designing Data-Intensive Applications by Martin Kleppmann Copyright 2017 Martin Kleppmann. All rights reserved. Printed in the United States of America. Published by OReilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://oreilly.com/safari). For more information, contact our corporate/insti tutional sales department: 800-998-9938 or corporate@oreilly.com. Editors: Ann Spencer and Marie Beaugureau Indexer: Ellen Troutman-Zaig Production Editor: Kristen Brown Interior Designer: David Futato Copyeditor: SSN: 808-97-9604 Rachel Head CPhone: 922-442-5848 over Designer: Karen Montgomery Proofreader: Amanda Kersey Illustrator: Rebecca Demarest March 2017:","[{'Entity': '978-1-449', 'Category': 'PhoneNumber', 'Confidence': 0.8}, {'Entity': 'Martin Kleppmann', 'Category': 'Person', 'Confidence': 0.98}, {'Entity': 'Martin Kleppmann', 'Category': 'Person', 'Confidence': 0.92}, {'Entity': 'OReilly Media, Inc.', 'Category': 'Organization', 'Confidence': 1.0}, {'Entity': '1005 Gravenstein Highway North, Sebastopol, CA 95472', 'Category': 'Address', 'Confidence': 1.0}, {'Entity': 'http://oreilly.com/safari).', 'Category': 'URL', 'Confidence': 0.8}, {'Entity': '800-998-9938', 'Category': 'PhoneNumber', 'Confidence': 0.8}, {'Entity': 'corporate@oreilly.com', 'Category': 'Email', 'Confidence': 0.8}, {'Entity': 'Editors', 'Category': 'PersonType', 'Confidence': 0.8}, {'Entity': 'Ann Spencer', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': 'Marie Beaugureau', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': 'Troutman-Zaig', 'Category': 'Organization', 'Confidence': 0.87}, {'Entity': 'Editor', 'Category': 'PersonType', 'Confidence': 0.46}, {'Entity': 'Kristen Brown', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': 'Interior Designer', 'Category': 'PersonType', 'Confidence': 0.48}, {'Entity': 'David Futato', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': 'Copyeditor', 'Category': 'PersonType', 'Confidence': 0.77}, {'Entity': '808-97-9604', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': 'Rachel Head', 'Category': 'Person', 'Confidence': 0.98}, {'Entity': '922-442-5848', 'Category': 'PhoneNumber', 'Confidence': 0.8}, {'Entity': 'Designer', 'Category': 'PersonType', 'Confidence': 0.74}, {'Entity': 'Karen Montgomery', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': 'Proofreader', 'Category': 'PersonType', 'Confidence': 0.85}, {'Entity': 'Amanda Kersey', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': 'Illustrator', 'Category': 'PersonType', 'Confidence': 0.94}, {'Entity': 'Rebecca Demarest', 'Category': 'Person', 'Confidence': 1.0}]","*********-37332-0 LSI Designing Data-Intensive Applications by **************** Copyright 2017 ****************. All rights reserved. Printed in the United States of America. Published by *******************, ****************************************************. OReilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (*************************** For more information, contact our corporate/insti tutional sales department: ************ or *********************. *******: *********** and **************** Indexer: Ellen ************* Production Editor: ************* Interior ********: ************ **********: SSN: *********** *********** CPhone: ************ over ********: **************** ***********: ************* ***********: **************** March 2017:"
3,4,"First Edition Revision History for the First Edition 2017-03-01: First Release See http://oreilly.com/catalog/errata.csp?isbn=9781449373320 for release details. The OReilly logo is a registered trademark of OReilly Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of OReilly Media, Inc. While the publisher and the author have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the publisher and the author disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such licenses and/or rights.",1022,169,250,313,"[First Edition Revision History, related trade dress, good faith efforts, The OReilly logo, open source licenses, intellectual property rights, First Release, release details, registered trademark, OReilly Media, Data-Intensive Applications, cover image, code samples, other technology, catalog, errata, trademarks, publisher, author, information, instructions, work, responsibility, errors, omissions, limitation, damages, use, reliance, risk, others]","[(First, Quantity), (First, Quantity), (2017-03-01, DateTime), (First, Quantity), (http://oreilly.com/catalog/errata.csp?isbn=9781449373320, URL), (OReilly, Organization), (OReilly Media, Inc., Organization), (Designing, Skill), (OReilly Media, Inc., Organization), (publisher, PersonType), (author, PersonType), (publisher, PersonType), (author, PersonType), (code samples, Skill), (technology, Skill), (open source, Skill)]","First Edition Revision History for the First Edition 2017-03-01: First Release See http://oreilly.com/catalog/errata.csp?isbn=9781449373320 for release details. The OReilly logo is a SSN: 385-53-9999 registered trademark of OReilly Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of OReilly Media, Inc. While the publisher and the author have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the publisher and the author disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such lPhone: 999-777-5531 icenses and/or rights.","[{'Entity': '2017-03-01', 'Category': 'DateTime', 'Confidence': 1.0}, {'Entity': 'http://oreilly.com/catalog/errata.csp?isbn=9781449373320', 'Category': 'URL', 'Confidence': 0.8}, {'Entity': 'OReilly', 'Category': 'Organization', 'Confidence': 0.99}, {'Entity': '385-53-9999', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': 'OReilly Media, Inc.', 'Category': 'Organization', 'Confidence': 0.98}, {'Entity': 'OReilly Media, Inc.', 'Category': 'Organization', 'Confidence': 0.93}, {'Entity': 'publisher', 'Category': 'PersonType', 'Confidence': 0.95}, {'Entity': 'author', 'Category': 'PersonType', 'Confidence': 0.98}, {'Entity': 'publisher', 'Category': 'PersonType', 'Confidence': 0.93}, {'Entity': 'author', 'Category': 'PersonType', 'Confidence': 0.97}, {'Entity': '999-777-5531', 'Category': 'PhoneNumber', 'Confidence': 0.8}]","First Edition Revision History for the First Edition **********: First Release See ******************************************************** for release details. The ******* logo is a SSN: *********** registered trademark of ******* Media, Inc. Designing Data-Intensive Applications, the cover image, and related trade dress are trademarks of ******* Media, Inc. While the ********* and the ****** have used good faith efforts to ensure that the information and instructions contained in this work are accurate, the ********* and the ****** disclaim all responsibility for errors or omissions, including without limitation responsibility for damages resulting from the use of or reliance on this work. Use of the information and instructions contained in this work is at your own risk. If any code samples or other technology this work contains or describes is subject to open source licenses or the intellectual property rights of others, it is your responsibility to ensure that your use thereof complies with such lPhone: ************ icenses and/or rights."
4,5,"Technology is a powerful force in our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for good: to make underrepresented peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good.",403,76,64,76,"[unfair power structures, powerful force, human rights, vested interests, Technology, society, Data, software, communication, good, underrepresented, peoples, voices, opportunities, everyone, disasters, book]","[(Technology, Skill), (Data, Skill), (software, Skill), (communication, Skill), (underrepresented peoples, PersonType), (disasters, Event), (book, Product)]","Technology is a powerful force in SSN: 383-61-1112 our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for gPhone: 432-160-6668 ood: to make underrepresented peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good.","[{'Entity': '383-61-1112', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': '432-160-6668', 'Category': 'PhoneNumber', 'Confidence': 0.8}, {'Entity': 'underrepresented', 'Category': 'PersonType', 'Confidence': 0.61}]","Technology is a powerful force in SSN: *********** our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for gPhone: ************ ood: to make **************** peoples voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good."
5,7,"Computing is pop culture. ... Pop culture holds a disdain for history. Pop culture is all about identity and feeling like youre participating. It has nothing to do with cooperation, the past or the futureits living in the present. I think the same is true of most people who write code for money. They have no idea where their culture came from. Alan Kay, in interview with Dr Dobbs Journal (2012)",397,82,71,82,"[Dr Dobbs Journal, most people, Alan Kay, pop culture, Computing, disdain, history, identity, cooperation, past, futureits, present, code, money, idea, interview]","[(Computing, Skill), (Pop, Skill), (cooperation, Skill), (people, PersonType), (Alan Kay, Person), (interview, Event), (Dr Dobbs Journal, Organization), ((2012), DateTime)]","Computing is pop culture. ... Pop culture holds a disdain for history. SSN: 116-19-8428 Pop culture is all about identity and feeling like youre participating. It has nothing to do with cPhone: 774-878-6012 ooperation, the past or the futureits living in the present. I think the same is true of most people who write code for money. They have no idea where their culture came from. Alan Kay, in interview with Dr Dobbs Journal (2012)","[{'Entity': '116-19-8428', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': '774-878-6012', 'Category': 'PhoneNumber', 'Confidence': 0.8}, {'Entity': 'futureits', 'Category': 'PersonType', 'Confidence': 0.5}, {'Entity': 'people', 'Category': 'PersonType', 'Confidence': 0.91}, {'Entity': 'Alan Kay', 'Category': 'Person', 'Confidence': 1.0}, {'Entity': 'Dr Dobbs Journal', 'Category': 'Organization', 'Confidence': 0.77}]","Computing is pop culture. ... Pop culture holds a disdain for history. SSN: *********** Pop culture is all about identity and feeling like youre participating. It has nothing to do with cPhone: ************ ooperation, the past or the futureits living in the present. I think the same is true of most ****** who write code for money. They have no idea where their culture came from. ********, in interview with **************** (2012)"
6,9,"Table of Contents Preface. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xiii Part I. Foundations of Data Systems 1. Reliable, Scalable, and Maintainable Applications. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 Thinking About Data Systems 4 Reliability 6 Hardware Faults 7 Software Errors 8 Human Errors 9 How Important Is Reliability? 10 Scalability 10 Describing Load 11 Describing Performance 13 Approaches for Coping with Load 17 Maintainability 18 Operability: Making Life Easy for Operations 19 Simplicity: Managing Complexity 20 Evolvability: Making Change Easy 21",682,179,248,270,"[Contents Preface, Part I., Data Systems, Maintainable Applications, Hardware Faults, 7 Software Errors, Human Errors, Managing Complexity, Table, xiii, Foundations, Reliable, Scalable, Reliability, 10 Scalability, Load, Performance, 13 Approaches, 17 Maintainability, Operability, Life, Operations, Simplicity, Evolvability, Change]","[(1, Quantity), (Reliable, Skill), (Scalable, Skill), (Maintainable Applications., Skill), (3, Quantity), (Data Systems, Skill), (4, Quantity), (Reliability, Skill), (6, Quantity), (Hardware, Skill), (7, Quantity), (Software, Skill), (8, Quantity), (9, Quantity), (Reliability, Skill), (10, Quantity), (10, Quantity), (11, Quantity), (13, Quantity), (17, Quantity), (Maintainability, Skill), (18, Quantity), (19, Quantity), (20, Quantity), (21, Quantity)]","Table of Contents Preface. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . SSN: 417-80-2948 . . . . . . . . . . . . . . . . . . .Phone: 352-877-1727 . . . . . . . . . . . . . . . . . . xiii Part I. Foundations of Data Systems 1. Reliable, Scalable, and Maintainable Applications. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 Thinking About Data Systems 4 Reliability 6 Hardware Faults 7 Software Errors 8 Human Errors 9 How Important Is Reliability? 10 Scalability 10 Describing Load 11 Describing Performance 13 Approaches for Coping with Load 17 Maintainability 18 Operability: Making Life Easy for Operations 19 Simplicity: Managing Complexity 20 Evolvability: Making Change Easy 21","[{'Entity': '417-80-2948', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': '352-877-1727', 'Category': 'PhoneNumber', 'Confidence': 0.8}]","Table of Contents Preface. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . SSN: *********** . . . . . . . . . . . . . . . . . . .Phone: ************ . . . . . . . . . . . . . . . . . . xiii Part I. Foundations of Data Systems 1. Reliable, Scalable, and Maintainable Applications. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 Thinking About Data Systems 4 Reliability 6 Hardware Faults 7 Software Errors 8 Human Errors 9 How Important Is Reliability? 10 Scalability 10 Describing Load 11 Describing Performance 13 Approaches for Coping with Load 17 Maintainability 18 Operability: Making Life Easy for Operations 19 Simplicity: Managing Complexity 20 Evolvability: Making Change Easy 21"
7,9,Summary 22 2. Data Models and Query Languages. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 27 Relational Model Versus Document Model 28 The Birth of NoSQL 29 The Object-Relational Mismatch 29 Many-to-One and Many-to-Many Relationships 33 Are Document Databases Repeating History? 36 vii,326,91,248,270,"[27 Relational Model Versus, Document Model, Data Models, Query Languages, Object-Relational Mismatch, Many Relationships, Document Databases, Summary, Birth, NoSQL, One, History, 36 vii]","[(22, Quantity), (2, Quantity), (27, Quantity), (28, Quantity), (NoSQL, Skill), (29, Quantity), (29, Quantity), (One, Quantity), (33, Quantity), (36, Quantity)]",Summary 22 2. Data Models and Query Languages. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 27 Relational Model Versus Document Model 28 The Birth of NoSQL 29 SSN: 667-90-6165 The Object-Relational Mismatch 29 Many-to-One and Many-to-Many Relationships 33 Are Document Databases RPhone: 771-106-1949 epeating History? 36 vii,"[{'Entity': '667-90-6165', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': '771-106-1949', 'Category': 'PhoneNumber', 'Confidence': 0.8}]",Summary 22 2. Data Models and Query Languages. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 27 Relational Model Versus Document Model 28 The Birth of NoSQL 29 SSN: *********** The Object-Relational Mismatch 29 Many-to-One and Many-to-Many Relationships 33 Are Document Databases RPhone: ************ epeating History? 36 vii
8,10,Relational Versus Document Databases Today 38 Query Languages for Data 42 Declarative Queries on the Web 44 MapReduce Querying 46 Graph-Like Data Models 49 Property Graphs 50 The Cypher Query Language 52 Graph Queries in SQL 53 Triple-Stores and SPARQL 55 The Foundation: Datalog 60 Summary 63 3. Storage and Retrieval. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69 Data Structures That Power Your Database 70 Hash Indexes 72 SSTables and LSM-Trees 76 B-Trees 79 Comparing B-Trees and LSM-Trees 83 Other Indexing Structures 85 Transaction Processing or Analytics? 90 Data Warehousing 91 Stars and Snowflakes: Schemas for Analytics 93 Column-Oriented Storage 95 Column Compression 97 Sort Order in Column Storage 99 Writing to Column-Oriented Storage 101 Aggregation: Data Cubes and Materialized Views 101 Summary 103 4. Encoding and Evolution. .,908,200,277,308,"[Relational Versus Document Databases, The Cypher Query Language, Other Indexing Structures, 46 Graph-Like Data Models, Column-Oriented Storage 101 Aggregation, 38 Query Languages, The Foundation, 69 Data Structures, Declarative Queries, 44 MapReduce Querying, 49 Property Graphs, 52 Graph Queries, 70 Hash Indexes, Transaction Processing, 90 Data Warehousing, Column Compression, Sort Order, Column Storage, Data Cubes, Materialized Views, Web, SQL, Triple-Stores, SPARQL, Datalog, Summary, Retrieval, 72 SSTables, LSM-Trees, B-Trees, Analytics, 91 Stars, Snowflakes, Schemas, Encoding, Evolution]","[(Databases, Skill), (Today, DateTime), (38, Quantity), (42, Quantity), (44, Quantity), (46, Quantity), (49, Quantity), (50, Quantity), (52, Quantity), (SQL, Skill), (53, Quantity), (55, Quantity), (60, Quantity), (63, Quantity), (3, Quantity), (Retrieval, Skill), (69, Quantity), (Database, Skill), (70, Quantity), (Hash Indexes, Skill), (72, Quantity), (76 B, Quantity), (79, Quantity), (83, Quantity), (85, Quantity), (Transaction Processing, Skill), (Analytics, Skill), (90, Quantity), (91, Quantity), (93, Quantity), (95, Quantity), (97, Quantity), (99, Quantity), (101, Quantity), (101, Quantity), (103, Quantity), (4, Quantity), (Encoding, Skill), (Evolution, Skill)]",Relational Versus Document Databases Today 38 Query Languages for Data 42 Declarative Queries on the Web 44 MapReduce Querying 46 Graph-Like Data Models 49 Property Graphs 50 The Cypher Query Language 52 Graph Queries in SQL 53 Triple-Stores and SPARQL 55 The Foundation: Datalog 60 Summary 63 3. Storage and Retrieval. . SSN: 420-21-7683 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69 Data Structures That Power Your Database 70 Hash Indexes 72 SSTables and LSM-Trees 76 B-Trees 79 Comparing B-Trees and LSM-Trees 83 Other Indexing Structures 85 Transaction Processing or Analytics? 90 Data Warehousing 91 Stars and Snowflakes: Schemas fPhone: 842-504-7544 or Analytics 93 Column-Oriented Storage 95 Column Compression 97 Sort Order in Column Storage 99 Writing to Column-Oriented Storage 101 Aggregation: Data Cubes and Materialized Views 101 Summary 103 4. Encoding and Evolution. .,"[{'Entity': 'Today', 'Category': 'DateTime', 'Confidence': 0.99}, {'Entity': '420-21-7683', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': '842-504-7544', 'Category': 'PhoneNumber', 'Confidence': 0.8}]",Relational Versus Document Databases ***** 38 Query Languages for Data 42 Declarative Queries on the Web 44 MapReduce Querying 46 Graph-Like Data Models 49 Property Graphs 50 The Cypher Query Language 52 Graph Queries in SQL 53 Triple-Stores and SPARQL 55 The Foundation: Datalog 60 Summary 63 3. Storage and Retrieval. . SSN: *********** . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69 Data Structures That Power Your Database 70 Hash Indexes 72 SSTables and LSM-Trees 76 B-Trees 79 Comparing B-Trees and LSM-Trees 83 Other Indexing Structures 85 Transaction Processing or Analytics? 90 Data Warehousing 91 Stars and Snowflakes: Schemas fPhone: ************ or Analytics 93 Column-Oriented Storage 95 Column Compression 97 Sort Order in Column Storage 99 Writing to Column-Oriented Storage 101 Aggregation: Data Cubes and Materialized Views 101 Summary 103 4. Encoding and Evolution. .
9,10,". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 111 Formats for Encoding Data 112 Language-Specific Formats 113 JSON, XML, and Binary Variants 114 Thrift and Protocol Buffers 117 Avro 122 The Merits of Schemas 127 Modes of Dataflow 128 Dataflow Through Databases 129 Dataflow Through Services: REST and RPC 131 Message-Passing Dataflow 136 Summary 139 viii | Table of Contents",426,108,277,308,"[Data 112 Language-Specific Formats, Binary Variants, Protocol Buffers, Message-Passing Dataflow, 111 Formats, 129 Dataflow, JSON, XML, Thrift, Avro, Merits, Schemas, 127 Modes, Databases, Services, REST, RPC, Summary, Table, Contents]","[(111, Quantity), (112, Quantity), (113, Quantity), (JSON, Skill), (XML, Skill), (114, Quantity), (117, Quantity), (122, Quantity), (Merits, Skill), (Schemas, Skill), (127, Quantity), (Dataflow, Skill), (128, Quantity), (Dataflow, Skill), (129, Quantity), (Dataflow, Skill), (Services, Skill), (RPC, Skill), (131, Quantity), (Message-Passing Dataflow, Skill), (136, Quantity), (139, Quantity)]",". . . . . . . . . . . . . . . . . . . . . SSN: 344-49-7440 . . . . . . . . . . . . . . . . . . . . . . . . . . . . 111 Formats for Encoding Data 112 Language-Specific FPhone: 946-540-5872 ormats 113 JSON, XML, and Binary Variants 114 Thrift and Protocol Buffers 117 Avro 122 The Merits of Schemas 127 Modes of Dataflow 128 Dataflow Through Databases 129 Dataflow Through Services: REST and RPC 131 Message-Passing Dataflow 136 Summary 139 viii | Table of Contents","[{'Entity': '344-49-7440', 'Category': 'USSocialSecurityNumber', 'Confidence': 0.85}, {'Entity': '946-540-5872', 'Category': 'PhoneNumber', 'Confidence': 0.8}]",". . . . . . . . . . . . . . . . . . . . . SSN: *********** . . . . . . . . . . . . . . . . . . . . . . . . . . . . 111 Formats for Encoding Data 112 Language-Specific FPhone: ************ ormats 113 JSON, XML, and Binary Variants 114 Thrift and Protocol Buffers 117 Avro 122 The Merits of Schemas 127 Modes of Dataflow 128 Dataflow Through Databases 129 Dataflow Through Services: REST and RPC 131 Message-Passing Dataflow 136 Summary 139 viii | Table of Contents"


# Save final dataframe to csv

In [74]:
df.to_csv('AzureAINLPDemo.csv',index=False)