# Tutorial Initialization

NOTICE: All information contained herein is, and remains the property of SumUp Analytics Inc. and its suppliers, if any. The intellectual and technical concepts contained herein are proprietary to SumUp Analytics Inc. and its suppliers and may be covered by U.S. and Foreign Patents, patents in process, and are protected by trade secret or copyright law.

Dissemination of this information or reproduction of this material is strictly forbidden unless prior written permission is obtained from SumUp Analytics Inc.

Copyright (c) 2018-2020 SumUp Analytics, Inc. All Rights Reserved.

## Configure API host and key, and create a new API instance

In [None]:
import os
import csv
import json
import datetime
import time
import nucleus_api
from nucleus_api.rest import ApiException
import nucleus_api.api.nucleus_api as nucleus_helper
from pprint import pprint
import numpy as np
from pathlib import Path

# Detect the execution environment: get_ipython is only resolvable when the
# code runs under IPython/Jupyter, so a NameError means plain script mode.
try:
    ip = get_ipython()
except NameError:
    running_notebook = False
else:
    running_notebook = True

print('Running example in Jupyter Notebook' if running_notebook
      else 'Running example in script mode')

# Client configuration. Both placeholder values below must be replaced with
# the real API server hostname and API key before any call can succeed.
configuration = nucleus_api.Configuration()
configuration.host = 'UPDATE-WITH-API-SERVER-HOSTNAME'
configuration.api_key['x-api-key'] = 'UPDATE-WITH-API-KEY'

# Single API instance shared by every cell of the tutorial
api_instance = nucleus_api.NucleusApi(nucleus_api.ApiClient(configuration))

# Dataset Management

In this section, we walk you through our dataset ingestion and management APIs. You will learn how to:
- Create a new dataset from different origin locations
- Customize the metadata you want to include in a dataset
- Append documents to an existing dataset
- Delete specific documents or entire datasets

## Append specific file from local drive

In [None]:
# Upload a single local file into the target dataset.
dataset = "dataset_test"
file = 'quarles20181109a.pdf'
# Optional json containing additional document metadata
metadata = {
    "time": "1/2/2018",
    "author": "Test Author",
}

try:
    api_response = api_instance.post_upload_file(file, dataset, metadata=metadata)
except ApiException as e:
    # The error body is parsed as JSON and its 'message' field is reported
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    fp = api_response.result
    print(fp.filename, '(', fp.size, 'bytes) has been added to dataset', dataset)

## Append all PDFs from a folder to a dataset using parallel injection

In [None]:
folder = 'fomc-minutes'   # Local folder walked recursively for PDF files
dataset = 'dataset_test'  # str | Destination dataset where the files will be inserted.

# Build the file iterable. Each item in the iterable is in the format below:
# {'filename': filename,   # filename to be uploaded. REQUIRED
#  'metadata': {           # metadata for the file. Optional
#      'key1': val1,       # keys can have arbitrary names as long as the names only
#      'key2': val2        # contain alphanumeric (0-9|a-z|A-Z) and underscore (_)
#   }
# }

file_iter = []
for root, _dirs, files in os.walk(folder):
    for file in files:
        # Compare the extension case-insensitively so files named '*.PDF'
        # are not silently skipped.
        if Path(file).suffix.lower() != '.pdf':
            continue
        file_iter.append({'filename': os.path.join(root, file),
                          'metadata': {'field1': 'financial'}})

# Upload every collected file through the SDK helper, then report each result
file_props = nucleus_helper.upload_files(api_instance, dataset, file_iter, processes=1)
for fp in file_props:
    print(fp.filename, '(', fp.size, 'bytes) has been added to dataset', dataset)

## Append a file from a URL to a dataset

In [None]:
# Append one remote document, referenced by URL, to the dataset.
dataset = 'dataset_test'
file_url = 'https://s3-us-west-2.amazonaws.com/sumup-public/nucleus-sdk/quarles20181109a.docx'

# Optional filename saved on the server for the URL. If not specified,
# Nucleus will make an intelligent guess from the file URL.
# NOTE(review): the URL ends in .docx while the saved name ends in .pdf —
# confirm this extension mismatch is intended.
filename = 'quarles20181109a-newname.pdf'

payload = nucleus_api.UploadURLModel(
    dataset=dataset,
    file_url=file_url,
    filename=filename,
)

try:
    api_response = api_instance.post_upload_url(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    url_prop = api_response.result
    print(url_prop.file_url, '(', url_prop.size, ' bytes) has been added to dataset', dataset)

## Append multiple URLs to a dataset using parallel injection

In [None]:
# Append several remote documents to the dataset with one helper call.
dataset = 'dataset_test'
file_urls = [
    'https://s3-us-west-2.amazonaws.com/sumup-public/nucleus-sdk/quarles20181109a.docx',
    'https://s3-us-west-2.amazonaws.com/sumup-public/nucleus-sdk/quarles20181109b.docx',
    'https://s3-us-west-2.amazonaws.com/sumup-public/nucleus-sdk/quarles20181109c.docx',
    'https://s3-us-west-2.amazonaws.com/sumup-public/nucleus-sdk/quarles20181109d.docx',
]

url_props = nucleus_helper.upload_urls(api_instance, dataset, file_urls, processes=1)

# Report each URL for which the helper returned a result
for up in url_props:
    print(up.file_url, '(', up.size, ' bytes) has been added to dataset', dataset)

## Append a JSON

In [None]:
# Append a single JSON record to the dataset.
dataset = 'dataset_test'

# "title", "time" and "content" are mandatory fields of the JSON record.
# Any additional custom field is saved as metadata for the document.
document = {
    "title": "This a test json title field",
    "time": "2019-01-01",
    "content": "This is a test json content field",
}

payload = nucleus_api.Appendjsonparams(dataset=dataset, document=document)

try:
    api_response = api_instance.post_append_json_to_dataset(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    print(api_response.result)

The JSON record must have "title", "time", and "content" fields.

Users can add custom fields to the JSON record and all the information will be saved as metadata for the dataset.

This metadata can subsequently be used in the analytics APIs to apply custom selections of documents in your dataset.

## Append JSONs from CSV file using parallel injection

In [None]:
# Append every row of a CSV file to the dataset as a JSON record.
csv_file = 'trump-tweets-100.csv'
dataset = 'trump_tweets'

with open(csv_file, encoding='utf-8-sig') as csvfile:
    # Each CSV row is handed to the upload helper as a dict keyed by column name
    reader = csv.DictReader(csvfile)
    json_props = nucleus_helper.upload_jsons(api_instance, dataset, reader, processes=1)

    # Consume the results while the file is still open, then tally them
    sizes = [jp.size for jp in json_props]
    total_jsons = len(sizes)
    total_size = sum(sizes)

    print(total_jsons, 'JSON records (', total_size, 'bytes) appended to', dataset)

The CSV file must have "title", "time", and "content" columns.

Users can add any column to their CSV file and all the information will be saved as metadata for the dataset.

This metadata can subsequently be used in the analytics APIs to apply custom selections of documents in your dataset.

## Create a dataset using embedded datafeeds:

The Nucleus platform makes available a collection of datafeeds in read-only mode to users.
- Central Banks: content in native language and English official translation where needed, grouped by content category, covering 14 Federal banks and all US Regional banks
- SEC filings: 10Ks, 10Qs, 8Ks, 6Ks, 20Fs and S1s, including revised /A files for all companies filing with the SEC
- News Media RSS: 200 English RSS feeds covering the fields of AI, Finance, Economics, News, Crypto, Culture

### Central Banks

In [None]:
# Read-only central bank feed; the language suffix of the dataset name
# selects the feed language (here: chinese).
dataset_central_bank = 'sumup/central_banks_chinese'
# Custom selection on the feed: one bank and a set of document categories
metadata_selection_central_bank = {'bank': 'people_bank_of_china', 
                                   'document_category': ('speech', 'press release', 'publication')}

Connect to these feeds by language using the following naming structure for the dataset name: 'sumup/central_banks_LANGUAGE'

with LANGUAGE in {english, chinese, japanese, german, portuguese, spanish, russian, french, italian}

You can then define a custom metadata selection on this feed by specifying a set of banks and a set of document categories
- document_category in {speech, press release, publication, formal research}
- bank in {federal_reserve, bank_of_canada, banco_de_mexico, bank_of_brazil, ecb, bank_of_england, bundesbank, bank_of_france, bank_of_italy, bank_of_spain, russian_fed, people_bank_of_china, bank_of_japan, bank_of_australia, atlanta_fed, boston_fed, chicago_fed, cleveland_fed, dallas_fed, kansas_city_fed, minneapolis_fed, new_york_fed, philadelphia_fed, richmond_fed, san_francisco_fed, st_louis_fed}

When passing these parameters to any of the analytics APIs, you can also specify a time period selection using either of:
- The time_period input argument
- The period_start and period_end input arguments

Examples of such calls are detailed in this tutorial, within the sections discussing analytics APIs

### News RSS

In [None]:
# Read-only news RSS feed; the suffix of the dataset name selects the field (here: ai)
dataset = 'sumup/rss_feed_ai'

Connect to these feeds by field using the following naming structure for the dataset name: 'sumup/rss_feed_FIELD'

with FIELD in {ai, finance, economics, news, crypto, culture}

When passing these parameters to any of the analytics APIs, you can also specify a time period selection using either of:
- The time_period input argument
- The period_start and period_end input arguments

Examples of such calls are detailed in this tutorial, within the sections discussing analytics APIs

### SEC Filings

In [None]:
# List all the companies available in the SEC filings feed.
# All three selectors are left empty for this query.
payload = nucleus_api.EdgarFields(
    tickers=[],
    filing_types=[],
    sections=[],
)

try:
    api_response = api_instance.post_available_sec_filings(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True
    print('SEC filings selected:')
    print('    Company count:', len(api_response.result.tickers))
    print('    Date range:', api_response.result.date_range)

In [None]:
# List the filing types available for one company.
payload = nucleus_api.EdgarFields(
    tickers=["IBM"],   # Select IBM company
    filing_types=[],
    sections=[],
)

try:
    api_response = api_instance.post_available_sec_filings(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True
    print('SEC filings for:', api_response.result.tickers)
    print('    Types:', api_response.result.filing_types)
    print('    Count:', api_response.result.count)
    print('    Date ranges:', api_response.result.date_range)

In [None]:
# GET THE LIST OF AVAILABLE SECTIONS IN A GIVEN FILING TYPE FOR A GIVEN COMPANY

payload = nucleus_api.EdgarFields(tickers=["IBM"], # Select IBM company
                                  filing_types=["10-K"], # Get list of sections available in 10-Ks
                                  sections=[])
try:
    # Single assignment (the original repeated `api_response =` twice)
    api_response = api_instance.post_available_sec_filings(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded
if api_ok:
    print('Sections in {} filings for {}'.format(api_response.result.filing_types, api_response.result.tickers))
    for section in api_response.result.sections:
        print('    {}'.format(section))

In [None]:
# Build a dataset from a custom selection of SEC filings:
# one specific section of one filing type for a single ticker.
dataset = "dataset_sec1"

payload = nucleus_api.EdgarQuery(
    destination_dataset=dataset,
    tickers=["BABA"],
    filing_types=["20-F"],
    sections=["Quantitative and Qualitative Disclosures about Market Risk"],
)

try:
    api_response = api_instance.post_create_dataset_from_sec_filings(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True
    print('Dataset {} created successfully from SEC filings'.format(api_response.result['destination_dataset']))

In [None]:
# Build a dataset from a custom selection of SEC filings:
# all 8-K filings of one ticker between period_start and period_end.
dataset = "dataset_sec2"
period_start = "2018-01-01"
period_end = "2019-06-01"

payload = nucleus_api.EdgarQuery(
    destination_dataset=dataset,
    tickers=["NFLX"],
    filing_types=["8-K"],
    sections=[],
    period_start=period_start,
    period_end=period_end,
)

try:
    api_response = api_instance.post_create_dataset_from_sec_filings(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True
    print('Dataset {} created successfully from SEC filings'.format(api_response.result['destination_dataset']))

SEC filings have a more complex structure, therefore Nucleus provides a specific set of APIs to interact with this data and to allow you to create tailored datasets

There are two payloads available to you:
- nucleus_api.EdgarFields, which allows you to explore the content of the SEC filings that match specific requirements using the post_available_sec_filings API
- nucleus_api.EdgarQuery, which allows you to create SEC filings' datasets with specific requirements using the post_create_dataset_from_sec_filings API

These payloads expose 5 optional arguments:
- tickers, which are as-recorded in the EDGAR database
- filing_types, which are to be chosen among {10-K, 10-Q, 8-K, 6-K, 20-F, S-1, 10-K/A, 10-Q/A, 8-K/A, 6-K/A, 20-F/A, S-1/A}
- sections, which are the standardized section names as-recorded in the EDGAR database for each form type
- period_start, the starting date of the period you are interested in
- period_end, the end date of the period you are interested in

## List all the datasets available to a user

In [None]:
# Fetch and print the names of the datasets returned for the current user.
try:
    api_response = api_instance.get_list_datasets()
except ApiException as e:
    print("Exception when calling DatasetsApi->get_list_datasets: %s\n" % e)
else:
    # Only read the response when the call succeeded; after a failure
    # api_response is either undefined or left over from an earlier cell.
    list_datasets = api_response.result

    print(len(list_datasets), 'datasets in the database:')
    for ds in list_datasets:
        print('    ', ds.name)

## Retrieve summary information for a dataset

In [None]:
# Retrieve and print summary information for one dataset.
dataset = 'dataset_sec2'  # str | Dataset name.
query = ''  # str | Fulltext query, using mysql MATCH boolean query format. (optional)
metadata_selection = ''  # str | json object of {\"metadata_field\":[\"selected_values\"]} (optional)
time_period = ''  # str | Time period selection (optional)
# NOTE(review): period_start / period_end are defined here but are not passed
# to DatasetInfo below — confirm whether that is intended.
period_start = ""  # str | Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
period_end = ""  # str | End date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"

try:
    payload = nucleus_api.DatasetInfo(
        dataset=dataset,
        query=query,
        metadata_selection=metadata_selection,
        time_period=time_period,
    )
    api_response = api_instance.post_dataset_info(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    print('Information about dataset', dataset)
    print('    Language:', api_response.result.detected_language)
    print('    Number of documents:', api_response.result.num_documents)
    # The first two time_range entries are converted with fromtimestamp
    print('    Time range:', datetime.datetime.fromtimestamp(float(api_response.result.time_range[0])),
          'to', datetime.datetime.fromtimestamp(float(api_response.result.time_range[1])))

## Delete documents from a dataset

In [None]:
# Delete specific documents, identified by their ids, from a dataset.
dataset = 'dataset_test'

doc_ids = ['1']
payload = nucleus_api.DeleteDocumentModel(dataset=dataset,
                                          doc_ids=doc_ids)
try:
    api_response = api_instance.post_delete_document(payload)
except ApiException as e:
    print("Exception when calling DatasetsApi->post_delete_document: %s\n" % e)
else:
    # Report success only when the call did not raise; previously this
    # message was printed even after a failed deletion.
    print('Document', doc_ids, 'from dataset', dataset, 'deleted.')


## Delete a dataset

In [None]:
# Delete an entire dataset, then verify that it is gone.
dataset = 'dataset_test'
payload = nucleus_api.DeleteDatasetModel(dataset=dataset) # Deletedatasetmodel | 

try:
    api_response = api_instance.post_delete_dataset(payload)
    print(api_response.result['dataset_deleted'])
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])

# List datasets again to check if the specified dataset has been deleted.
# The listing is now printed and checked; previously it was fetched and discarded.
try:
    api_response = api_instance.get_list_datasets()
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    remaining_names = [ds.name for ds in api_response.result]
    print(len(remaining_names), 'datasets in the database:')
    for name in remaining_names:
        print('    ', name)
    if dataset in remaining_names:
        print('WARNING: dataset', dataset, 'is still listed.')
    else:
        print('Dataset', dataset, 'is no longer listed.')

# Topic-Level Analytics

This section goes over all APIs that enable users to identify, extract and analyze topics found in a dataset.
- Topic modeling
- Topic transfer learning for propagation analysis of topics' strength, sentiment and consensus
- Sentiment analysis
- Consensus analysis
- Cross-documents topic summarization
- Historical analysis of topics' strength, sentiment, and consensus
- Authors similarity analysis
- Contrasted topic modeling: identifying the topic that best separates two sub-categories of documents in a corpus

## Extract topics

### All topics

In [None]:
# Extract topics from the whole dataset and print, for each topic, its
# keywords, keyword weights, strength and the documents exposed to it.
dataset = 'trump_tweets'
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = ["real","hillary"] # str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = ""     # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""] (optional)

payload = nucleus_api.Topics(dataset=dataset,
                             query=query,
                             custom_stop_words=custom_stop_words,
                             num_topics=num_topics,
                             metadata_selection=metadata_selection,
                             time_period=time_period)
try:
    api_response = api_instance.post_topic_api(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    # Process the response only when the call succeeded; after a failure
    # api_response is either undefined or left over from an earlier cell.
    doc_ids = api_response.result.doc_ids
    topics = api_response.result.topics
    for i, res in enumerate(topics):
        print('Topic', i, 'keywords:')
        print('    Keywords:', res.keywords)
        keywords_weight_str = ";".join(str(x) for x in res.keywords_weight)
        print('    Keyword weights:', keywords_weight_str)
        print('    Strength:', res.strength)

        doc_topic_exposure_sel = []  # list of non-zero doc_topic_exposures
        doc_id_sel = []        # list of doc ids matching doc_topic_exposure_sel
        # Exposure j is paired with doc_ids[j]
        for j, raw_exposure in enumerate(res.doc_topic_exposures):
            doc_topic_exp = float(raw_exposure)
            if doc_topic_exp != 0:
                doc_topic_exposure_sel.append(doc_topic_exp)
                doc_id_sel.append(doc_ids[j])

        doc_id_sel_str = ' '.join(str(x) for x in doc_id_sel)
        doc_topic_exposure_sel_str = ' '.join(str(x) for x in doc_topic_exposure_sel)
        print('    Document IDs:', doc_id_sel_str)
        print('    Document exposures:', doc_topic_exposure_sel_str)
        print('---------------')

### Topics within time range

In [None]:
# Extract topics restricted to documents between period_start and period_end.
dataset = 'trump_tweets'
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = ["real","hillary"] # str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
period_start = "2016-10-15" # str | Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD"
period_end = "2019-01-01" # str | End date for the period to analyze within the dataset. Format: "YYYY-MM-DD"

payload = nucleus_api.Topics(dataset=dataset,
                             query=query,
                             custom_stop_words=custom_stop_words,
                             num_topics=num_topics,
                             metadata_selection=metadata_selection,
                             period_start=period_start,
                             period_end=period_end)
try:
    api_response = api_instance.post_topic_api(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    # Process the response only when the call succeeded; after a failure
    # api_response is either undefined or left over from an earlier cell.
    doc_ids = api_response.result.doc_ids
    topics = api_response.result.topics
    for i, res in enumerate(topics):
        print('Topic', i, 'keywords:')
        print('    Keywords:', res.keywords)
        keywords_weight_str = ";".join(str(x) for x in res.keywords_weight)
        print('    Keyword weights:', keywords_weight_str)
        print('    Strength:', res.strength)

        doc_topic_exposure_sel = []  # list of non-zero doc_topic_exposure
        doc_id_sel = []        # list of doc ids matching doc_topic_exposure_sel
        # Exposure j is paired with doc_ids[j]
        for j, raw_exposure in enumerate(res.doc_topic_exposures):
            doc_topic_exp = float(raw_exposure)
            if doc_topic_exp != 0:
                doc_topic_exposure_sel.append(doc_topic_exp)
                doc_id_sel.append(doc_ids[j])

        doc_id_sel_str = ' '.join(str(x) for x in doc_id_sel)
        doc_topic_exposure_sel_str = ' '.join(str(x) for x in doc_topic_exposure_sel)
        print('    Document IDs:', doc_id_sel_str)
        print('    Document exposures:', doc_topic_exposure_sel_str)
        print('---------------')

### Topics with metadata selection

In [None]:
# Extract topics restricted to documents matching a metadata selection.
dataset = 'trump_tweets'
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = ["real","hillary"] # str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
metadata_selection = {"author": "D_Trump16"} # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)

payload = nucleus_api.Topics(dataset=dataset,
                             query=query,
                             custom_stop_words=custom_stop_words,
                             num_topics=num_topics,
                             metadata_selection=metadata_selection)
try:
    api_response = api_instance.post_topic_api(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    # Process the response only when the call succeeded; after a failure
    # api_response is either undefined or left over from an earlier cell.
    doc_ids = api_response.result.doc_ids
    topics = api_response.result.topics
    for i, res in enumerate(topics):
        print('Topic', i, 'keywords:')
        print('    Keywords:', res.keywords)
        keywords_weight_str = ";".join(str(x) for x in res.keywords_weight)
        print('    Keyword weights:', keywords_weight_str)
        print('    Strength:', res.strength)

        doc_topic_exposure_sel = []  # list of non-zero doc_topic_exposure
        doc_id_sel = []        # list of doc ids matching doc_topic_exposure_sel
        # Exposure j is paired with doc_ids[j]
        for j, raw_exposure in enumerate(res.doc_topic_exposures):
            doc_topic_exp = float(raw_exposure)
            if doc_topic_exp != 0:
                doc_topic_exposure_sel.append(doc_topic_exp)
                doc_id_sel.append(doc_ids[j])

        doc_id_sel_str = ' '.join(str(x) for x in doc_id_sel)
        doc_topic_exposure_sel_str = ' '.join(str(x) for x in doc_topic_exposure_sel)
        print('    Document IDs:', doc_id_sel_str)
        print('    Document exposures:', doc_topic_exposure_sel_str)
        print('---------------')

### Topics without removing redundant content

In [None]:
# Extract topics with remove_redundancies disabled.
dataset = 'trump_tweets'
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = ["real","hillary"] # str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

payload = nucleus_api.Topics(dataset=dataset,
                             query=query,
                             custom_stop_words=custom_stop_words,
                             num_topics=num_topics,
                             metadata_selection=metadata_selection,
                             remove_redundancies=remove_redundancies)
try:
    api_response = api_instance.post_topic_api(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    # Process the response only when the call succeeded; after a failure
    # api_response is either undefined or left over from an earlier cell.
    doc_ids = api_response.result.doc_ids
    topics = api_response.result.topics
    for i, res in enumerate(topics):
        print('Topic', i, 'keywords:')
        print('    Keywords:', res.keywords)
        keywords_weight_str = ";".join(str(x) for x in res.keywords_weight)
        print('    Keyword weights:', keywords_weight_str)
        print('    Strength:', res.strength)

        doc_topic_exposure_sel = []  # list of non-zero doc_topic_exposure
        doc_id_sel = []        # list of doc ids matching doc_topic_exposure_sel
        # Exposure j is paired with doc_ids[j]
        for j, raw_exposure in enumerate(res.doc_topic_exposures):
            doc_topic_exp = float(raw_exposure)
            if doc_topic_exp != 0:
                doc_topic_exposure_sel.append(doc_topic_exp)
                doc_id_sel.append(doc_ids[j])

        doc_id_sel_str = ' '.join(str(x) for x in doc_id_sel)
        doc_topic_exposure_sel_str = ' '.join(str(x) for x in doc_topic_exposure_sel)
        print('    Document IDs:', doc_id_sel_str)
        print('    Document exposures:', doc_topic_exposure_sel_str)
        print('---------------')

## Generate summary for each topic

In [None]:
# Generate and print a summary for each topic extracted from the dataset.
dataset = 'trump_tweets'
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = ["real", "hillary"]  # str | List of stop words. (optional)
num_topics = 8  # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8  # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
summary_length = 6  # int | The maximum number of bullet points a user wants to see in each topic summary. (optional) (default to 6)
context_amount = 0  # int | The number of sentences surrounding key summary sentences in the documents that they come from. (optional) (default to 0)
num_docs = 20  # int | The maximum number of key documents to use for summarization. (optional) (default to 20)
# NOTE(review): excluded_docs, remove_redundancies, time_period, period_start
# and period_end are defined below but not passed to TopicSummaryModel —
# confirm whether that is intended.
excluded_docs = ''  # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
remove_redundancies = False  # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

metadata_selection = ""  # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = ""  # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""]  (optional)
period_start = ""  # str | Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
period_end = ""  # str | End date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
api_response = None

try:
    payload = nucleus_api.TopicSummaryModel(
        dataset=dataset,
        query=query,
        custom_stop_words=custom_stop_words,
        num_topics=num_topics,
        num_keywords=num_keywords,
        metadata_selection=metadata_selection,
        summary_length=summary_length,
        context_amount=context_amount,
        num_docs=num_docs,
    )
    api_response = api_instance.post_topic_summary_api(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

if api_ok:
    for i, res in enumerate(api_response.result):
        print('Topic', i, 'summary:')
        print('    Keywords:', res.keywords)
        # Each summary entry is printed whole first, then field by field
        for entry in res.summary:
            print(entry)
            print('    Document ID:', entry.doc_id)
            print('        Title:', entry.title)
            print('        Sentences:', entry.sentences)
            print('        Author:', entry.attribute['author'])
            print('        Time:', datetime.datetime.fromtimestamp(float(entry.attribute['time'])))
        print('---------------')

## Measure sentiment on each topic

In [None]:
# Extract topics from the dataset and print the sentiment measured on each
# topic, followed by per-document IDs, sentiments and topic exposures.
dataset = 'trump_tweets' # str | Dataset name
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = ["real","hillary"] # list of str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
excluded_docs = '' # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
custom_dict_file = {"great": 1.0, "awful": -1.0, "clinton":-1.0, "trump":1.0} # file | Custom sentiment dictionary JSON file. Example, {"field1": value1, ..., "fieldN": valueN} (optional)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

metadata_selection ="" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = ""     # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""] (optional)
period_start = "" # str | Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
period_end = "" # str | End date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"

# NOTE: excluded_docs, remove_redundancies, metadata_selection, time_period,
# period_start and period_end are defined above but are not sent in the
# payload of this example.
try:
    payload = nucleus_api.TopicSentimentModel(dataset=dataset, 
                                              query=query, 
                                              custom_stop_words=custom_stop_words, 
                                              num_topics=num_topics, 
                                              num_keywords=num_keywords,
                                              custom_dict_file=custom_dict_file)
    api_response = api_instance.post_topic_sentiment_api(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    for i, res in enumerate(api_response.result):
        print('Topic', i, 'sentiment:')
        print('    Keywords:', res.keywords)
        print('    Sentiment:', res.sentiment)
        print('    Strength:', res.strength)

        # Flatten the per-document lists into space-separated strings for display
        doc_id_str = ' '.join(str(x) for x in res.doc_ids)
        doc_sentiment_str = ' '.join(str(x) for x in res.doc_sentiments)
        doc_score_str = ' '.join(str(x) for x in res.doc_topic_exposures)
        print('    Document IDs:', doc_id_str)
        print('    Document Sentiments:', doc_sentiment_str)
        print('    Document Exposures:', doc_score_str)
        print('---------------')

## Measure consensus on each topic

In [None]:
# Extract topics from the dataset and print the consensus measured on each topic.
dataset = 'trump_tweets' # str | Dataset name.
query = '' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
custom_stop_words = ["real","hillary"] # list of str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
excluded_docs = [''] # list of str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
custom_dict_file = {"great": 1.0, "awful": -1.0, "clinton":-1.0, "trump":1.0} # file | Custom sentiment dictionary JSON file. Example, {"field1": value1, ..., "fieldN": valueN} (optional)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

metadata_selection ="" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = ""     # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""] (optional)
period_start = "" # str | Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
period_end = "" # str | End date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"

# NOTE: excluded_docs, remove_redundancies, metadata_selection, time_period,
# period_start and period_end are defined above but are not sent in the
# payload of this example.
try:
    payload = nucleus_api.TopicConsensusModel(dataset=dataset, 
                                              query=query, 
                                              custom_stop_words=custom_stop_words, 
                                              num_topics=num_topics, 
                                              num_keywords=num_keywords,
                                              custom_dict_file=custom_dict_file)
    api_response = api_instance.post_topic_consensus_api(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    for i, res in enumerate(api_response.result):
        print('Topic', i, 'consensus:')
        print('    Keywords:', res.keywords)
        print('    Consensus:', res.consensus)
        print('    Strength:', res.strength)
        print('---------------')

## Perform historical analysis of topics' strength, sentiment, and consensus

In [None]:
# Request the historical strength, consensus and sentiment of each topic,
# print the series, and (in a notebook) assemble them for charting.
dataset = 'trump_tweets'   # str | Dataset name.
update_period = 'm' # str | Frequency at which the historical analysis is performed. choices=["d","m","H","M"] (default to d)
query = '' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
custom_stop_words = ["real","hillary"] # list of str | List of stop words (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
inc_step = 1 # int | Number of increments of the update period in between two historical computations. (optional) (default to 1)
excluded_docs = [''] # list of str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
custom_dict_file = {} # file | Custom sentiment dictionary JSON file. Example, {"field1": value1, ..., "fieldN": valueN} (optional)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

metadata_selection ="" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = "12M"     # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""] (optional)
period_start = "" # str | Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
period_end = "" # str | End date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
api_response = None

# NOTE: update_period, inc_step, remove_redundancies, period_start and
# period_end are defined above but are not sent in the payload of this example.
try:
    payload = nucleus_api.TopicHistoryModel(dataset=dataset, 
                                            time_period=time_period, 
                                            query=query, 
                                            custom_stop_words=custom_stop_words, 
                                            num_topics=num_topics, 
                                            num_keywords=num_keywords, 
                                            metadata_selection=metadata_selection, 
                                            excluded_docs=excluded_docs,
                                            custom_dict_file=custom_dict_file)
    api_response = api_instance.post_topic_historical_analysis_api(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    print(e)
    api_ok = False

# Only read the response when the call succeeded; on failure api_response is
# still None and accessing .result on it would raise AttributeError.
if api_ok:
    print('Printing historical metrics data...')
    print('NOTE: historical metrics data can be plotted when running the example in Jupyter Notebook')

    for i, res in enumerate(api_response.result):
        print('Topic', i, res.keywords)
        print('    Timestamps:', res.time_stamps)
        print('    Strengths:', res.strengths)
        print('    Consensuses:', res.consensuses)
        print('    Sentiments:', res.sentiments)
        print('----------------')

    # chart the historical metrics when running in Jupyter Notebook
    if running_notebook:
        print('Plotting historical metrics data...')
        # construct a list of historical metrics dictionaries for charting
        historical_metrics = [
            {
                'topic'    : res.keywords,
                'time_stamps' : np.array(res.time_stamps),
                'strength' : np.array(res.strengths, dtype=np.float32),
                'consensus': np.array(res.consensuses, dtype=np.float32),
                'sentiment': np.array(res.sentiments, dtype=np.float32),
            }
            for res in api_response.result
        ]

        selected_topics = range(len(historical_metrics))
        #nucleus_helper.topic_charts_historical(historical_metrics, selected_topics, True)

## Determine network of authors similar to a chosen contributor

In [None]:
# Query the authors connected to a target author and print the mainstream
# and niche connections returned by the API.
dataset = 'trump_tweets' # str | Dataset name.
target_author = 'D_Trump16' # str | Name of the author to be analyzed.
query = '' # str | Fulltext query, using mysql MATCH boolean query format. Subject covered by the author, on which to focus the analysis of connectivity. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
custom_stop_words = ["real","hillary"] # list of str | List of words possibly used by the target author that are considered not information-bearing. (optional)
excluded_docs = [''] # list of str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)

metadata_selection ="" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = "12M"     # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""] (optional)
period_start = "" # str | Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"
period_end = "" # str | End date for the period to analyze within the dataset. Format: "YYYY-MM-DD HH:MM:SS"

# NOTE: period_start and period_end are defined above but are not sent in
# the payload of this example.
try:
    payload = nucleus_api.AuthorConnection(dataset=dataset, 
                                           target_author=target_author, 
                                           query=query, 
                                           custom_stop_words=custom_stop_words, 
                                           time_period=time_period, 
                                           metadata_selection=metadata_selection, 
                                           excluded_docs=excluded_docs)
    api_response = api_instance.post_author_connectivity_api(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    res = api_response.result
    print('Mainstream connections:')
    for mc in res.mainstream_connections:
        print('    Keywords:', mc.keywords)
        print('    Authors:', " ".join(str(x) for x in mc.authors))

    print('Niche connections:')
    for nc in res.niche_connections:
        print('    Keywords:', nc.keywords)
        print('    Authors:', " ".join(str(x) for x in nc.authors))

## Apply transfer learning

### Apply transfer learning to topics in one dataset onto another

In [None]:
# Topic transfer: dataset1 is left as None here and the period_0_* /
# period_1_* date windows are supplied instead. Prints each topic with its
# per-document exposures in the validation data.
dataset0 = 'trump_tweets'
dataset1 = None # str | Validation dataset (optional if period_0 and period_1 dates provided)
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = [""] # list of str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
period_0_start = '2018-08-12' # Not needed if you provide a validation dataset in the "dataset1" variable 
period_0_end = '2018-08-16' # Not needed if you provide a validation dataset in the "dataset1" variable
period_1_start = '2018-08-14' # Not needed if you provide a validation dataset in the "dataset1" variable
period_1_end = '2018-08-18' # Not needed if you provide a validation dataset in the "dataset1" variable
excluded_docs = '' # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

# NOTE: excluded_docs and remove_redundancies are defined above but are not
# sent in the payload of this example.
try:
    payload = nucleus_api.TopicTransferModel(dataset0=dataset0,
                                             dataset1=dataset1,
                                             query=query, 
                                             custom_stop_words=custom_stop_words, 
                                             num_topics=num_topics, 
                                             num_keywords=num_keywords,
                                             period_0_start=period_0_start,
                                             period_0_end=period_0_end,
                                             period_1_start=period_1_start,
                                             period_1_end=period_1_end,
                                             metadata_selection=metadata_selection)
    api_response = api_instance.post_topic_transfer_api(payload)
    api_ok = True
except ApiException as e:
    print(e)
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

if api_ok:
    # Dump the raw response only on success; after a failure api_response is
    # either undefined or left over from an earlier cell.
    print(api_response)

    doc_ids_t1 = api_response.result.doc_ids_t1
    topics = api_response.result.topics
    for i, res in enumerate(topics):
        print('Topic', i, 'exposure within validation dataset:')
        print('    Keywords:', res.keywords)
        print('    Strength:', res.strength)
        print('    Document IDs:', doc_ids_t1)
        print('    Exposure per Doc in Validation Dataset:', res.doc_topic_exposures_t1)
        print('---------------')

### Apply transfer learning to topics exogenously chosen onto a dataset

In [None]:
# Topic transfer with exogenously chosen topics: fixed_topics is sent to the
# API together with the extraction settings, and each returned topic is
# printed with its per-document exposures in the validation data.
dataset0 = 'trump_tweets'
dataset1 = None # str | Validation dataset (optional if period_0 and period_1 dates provided)
fixed_topics = [{"keywords": ["north korea", "nuclear weapons", "real estate"], "weights": [0.5, 0.3, 0.2]},
               {"keywords": ["America", "jobs", "stock market"], "weights": [0.3, 0.3, 0.3]}] # The weights are optional
query = ''
custom_stop_words = [""] # list of str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
# A first set of period assignments (2017-01-01 .. 2018-08-18) used to precede
# these lines but was immediately overwritten; only the effective values remain.
period_0_start = '2018-08-12' # Not needed if you provide a validation dataset in the "dataset1" variable 
period_0_end = '2018-08-16' # Not needed if you provide a validation dataset in the "dataset1" variable
period_1_start = '2018-08-14' # Not needed if you provide a validation dataset in the "dataset1" variable
period_1_end = '2018-08-18' # Not needed if you provide a validation dataset in the "dataset1" variable

excluded_docs = '' # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

# NOTE: excluded_docs and remove_redundancies are defined above but are not
# sent in the payload of this example.
try:
    payload = nucleus_api.TopicTransferModel(dataset0=dataset0,
                                             dataset1=dataset1,
                                             fixed_topics=fixed_topics,
                                             query=query, 
                                             custom_stop_words=custom_stop_words, 
                                             num_topics=num_topics, 
                                             num_keywords=num_keywords,
                                             period_0_start=period_0_start,
                                             period_0_end=period_0_end,
                                             period_1_start=period_1_start,
                                             period_1_end=period_1_end,
                                             metadata_selection=metadata_selection)
    api_response = api_instance.post_topic_transfer_api(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded.
if api_ok:
    doc_ids_t1 = api_response.result.doc_ids_t1
    topics = api_response.result.topics
    for i, res in enumerate(topics):
        print('Topic', i, 'exposure within validation dataset:')
        print('    Keywords:', res.keywords)
        print('    Strength:', res.strength)
        print('    Document IDs:', doc_ids_t1)
        print('    Exposure per Doc in Validation Dataset:', res.doc_topic_exposures_t1)
        print('---------------')

### Apply transfer learning to topics in one dataset onto another for sentiment analysis

In [None]:
# Topic transfer for sentiment analysis. dataset1 is left as None and the
# period_0_* / period_1_* date windows are supplied instead. Each returned
# topic is printed with its sentiment and per-document sentiments.
dataset0 = 'trump_tweets'
dataset1 = None
#dataset1 = dataset # str | Validation dataset (optional if period_0 and period_1 dates provided)
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
#fixed_topic is also an available input argument
query = ''
custom_stop_words = [""]  # List of stop words. (optional)
num_topics = 8            # Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8          # Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
metadata_selection = ""   # JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)

# The four period bounds are not needed when a validation dataset is given in "dataset1".
period_0_start = '2018-08-12'
period_0_end = '2018-08-16'
period_1_start = '2018-08-14'
period_1_end = '2018-08-18'

excluded_docs = ''  # List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
custom_dict_file = {"great": 1.0, "awful": -1.0, "clinton":-1.0, "trump":1.0}  # Custom sentiment dictionary. Example, {"field1": value1, ..., "fieldN": valueN} (optional)
remove_redundancies = False  # If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

# Keyword arguments forwarded to the request model. excluded_docs and
# remove_redundancies are intentionally absent: the example does not send them.
transfer_args = {
    'dataset0': dataset0,
    'dataset1': dataset1,
    'query': query,
    'custom_stop_words': custom_stop_words,
    'num_topics': num_topics,
    'num_keywords': num_keywords,
    'period_0_start': period_0_start,
    'period_0_end': period_0_end,
    'period_1_start': period_1_start,
    'period_1_end': period_1_end,
    'metadata_selection': metadata_selection,
    'custom_dict_file': custom_dict_file,
}

try:
    payload = nucleus_api.TopicSentimentTransferModel(**transfer_args)
    api_response = api_instance.post_topic_sentiment_transfer_api(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True

if api_ok:
    topics = api_response.result
    # (label, attribute name) pairs, printed in this order for every topic
    report_fields = (
        ('    Keywords:', 'keywords'),
        ('    Strength:', 'strength'),
        ('    Sentiment:', 'sentiment'),
        ('    Document IDs:', 'doc_ids_t1'),
        ('    Sentiment per Doc in Validation Dataset:', 'doc_sentiments_t1'),
    )
    for i, res in enumerate(topics):
        print('Topic', i, 'exposure within validation dataset:')
        for label, field in report_fields:
            print(label, getattr(res, field))
        print('---------------')

### Apply transfer learning to topics in one dataset onto another for consensus analysis

In [None]:
# Topic transfer for consensus analysis. dataset1 is left as None and the
# period_0_* / period_1_* date windows are supplied instead. Each returned
# topic is printed with its keywords and consensus.
dataset0 = 'trump_tweets'
dataset1 = None  # Validation dataset (optional if period_0 and period_1 dates provided)
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
#fixed_topic is also an available input argument
query = ''
custom_stop_words = [""]  # List of stop words. (optional)
num_topics = 8            # Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8          # Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
metadata_selection = ""   # JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)

# The four period bounds are not needed when a validation dataset is given in "dataset1".
period_0_start = '2018-08-12'
period_0_end = '2018-08-16'
period_1_start = '2018-08-14'
# NOTE(review): this end date is in 2019 while the sibling transfer examples
# use '2018-08-18' -- confirm whether the longer window is intended.
period_1_end = '2019-08-18'

excluded_docs = ''  # List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
custom_dict_file = {"great": 1.0, "awful": -1.0, "clinton":-1.0, "trump":1.0}  # Custom sentiment dictionary. Example, {"field1": value1, ..., "fieldN": valueN} (optional)
remove_redundancies = False  # If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

# Keyword arguments forwarded to the request model. excluded_docs and
# remove_redundancies are intentionally absent: the example does not send them.
transfer_args = {
    'dataset0': dataset0,
    'dataset1': dataset1,
    'query': query,
    'custom_stop_words': custom_stop_words,
    'num_topics': num_topics,
    'num_keywords': num_keywords,
    'period_0_start': period_0_start,
    'period_0_end': period_0_end,
    'period_1_start': period_1_start,
    'period_1_end': period_1_end,
    'metadata_selection': metadata_selection,
    'custom_dict_file': custom_dict_file,
}

try:
    payload = nucleus_api.TopicConsensusTransferModel(**transfer_args)
    api_response = api_instance.post_topic_consensus_transfer_api(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True

if api_ok:
    topics = api_response.result
    # (label, attribute name) pairs, printed in this order for every topic
    report_fields = (
        ('    Keywords:', 'keywords'),
        ('    Consensus:', 'consensus'),
    )
    for i, res in enumerate(topics):
        print('Topic', i, 'exposure within validation dataset:')
        for label, field in report_fields:
            print(label, getattr(res, field))
        print('---------------')

## Extract a topic contrasting two subsets of content in a dataset

In [None]:
# Request a contrasted topic and print its keywords and keyword weights.
dataset = 'trump_tweets'  # Dataset name.
metadata_selection = {"content": "Trump"}  # The metadata selection defining the two categories of documents to contrast and summarize against each other
# NOTE(review): the contrast selection below is identical to
# metadata_selection above -- confirm that this is the intended setup.
metadata_selection_contrast = {"content": "Trump"}
query = ''  # Dataset-language-specific fulltext query, using mysql MATCH boolean query format (optional)
custom_stop_words = ["real","hillary"]  # List of stop words. (optional)
time_period = "1M"  # Alternative 1: time period counting back from today over which the analysis is conducted (optional)
period_start = '2018-08-12'  # Alternative 2: start of period over which the analysis is conducted (optional)
period_end = '2018-08-15'  # Alternative 2: end of period over which the analysis is conducted (optional)
excluded_docs = ''  # List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
syntax_variables = True  # Specifies whether to take into account syntax aspects of each category of documents to help with contrasting them (optional) (default to False)
compression = 0.002  # Parameter controlling the breadth of the contrasted topic. Contained between 0 and 1, the smaller it is, the more contrasting terms will be captured, with decreasing weight. (optional) (default to 0.000002)
remove_redundancies = False  # If True, this option removes quasi-duplicates from the analysis and retain only one copy of it. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default False)

# Only these three settings are sent to the API in this example; the
# remaining variables above are defined but not forwarded.
contrast_args = {
    'dataset': dataset,
    'metadata_selection': metadata_selection,
    'metadata_selection_contrast': metadata_selection_contrast,
}

try:
    payload = nucleus_api.TopicContrastModel(**contrast_args)
    api_response = api_instance.post_topic_contrast_api(payload)

    print('Contrasted Topic')
    for label, field in (('    Keywords:', 'keywords'),
                         ('    Keywords Weight:', 'keywords_weight')):
        print(label, getattr(api_response.result, field))

except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])

# Document-Level Analytics

This section goes over all APIs that enable users to analyze documents in a dataset on a standalone basis.
- Sentiment analysis
- Document summarization and contrasted summarization
- Document classification
- Named Entity recognition (strict match off pre-determined list)
- Content recommendation

## Retrieve summary

### Summary of all documents

In [None]:
# Retrieve document information for the dataset and print the ID, title and
# every attribute of each returned document.
dataset = 'trump_tweets'
# doc_titles, doc_ids, and metadata_selection below are filters to narrow down 
# documents to be retrieved.
# The information of all documents will be retrieved when no filters are provided.

# doc_titles: list of strings
# The titles of the documents to retrieve. Example: ["title1", "title2", ..., "titleN"]  (optional)
# doc_titles = ['D_Trump2018_8_18_1_47']   
doc_titles = []
# doc_ids: list of strings
# The docid of the documents to retrieve. Example: ["docid1", "docid2", ..., "docidN"]  (optional)
# doc_ids = ['3397215194896514820', '776902852041351634']
doc_ids = []

# metadata_selection = {"author": "D_Trump16"} # dict | A selector off metadata. Example: {"field": "value"}  (optional)
metadata_selection = ''

try:
    payload = nucleus_api.DocInfo(dataset=dataset, 
                                doc_titles=doc_titles, 
                                doc_ids=doc_ids,
                                metadata_selection=metadata_selection)
    api_response = api_instance.post_doc_info(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    for res in api_response.result:
        print('Document ID:', res.doc_id)
        print('    title:', res.title)
        for attr, value in res.attribute.items():
            if attr == 'time':
                # 'time' is converted with fromtimestamp() for a readable date
                print('   ', attr, ':', datetime.datetime.fromtimestamp(float(value)))
            else:
                print('   ', attr, ':', value)
        print('---------------')

### Summary of documents with a metadata selection

In [None]:
# Retrieve document information restricted by a metadata selection and print
# the ID, title and every attribute of each returned document.
dataset = 'trump_tweets' # str | Dataset name.
metadata_selection = {"author": "D_Trump16"}      # dict | A selector off metadata. Example: {"field": "value"}  (optional)

try:
    payload = nucleus_api.DocInfo(dataset=dataset, metadata_selection=metadata_selection)
    api_response = api_instance.post_doc_info(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    for res in api_response.result:
        print('Document ID:', res.doc_id)
        print('    title:', res.title)
        for attr, value in res.attribute.items():
            if attr == 'time':
                # 'time' is converted with fromtimestamp() for a readable date
                print('   ', attr, ':', datetime.datetime.fromtimestamp(float(value)))
            else:
                print('   ', attr, ':', value)
        print('---------------')

## Display chosen documents, content and metadata included

In [None]:
# Display the documents selected by ID: title, author, time and content.
dataset = 'trump_tweets' # str | Dataset name.
#doc_titles = ['D_Trump2018_8_18_1_47']   # list of str | The title of the documents to retrieve. Example: ["title1", "title2", ..., "titleN"]  (optional)
doc_ids = ['4046653213651213725']      # list of str | The docid of the documents to retrieve. Example: ["docid1", "docid2", ..., "docidN"]  (optional)

try:
    # dataset is passed by keyword, matching the other DocDisplay example
    payload = nucleus_api.DocDisplay(dataset=dataset, doc_ids=doc_ids)
    api_response = api_instance.post_doc_display(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    for res in api_response.result:
        print('Document ID:', res.doc_id)
        print('    Title:', res.title)
        print('    Author:', res.attribute['author'])
        print('    Time:', datetime.datetime.fromtimestamp(float(res.attribute['time'])))
        print('    Content', res.attribute['content'])
        print('---------------')

## Display chosen documents, content and metadata included, with a metadata selection

In [None]:
# Display the documents matching a metadata selection: title, author, time
# and content.
dataset = 'trump_tweets' # str | Dataset name.
metadata_selection = {"author": "D_Trump16"}      # dict | A selector off metadata. Example: {"field": "value"}  (optional)

try:
    payload = nucleus_api.DocDisplay(dataset=dataset, metadata_selection=metadata_selection)
    api_response = api_instance.post_doc_display(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    for res in api_response.result:
        print('Document ID:', res.doc_id)
        print('    Title:', res.title)
        print('    Author:', res.attribute['author'])
        print('    Time:', datetime.datetime.fromtimestamp(float(res.attribute['time'])))
        print('    Content', res.attribute['content'])
        print('---------------')

## Generate document recommendations on topics

In [None]:
# Extract topics from the dataset and print the documents recommended for
# each topic, with their ID, title, attributes, author and time.
dataset = 'trump_tweets' # str | Dataset name.
#query = '("Trump" OR "president")' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
query = ''
custom_stop_words = ["real","hillary"] # list of str | List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
excluded_docs = '' # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

# NOTE: excluded_docs and remove_redundancies are defined above but are not
# sent in the payload of this example.
try:
    payload = nucleus_api.DocumentRecommendModel(dataset=dataset, 
                                                 query=query, 
                                                 custom_stop_words=custom_stop_words, 
                                                 num_topics=num_topics, 
                                                 num_keywords=num_keywords)
    api_response = api_instance.post_doc_recommend_api(payload)
    api_ok = True
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
    api_ok = False

# Only read the response when the call succeeded; after a failure
# api_response is either undefined or left over from an earlier cell.
if api_ok:
    for i, res in enumerate(api_response.result):
        print('Document recommendations for topic', i, ':')
        print('    Keywords:', res.keywords)

        for j, doc in enumerate(res.recommendations):
            print('    Recommendation', j, ':')
            print('        Document ID:', doc.doc_id)
            print('        Title:', doc.title)
            print('        Attribute:', doc.attribute)
            print('        Author:', doc.attribute['author'])
            print('        Time:', datetime.datetime.fromtimestamp(float(doc.attribute['time'])))
        print('---------------')

## Summarize a document

In [None]:
# Summarize one document of the dataset, selected by its title, and print
# the summary sentences as bullet points.
dataset = 'trump_tweets' # str | Dataset name.
doc_title = 'D_Trump2018_8_17_14_10' # str | The title of the document to be summarized.
custom_stop_words = ["real","hillary"] # List of stop words. (optional)
summary_length = 6 # int | The maximum number of bullet points a user wants to see in the document summary. (optional) (default to 6)
context_amount = 0 # int | The number of sentences surrounding key summary sentences in the documents that they come from. (optional) (default to 0)
short_sentence_length = 0 # int | The sentence length below which a sentence is excluded from summarization (optional) (default to 4)
long_sentence_length = 40 # int | The sentence length beyond which a sentence is excluded from summarization (optional) (default to 40)

try:
    payload = nucleus_api.DocumentSummaryModel(
        dataset=dataset,
        doc_title=doc_title,
        custom_stop_words=custom_stop_words,
        summary_length=summary_length,
        context_amount=context_amount,
        short_sentence_length=short_sentence_length,
        long_sentence_length=long_sentence_length,
    )
    api_response = api_instance.post_doc_summary_api(payload)

    # Results are printed inside the try block, so nothing is printed
    # when the request fails.
    print('Summary for', api_response.result.doc_title)
    for bullet in api_response.result.summary.sentences:
        print('    *', bullet)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])

## Summarize what makes a document stand out from the background

In [None]:
# Build a contrasted summary between two categories of documents and print
# the sentences returned for each category.
dataset = 'trump_tweets' # str | Dataset name.
metadata_selection = {"content": "Trump"} # dict | The metadata selection defining the two categories of documents to contrast and summarize against each other

# NOTE: the settings from query down to remove_redundancies are shown for
# reference only. The payload built below sends just dataset,
# metadata_selection and metadata_selection_contrast.
query = '' # str | Dataset-language-specific fulltext query, using mysql MATCH boolean query format (optional)
custom_stop_words = ["real","hillary"] # List of stop words. (optional)
summary_length = 6 # int | The maximum number of bullet points a user wants to see in the contrasted summary. (optional) (default to 6)
context_amount = 0 # int | The number of sentences surrounding key summary sentences in the documents that they come from. (optional) (default to 0)
short_sentence_length = 0 # int | The sentence length below which a sentence is excluded from summarization (optional) (default to 4)
long_sentence_length = 40 # int | The sentence length beyond which a sentence is excluded from summarization (optional) (default to 40)
time_period = "1M" # str | Alternative 1: time period counting back from today over which the analysis is conducted (optional)
period_start = '2018-08-12' # str | Alternative 2: start of period over which the analysis is conducted (optional)
period_end = '2018-08-15' # str | Alternative 2: end of period over which the analysis is conducted (optional)
excluded_docs = '' # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
syntax_variables = True # bool | Specifies whether to take into account syntax aspects of each category of documents to help with contrasting them (optional) (default to False)
compression = 0.002 # float | Parameter controlling the breadth of the contrasted summary. Contained between 0 and 1, the smaller it is, the more contrasting terms will be captured, with decreasing weight. (optional) (default to 0.000002)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis and retain only one copy of it. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default False)

# NOTE(review): this selection is identical to metadata_selection above;
# confirm that contrasting a category against itself is intended.
metadata_selection_contrast = {"content": "Trump"}

try:
    payload = nucleus_api.DocumentContrastSummaryModel(
        dataset=dataset,
        metadata_selection=metadata_selection,
        metadata_selection_contrast=metadata_selection_contrast,
    )
    api_response = api_instance.post_document_contrast_summary_api(payload)

    print('Summary for', list(metadata_selection.values()))
    for bullet in api_response.result.class_1_content.sentences:
        print('    *', bullet)
    print('======')
    for bullet in api_response.result.class_2_content.sentences:
        print('    *', bullet)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])

## Measure sentiment of document

In [None]:
# Measure the sentiment of one document, selected by its title, and print it.
dataset = 'trump_tweets' # str | Dataset name.
doc_title = 'D_Trump2018_8_17_14_10' # str | The title of the document to be analyzed.
custom_stop_words = ["real","hillary"] # List of stop words. (optional)
num_topics = 8 # int | Number of topics to be extracted from the document. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the document. (optional) (default to 8)

try:
    payload = nucleus_api.DocumentSentimentModel(
        dataset=dataset,
        doc_title=doc_title,
        custom_stop_words=custom_stop_words,
        num_topics=num_topics,
        num_keywords=num_keywords,
    )
    api_response = api_instance.post_doc_sentiment_api(payload)

    print('Sentiment for', api_response.result.doc_title)
    print(api_response.result.sentiment)
except ValueError as err:
    # ValueError is reported with its own message; it carries no JSON body to decode.
    print('ERROR:', err)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])

## Classify documents based on a topic contrasting two categories of content

In [None]:
# Classify documents into two metadata-defined categories based on their
# exposure to a fixed contrasting topic, then print the per-document results.
dataset = 'trump_tweets' # str | Dataset name.
fixed_topics = {"keywords": ["america", "jobs", "economy"], "weights": [0.5, 0.25, 0.25]} # dict | The contrasting topic used to separate the two categories of documents

# Here we want to classify documents that talk about Trump vs documents that don't talk about Trump based on their exposure to the topic [america, jobs, economy]
# A more natural classification task for the algo is to define metadata-based categories such as metadata_selection = {"document_category": ["speech", "press release"]}
metadata_selection = {"content": "Trump"} # dict | The metadata selection defining the two categories of documents that a document can be classified into

# NOTE: the settings from query down to remove_redundancies are shown for
# reference only. They are not passed to the payload built below;
# validation_phase is only read locally to decide whether to print the
# performance metrics.
query = '' # str | Dataset-language-specific fulltext query, using mysql MATCH boolean query format (optional)
custom_stop_words = ["real","hillary"] # List of stop words. (optional)
time_period = "1M" # str | Alternative 1: time period counting back from today over which the analysis is conducted (optional)
period_start = '2018-08-12' # str | Alternative 2: start of period over which the analysis is conducted (optional)
period_end = '2018-08-15' # str | Alternative 2: end of period over which the analysis is conducted (optional)
excluded_docs = '' # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
syntax_variables = True # bool | If True, the classifier will include syntax-related variables on top of content variables (optional) (default to False)
validation_phase = False # bool | If True, the classifier assumes that the dataset provided is labeled with the 2 classes and will use that to compute accuracy/precision/recall (optional) (default to False)
threshold = 0 # float | Threshold value for a document exposure to the contrasting topic, above which the document is assigned to class 1 specified through metadata_selection. (optional) (default to 0)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis and retain only one copy of it. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default False)

# classifier_config is the classifier configuration dictionary generated from post_topic_contrast_api
# Below is an example showing how to construct the classifier_config parameter
# classifier_config = {"keywords": topic_contrast_response.result.keywords,
#                      "coefs": topic_contrast_response.result.classifier_config.coefs,
#                      "intercept": topic_contrast_response.result.classifier_config.intercept}
classifier_config = {}
metadata_selection_contrast = {"content": "Hillary"}

try:
    payload = nucleus_api.DocClassifyModel(
        dataset=dataset,
        fixed_topics=fixed_topics,
        metadata_selection=metadata_selection,
        classifier_config=classifier_config,
        metadata_selection_contrast=metadata_selection_contrast,
    )
    api_response = api_instance.post_doc_classify_api(payload)

    print('Detailed Results')
    print('    Docids:', api_response.result.detailed_results.doc_ids)
    print('    Estimated Category:', api_response.result.detailed_results.estimated_class)
    print('    Actual Category:', api_response.result.detailed_results.true_class)
    print('\n')

    # Performance metrics are printed only when validation_phase is set.
    if validation_phase:
        print('Perf Metrics')
        print('    Accuracy:', api_response.result.perf_metrics.accuracy)
        print('    Recall:', api_response.result.perf_metrics.recall)
        print('    Precision:', api_response.result.perf_metrics.precision)
        print('    F1:', api_response.result.perf_metrics.f1)
        print('    Balanced accuracy:', api_response.result.perf_metrics.balanced_accuracy)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])

## Tag documents based on pre-determined named-entity recognition

In [None]:
# Tag the documents of a dataset that mention any of the entities listed in
# the query, then print the tagged entities, document IDs and counts.
dataset = 'trump_tweets' # str | Dataset name.
payload = nucleus_api.DatasetTagging(
    dataset=dataset,
    query='new york city OR big apple OR NYC OR New York',
    metadata_selection='',
    time_period='',
    period_start='2010-01-01',
    period_end='2019-04-30',
)

try:
    api_response = api_instance.post_dataset_tagging(payload)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True

# Results are printed only when the request succeeded.
if api_ok:
    print('    Entities tagged:', api_response.result.entities_tagged)
    print('    Docids tagged with the entities:', api_response.result.doc_ids)
    print('    Entities count:', api_response.result.entities_count)

## Summarize files from a URL

In [None]:
######################################################################################
# Summarize a file stored at a URL and print the summary sentences.
#
# file_params fields:
#   file_url              : string, the URL at which the file is stored (could be a S3 bucket address for instance)
#   filename              : OPTIONAL string, filename saved on the server. Also serves as the doc_title for summarization
#   custom_stop_words     : OPTIONAL list of strings, user-provided stopwords to be excluded from the content analysis
#                           leading to document summarization: ["word1", "word2", ...]. DEFAULT: empty
#   summary_length        : OPTIONAL integer, the maximum number of bullet points a user wants to see in the document summary. DEFAULT: 6
#   context_amount        : OPTIONAL integer, the number of sentences surrounding key summary sentences in the original document
#                           that a user wants to see in the document summary. DEFAULT: 0
#   short_sentence_length : OPTIONAL integer, the sentence length below which a sentence is excluded from summarization. DEFAULT: 4 words
#   long_sentence_length  : OPTIONAL integer, the sentence length beyond which a sentence is excluded from summarization. DEFAULT: 40 words
######################################################################################

# NOTE(review): file_url ends in .docx while filename ends in .pdf; confirm
# the extension mismatch is intended.
file_params = {
    'file_url': 'https://s3-us-west-2.amazonaws.com/sumup-public/nucleus-sdk/quarles20181109a.docx',
    'filename': 'quarles20181109a-newname.pdf',
    'custom_stop_words': ["document", "sometimes"],
    'summary_length': 6,
    'context_amount': 0,
    'short_sentence_length': 4,
    'long_sentence_length': 40,
}

result = nucleus_helper.summarize_file_url(api_instance, file_params)

print('Summary for', result.doc_title, ':')
for bullet in result.summary.sentences:
    print('    *', bullet)

# Dashboard Modules

## Key Authors

"Key Authors" API identifies the most prolific authors on important subjects in a dataset.

In [None]:
# Request the key authors for each query/hot topic of the dataset and print
# them grouped as negative, average and positive.
dataset = 'trump_tweets'
query = '' # Dataset-language-specific fulltext query, using SQL MATCH boolean query format. Example: "(word1 OR word2) AND (word3 OR word4)" [optional]
tracked_queries = [] # List of user-defined queries to track [optional]
custom_stop_words = [] # List of dataset-language-specific stopwords that should be excluded from the analysis. Example: ["word1", "word2", ..., "wordN"] [optional]
num_topics = 8 # Number of topics to be extracted from the dataset per query to aggregate back into a tracker. [optional]
num_keywords = 8 # Number of keywords per topic that is extracted from the dataset per query. [optional]
metadata_selection = {} # JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} [optional]
time_period = '' # Alternative 1: Time period selection [optional] [default to '1M']
period_start = "2018-08-13" # Alternative 2: Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD" [optional]
period_end = "2018-08-17" # Alternative 2: End date for the period to analyze within the dataset. Format: "YYYY-MM-DD" [optional]
num_authors = 3 # Max number of key contributors that the user wants to see returned by the analysis. [optional]
num_keydocs = 3 # Max number of key contributions from key contributors that the user wants to see returned by the analysis. [optional]
excluded_docs = [] # List of document IDs that should be excluded from the analysis. Example: ["doc_id1", "doc_id2", ..., "doc_idN"] [optional]
custom_dict_file = {} # JSON records with custom sentiment dictionary: {"word1": value1, "word2": value2, ..., "wordN": valueN}

# Define the KeyAuthorsModel
payload = nucleus_api.KeyAuthorsModel(
    dataset=dataset,
    query=query,
    tracked_queries=tracked_queries,
    custom_stop_words=custom_stop_words,
    num_topics=num_topics,
    num_keywords=num_keywords,
    metadata_selection=metadata_selection,
    time_period=time_period,
    period_start=period_start,
    period_end=period_end,
    num_authors=num_authors,
    num_keydocs=num_keydocs,
    excluded_docs=excluded_docs,
    custom_dict_file=custom_dict_file,
)

try:
    api_response = api_instance.post_key_authors_api(payload)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True

# Results are printed only when the request succeeded.
if api_ok:
    for idx, topic in enumerate(api_response.result):
        print('Query/Hot Topics', idx)
        print('    Query/Hot Topic:', topic.query)
        print('    Top Authors:')
        print('        Negative:', topic.top_authors.negative)
        print('        Average:', topic.top_authors.average)
        print('        Positive:', topic.top_authors.positive)



## Custom Tracker

The “Custom Tracker” API keeps a pulse on key metrics for important subjects.

In [None]:
# Request the tracked metrics for each query/hot topic of the dataset and
# print the consensus, sentiment, strength and timestamp series.
dataset = 'trump_tweets'
query = '' # Dataset-language-specific fulltext query, using SQL MATCH boolean query format. Example: "(word1 OR word2) AND (word3 OR word4)" [optional]
tracked_queries = [] # List of user-defined queries to track [optional]
custom_stop_words = [] # List of dataset-language-specific stopwords that should be excluded from the analysis. Example: ["word1", "word2", ..., "wordN"] [optional]
num_topics = 8 # Number of topics to be extracted from the dataset per query to aggregate back into a tracker. [optional]
num_keywords = 8 # Number of keywords per topic that is extracted from the dataset per query. [optional]
metadata_selection = {} # JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} [optional]
time_period = '' # Alternative 1: Time period selection [optional] [default to '1M']
period_start = '' # Alternative 2: Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD" [optional]
period_end = '' # Alternative 2: End date for the period to analyze within the dataset. Format: "YYYY-MM-DD" [optional]
n_steps = 10 # Number of steps in the historical analysis over the requested period. Each step is such that they contain an equal number of documents. [optional]
excluded_docs = [] # List of document IDs that should be excluded from the analysis. Example: ["doc_id1", "doc_id2", ..., "doc_idN"] [optional]
custom_dict_file = {} # JSON records with custom sentiment dictionary: {"word1": value1, "word2": value2, ..., "wordN": valueN}

# Define the CustomTrackerModel
payload = nucleus_api.CustomTrackerModel(
    dataset=dataset,
    query=query,
    tracked_queries=tracked_queries,
    custom_stop_words=custom_stop_words,
    num_topics=num_topics,
    num_keywords=num_keywords,
    metadata_selection=metadata_selection,
    time_period=time_period,
    period_start=period_start,
    period_end=period_end,
    n_steps=n_steps,
    excluded_docs=excluded_docs,
    custom_dict_file=custom_dict_file,
)

try:
    api_response = api_instance.post_custom_tracker_api(payload)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True

# Results are printed only when the request succeeded.
if api_ok:
    for idx, topic in enumerate(api_response.result):
        print('Query/Hot Topics', idx)
        print('    Query/Hot Topic:', topic.query)
        print('    Consensuses:', topic.consensuses)
        print('    Sentiments:', topic.sentiments)
        print('    Strengths:', topic.strengths)
        print('    Timestamps:', topic.time_stamps)


## Smart Alert

The “Smart Alert” API brings only novel information to your attention.

In [None]:
# Request smart alerts for the dataset and print the new sentences, words
# and novel documents reported for each query.
dataset = 'trump_tweets'
# NOTE(review): period_start and period_end are defined here but are not
# passed to SmartAlertsModel below, so the request is built from the dataset
# name only. Confirm whether they were meant to be sent.
period_start = "2018-08-13" # Alternative 2: Start date for the period to analyze within the dataset. Format: "YYYY-MM-DD" [optional]
period_end = "2018-08-17" # Alternative 2: End date for the period to analyze within the dataset. Format: "YYYY-MM-DD" [optional]

# Define SmartAlertsModel
payload = nucleus_api.SmartAlertsModel(dataset=dataset)

try:
    api_response = api_instance.post_smart_alerts_api(payload)
except ApiException as err:
    api_error = json.loads(err.body)
    print('ERROR:', api_error['message'])
    api_ok = False
else:
    api_ok = True

# Results are printed only when the request succeeded.
if api_ok:
    for idx, alert in enumerate(api_response.result):
        print('New sentences', idx)
        print('    Query:', alert.query)
        print('    New sentences:', alert.new_sents)
        print('    New sentence IDs:', alert.new_sents_docids)
        print('    New sentence titles:', alert.new_sents_titles)
        print('    New sentence URLs:', alert.new_sents_urls)
        print('    New words:', alert.new_words)
        print('    Novel documents:', alert.novel_docs)
