In [37]:
from __future__ import print_function
import csv, json
import time
import nucleus_client
from nucleus_client.rest import ApiException
from pprint import pprint

# Configure API host and key

In [38]:
# Connection settings shared by every API instance created in this notebook.
# Replace both placeholders with real values before running the cells below.
API_HOST = 'UPDATE-WITH-API-HOST'
API_KEY = 'UPDATE-WITH-API-KEY'

configuration = nucleus_client.Configuration()
configuration.host = API_HOST
# The key is stored under the 'x-api-key' entry of the configuration.
configuration.api_key['x-api-key'] = API_KEY


# Dataset APIs

## Create API instance

In [39]:
# Print the section banner, then build the client object that every
# Dataset API cell below calls.
for banner_line in (
    '-------------------------------------------------------------',
    '--                Dataset API Examples                     --',
    '-------------------------------------------------------------',
):
    print(banner_line)

dataset_api_client = nucleus_client.ApiClient(configuration)
api_instance_dataset = nucleus_client.DatasetsApi(dataset_api_client)

-------------------------------------------------------------
--                Dataset API Examples                     --
-------------------------------------------------------------


## Append file from local drive to dataset

In [40]:
print('--------- Append file from local drive to dataset -----------')
print('')
# Local file to upload and the destination dataset that receives it.
# `dataset` is read again by the dataset cells that follow.
file = 'quarles20181109a.pdf'
dataset = 'dataset_test'

try:
    api_instance_dataset.post_upload_file(file, dataset)
except ApiException as upload_error:
    # Report the failure and keep going so the closing rule is still printed.
    print("Exception when calling DatasetsApi->post_upload_file: %s\n" % upload_error)

print('-------------------------------------------------------------')

--------- Append file from local drive to dataset -----------

-------------------------------------------------------------


## Append file from URL to dataset

In [41]:
# Fixed: the banner used to read "frile"; the trailing dashes were extended by
# one so the banner keeps the same width as the closing rule.
print('------------ Append file from URL to dataset ----------------')

# Ask the service to fetch a document from a URL and append it to `dataset`.
# `dataset` is the name assigned in the local-file upload cell; the former
# `dataset = dataset` self-assignment was a no-op and has been removed.
file_url = 'https://www.federalreserve.gov/newsevents/speech/files/quarles20181109a.pdf'
payload = nucleus_client.UploadURLModel(
    dataset=dataset,
    file_url=file_url,
)

try:
    api_response = api_instance_dataset.post_upload_url(payload)
    pprint(api_response)
except ApiException as e:
    print("Exception when calling DatasetsApi->post_upload_url: %s\n" % e)
print('-------------------------------------------------------------')

------------ Append frile from URL to dataset ---------------
{'success': 'https://www.federalreserve.gov/newsevents/speech/files/quarles20181109a.pdf'}
-------------------------------------------------------------


## Append JSON from CSV to dataset

In [42]:
print('----------- Append json from CSV to dataset -----------------')
# Append the first `max_docs` rows of the CSV file to `dataset`, one JSON
# document per row. `dataset` is the name assigned in the upload cell above.
csv_file = 'trump-tweets-100.csv'
max_docs = 10     # only a small sample is uploaded in this example

doc_cnt = 0       # number of rows submitted so far
response = None   # last successful API response; stays None if nothing succeeds
with open(csv_file, encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Stop reading once the cap is reached instead of scanning the
        # remainder of the file without doing anything with it.
        if doc_cnt >= max_docs:
            break

        payload = nucleus_client.Appendjsonparams(dataset=dataset,
                                                  language='english',
                                                  document={'time'   : row['time'],
                                                            'title'  : row['title'],
                                                            'content': row['content'],
                                                            'author' : row['author']}
                                                 )

        try:
            response = api_instance_dataset.post_append_json_to_dataset(payload)
        except ApiException as e:
            print("Exception when calling DatasetsApi->post_append_json_to_dataset: %s\n" % e)

        doc_cnt = doc_cnt + 1

# `response` is only set by a successful call; guard against an empty CSV file
# or a run in which every request failed (previously a NameError).
if response is not None:
    print('Dataset', dataset, 'now has', response.success, 'documents.')
else:
    print('No document was appended to dataset', dataset)
print('-------------------------------------------------------------')

----------- Append json from CSV to dataset -----------------
Dataset dataset_test now has 12 documents.
-------------------------------------------------------------


## List available datasets

In [43]:
print('---------------- List available datasets ---------------------')
# Fetch the dataset names and print one per line.
try:
    api_response = api_instance_dataset.get_list_datasets()
except ApiException as e:
    print("Exception when calling DatasetsApi->get_list_datasets: %s\n" % e)
else:
    # Only read the response when the call succeeded. Previously a failed call
    # fell through and used `api_response`, which was either undefined or a
    # stale value left over from an earlier cell.
    list_datasets = api_response.to_dict()['list_datasets']

    print(len(list_datasets), 'datasets in the database:')
    for ds in list_datasets:
        print('    ', ds)


print('-------------------------------------------------------------')

---------------- List available datasets ---------------------
7 datasets in the database:
     aloxTest
     dataset_from_file
     dataset_from_url
     dataset_test
     trump_tweets
     trump_tweets_2
     trump_tweets_test
-------------------------------------------------------------


## Get dataset information

In [44]:
print('-------------------------------------------------------------')
# Request summary information for one dataset; all filters are left empty.
dataset = dataset        # dataset name, carried over from an earlier cell
query = ''               # full-text query, MySQL MATCH boolean format (optional)
metadata_selection = ''  # JSON object of {"metadata_field": ["selected_values"]} (optional)
time_period = ''         # time period selection (optional)

# Optional filters are collected once and expanded into keyword arguments.
info_filters = {
    'query': query,
    'metadata_selection': metadata_selection,
    'time_period': time_period,
}

try:
    api_response = api_instance_dataset.get_dataset_info(dataset, **info_filters)
except ApiException as info_error:
    print("Exception when calling DatasetsApi->get_dataset_info: %s\n" % info_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'dataset': 'dataset_test',
 'detected_language': 'en',
 'metadata': '{}',
 'num_documents': '12',
 'time_range': '[1534582020.0, 1542458749.0]'}
-------------------------------------------------------------


## Delete document

In [45]:
print('-------------------------------------------------------------')
# Delete the document with docid '1' from the dataset.
dataset = dataset  # dataset name, carried over from an earlier cell
payload = nucleus_client.Deletedocumentmodel(dataset=dataset, docid='1')

try:
    api_response = api_instance_dataset.post_delete_document(payload)
except ApiException as delete_error:
    print("Exception when calling DatasetsApi->post_delete_document: %s\n" % delete_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'success': 'Document deleted'}
-------------------------------------------------------------


## Delete dataset

In [46]:
# Step 1: delete the dataset.
dataset = dataset  # dataset name, carried over from an earlier cell
payload = nucleus_client.Deletedatasetmodel(dataset=dataset)

try:
    api_response = api_instance_dataset.post_delete_dataset(payload)
except ApiException as delete_error:
    print("Exception when calling DatasetsApi->post_delete_dataset: %s\n" % delete_error)
else:
    pprint(api_response)

# Step 2: list the datasets again to check that the dataset is gone.
try:
    api_response = api_instance_dataset.get_list_datasets()
except ApiException as list_error:
    print("Exception when calling DatasetsApi->get_list_datasets: %s\n" % list_error)
else:
    pprint(api_response)

{'success': 'Dataset deleted'}
{'list_datasets': ['aloxTest',
                   'dataset_from_file',
                   'dataset_from_url',
                   'trump_tweets',
                   'trump_tweets_2',
                   'trump_tweets_test']}


## Create a full dataset for testing other APIs

In [47]:
print('-------------------------------------------------------------')
# Build the 'trump_tweets' dataset that the later cells read through `dataset`:
# every row of the CSV file is appended as one JSON document.
csv_file = 'trump-tweets-100.csv'
dataset = 'trump_tweets'

response = None   # last successful API response; stays None if nothing succeeds
with open(csv_file, encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        payload = nucleus_client.Appendjsonparams(dataset=dataset,
                                                  language='english',
                                                  document={'time'   : row['time'],
                                                            'title'  : row['title'],
                                                            'content': row['content'],
                                                            'author' : row['author']}
                                                 )

        try:
            response = api_instance_dataset.post_append_json_to_dataset(payload)
        except ApiException as e:
            # Report the failed row and carry on with the remaining rows.
            print("Exception when calling DatasetsApi->post_append_json_to_dataset: %s\n" % e)

# `response` is only set by a successful call; guard against an empty CSV file
# or a run in which every request failed (previously a NameError).
if response is not None:
    print(response)
else:
    print('No document was appended to dataset', dataset)
print('-------------------------------------------------------------')

-------------------------------------------------------------
{'success': '34521'}
-------------------------------------------------------------


# Topic APIs

## Create API Instance

In [48]:
# Print the section banner, then build the client object that every
# Topic API cell below calls.
for banner_line in (
    '-------------------------------------------------------------',
    '--                Topic API Examples                     --',
    '-------------------------------------------------------------',
):
    print(banner_line)

topic_api_client = nucleus_client.ApiClient(configuration)
api_instance_topic = nucleus_client.TopicsApi(topic_api_client)

-------------------------------------------------------------
--                Topic API Examples                     --
-------------------------------------------------------------


## Get list of topics from dataset

In [49]:
print('-------------------------------------------------------------')
# Extract topics from the dataset.
# NOTE: the whole response is pretty-printed; in the recorded run below the
# notebook server reported "IOPub data rate exceeded" for this cell.
dataset = dataset                       # dataset name, carried over from an earlier cell
# Alternative full-text query (MySQL MATCH boolean format): '("Trump" OR "president")'
query = ''
custom_stop_words = ["real","hillary"]  # list of stop words (optional)
num_topics = 8                          # number of topics to extract (optional, default 8)
metadata_selection = ""                 # JSON object of {"metadata_field": ["selected_values"]} (optional)
time_period = ""                        # time period selection (optional)

# Optional arguments are collected once and expanded into keyword arguments.
topic_options = dict(
    query=query,
    custom_stop_words=custom_stop_words,
    num_topics=num_topics,
    metadata_selection=metadata_selection,
    time_period=time_period,
)

try:
    api_response = api_instance_topic.get_topic_api(dataset, **topic_options)
except ApiException as topic_error:
    print("Exception when calling TopicsApi->get_topic_api: %s\n" % topic_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Get topic summary

In [50]:
print('-------------------------------------------------------------')
# Summarize each extracted topic.
dataset = dataset                       # dataset name, carried over from an earlier cell
# Alternative full-text query (MySQL MATCH boolean format): '("Trump" OR "president")'
query = ''
custom_stop_words = ["real","hillary"]  # list of stop words (optional)
num_topics = 8        # number of topics to extract (optional, default 8)
num_keywords = 8      # keywords extracted per topic (optional, default 8)
summary_length = 6    # maximum number of bullet points per topic summary (optional, default 6)
context_amount = 0    # sentences of context around each key summary sentence (optional, default 0)
num_docs = 20         # maximum number of key documents used for summarization (optional, default 20)
# Document IDs to exclude, e.g. "docid1, docid2, ..., docidN" (optional).
# Assigned here but not passed to the call below.
excluded_docs = ''

# Optional arguments are collected once and expanded into keyword arguments.
summary_options = dict(
    query=query,
    custom_stop_words=custom_stop_words,
    num_topics=num_topics,
    num_keywords=num_keywords,
    summary_length=summary_length,
    context_amount=context_amount,
    num_docs=num_docs,
)

try:
    api_response = api_instance_topic.get_topic_summary_api(dataset, **summary_options)
except ApiException as summary_error:
    print("Exception when calling TopicsApi->get_topic_summary_api: %s\n" % summary_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')



-------------------------------------------------------------
{'results': [{'summary': "[{'title': 'D_Trump2015_6_19_12_10', 'sentences': "
                         "['@SMW5683: @realDonaldTrump Donald trump will do "
                         "this country  good!'], 'sourceid': 12301, "
                         "'attribute': {'time': 1434741000.0, 'counts': None, "
                         "'author': None}}, {'title': 'D_Trump2015_4_24_3_13', "
                         "'sentences': ['@FuturisticHub: @realDonaldTrump "
                         "DONALD TRUMP FOR PRESIDENT OF THE UNITED STATES!'], "
                         "'sourceid': 13378, 'attribute': {'time': "
                         "1429870380.0, 'counts': None, 'author': None}}, "
                         "{'title': 'D_Trump2015_4_10_6_26', 'sentences': "
                         "['@iamapatsfan: I would rather Donald Trump be the "
                         "president than Jeb Bush.'], 'sourceid': 13888, "
                    

## Get topic sentiment

In [51]:
print('-------------------------------------------------------------')
# Request the sentiment of each extracted topic.
dataset = dataset                       # dataset name, carried over from an earlier cell
# Alternative full-text query (MySQL MATCH boolean format): '("Trump" OR "president")'
query = ''
custom_stop_words = ["real","hillary"]  # list of stop words (optional)
num_topics = 8      # number of topics to extract (optional, default 8)
num_keywords = 8    # keywords extracted per topic (optional, default 8)
# Document IDs to exclude, e.g. "docid1, docid2, ..., docidN" (optional).
# Assigned here but not passed to the call below.
excluded_docs = ''

# Optional arguments are collected once and expanded into keyword arguments.
sentiment_options = dict(
    query=query,
    custom_stop_words=custom_stop_words,
    num_topics=num_topics,
    num_keywords=num_keywords,
)

try:
    api_response = api_instance_topic.get_topic_sentiment_api(dataset, **sentiment_options)
except ApiException as sentiment_error:
    print("Exception when calling TopicsApi->get_topic_sentiment_api: %s\n" % sentiment_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'results': [{'doc_id': '[2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, '
                        '22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, '
                        '37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, '
                        '51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 65, '
                        '66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, '
                        '80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, '
                        '94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, '
                        '107, 109, 110, 111, 112, 113, 114, 115, 116, 117, '
                        '118, 119, 120, 122, 123, 126, 127, 129, 130, 131, '
                        '132, 133, 134, 136, 137, 138, 139, 140, 141, 142, '
                        '143, 144, 145, 146, 147, 148, 149, 150, 151, 152, '
                        '153, 154, 155, 157, 158, 159, 161, 164, 165, 166

## Get topic consensus

In [52]:
print('-------------------------------------------------------------')
# Request the consensus of each extracted topic, restricted by a full-text query.
dataset = dataset                       # dataset name, carried over from an earlier cell
# Full-text query in MySQL MATCH boolean format,
# e.g. ("word1" OR "word2") AND ("word3" OR "word4") (optional).
query = '("Trump" OR "president")'
custom_stop_words = ["real","hillary"]  # list of stop words (optional)
num_topics = 8      # number of topics to extract (optional, default 8)
num_keywords = 8    # keywords extracted per topic (optional, default 8)
# Document IDs to exclude (optional). Assigned here but not passed to the call below.
excluded_docs = ['']

# Optional arguments are collected once and expanded into keyword arguments.
consensus_options = dict(
    query=query,
    custom_stop_words=custom_stop_words,
    num_topics=num_topics,
    num_keywords=num_keywords,
)

try:
    api_response = api_instance_topic.get_topic_consensus_api(dataset, **consensus_options)
except ApiException as consensus_error:
    print("Exception when calling TopicsApi->get_topic_consensus_api: %s\n" % consensus_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'results': [{'consensus': '0.492315642695774',
              'strength': '0.7209981196137677',
              'topic': 'donald trump;realdonaldtrump donald;trump will;mac '
                       'miller;hillary clinton;poll donald;jeb bush;vote '
                       'donald'},
             {'consensus': '0.5127957065616822',
              'strength': '0.06559755920686414',
              'topic': 'trump national;national doral;golf club;national '
                       'golf;blue monster;doral miami;monster '
                       'trump;realdonaldtrump trump'},
             {'consensus': '0.5330213743377867',
              'strength': '0.04739214803061891',
              'topic': 'trump international;golf links;international '
                       'hotel;international golf;hotel tower;links '
                       'scotland;trump golf;las vegas'},
             {'consensus': '0.5339868563590127',
              'stren

## Get author connectivity

In [53]:
print('-------------------------------------------------------------')
# Analyze the connectivity of one author within the dataset.
dataset = dataset                # dataset name, carried over from an earlier cell
target_author = 'D_Trump16'      # name of the author to be analyzed
# Full-text query (MySQL MATCH boolean format) naming the subject on which to
# focus the connectivity analysis (optional).
query = ''
# Words possibly used by the target author that are considered not
# information-bearing (optional).
custom_stop_words = ["real","hillary"]
time_period = ''                 # time period selection (optional)
metadata_selection = ''          # JSON object of {"metadata_field": ["selected_values"]} (optional)
excluded_docs = ['']             # document IDs to exclude from the analysis (optional)

# Optional arguments are collected once and expanded into keyword arguments.
connectivity_options = dict(
    query=query,
    custom_stop_words=custom_stop_words,
    time_period=time_period,
    metadata_selection=metadata_selection,
    excluded_docs=excluded_docs,
)

try:
    api_response = api_instance_topic.get_author_connectivity_api(
        dataset,
        target_author,
        **connectivity_options)
except ApiException as connectivity_error:
    print("Exception when calling TopicsApi->get_author_connectivity_api: %s\n" % connectivity_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'results': None}
-------------------------------------------------------------


# Document APIs

## Create API instance

In [54]:
# Print the section banner, then build the client object that every
# Document API cell below calls.
for banner_line in (
    '-------------------------------------------------------------',
    '--                Document API examples                    --',
    '-------------------------------------------------------------',
):
    print(banner_line)

document_api_client = nucleus_client.ApiClient(configuration)
api_instance_doc = nucleus_client.DocumentsApi(document_api_client)

-------------------------------------------------------------
--                Document API examples                    --
-------------------------------------------------------------


## Get document information without content

In [55]:
# Look up document metadata by title and by document ID.
dataset = dataset                         # dataset name, carried over from an earlier cell
doc_titles = ['D_Trump2018_8_18_1_47']    # titles of the documents to retrieve (optional)
doc_ids = ['11', '12', '20']              # docids of the documents to retrieve (optional)

# Both selectors are collected once and expanded into keyword arguments.
doc_selectors = dict(doc_titles=doc_titles, doc_ids=doc_ids)

try:
    api_response = api_instance_doc.get_doc_info(dataset, **doc_selectors)
except ApiException as info_error:
    print("Exception when calling DocumentsApi->get_doc_info: %s\n" % info_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

{'results': [{'attribute': {'author': None,
                            'source': None,
                            'time': '1534582020.0'},
              'sourceid': '10',
              'title': 'D_Trump2018_8_18_1_47'},
             {'attribute': {'author': None,
                            'source': None,
                            'time': '1534581960.0'},
              'sourceid': '11',
              'title': 'D_Trump2018_8_18_1_46'},
             {'attribute': {'author': None,
                            'source': None,
                            'time': '1534581420.0'},
              'sourceid': '12',
              'title': 'D_Trump2018_8_18_1_37'},
             {'attribute': {'author': None,
                            'source': None,
                            'time': '1534534680.0'},
              'sourceid': '20',
              'title': 'D_Trump2018_8_17_12_38'}]}
-------------------------------------------------------------


## Display document details

In [56]:
print('-------------------------------------------------------------')

# Retrieve the selected documents, selected by title and by document ID.
dataset = dataset                         # dataset name, carried over from an earlier cell
doc_titles = ['D_Trump2018_8_18_1_47']    # titles of the documents to retrieve (optional)
doc_ids = ['11']                          # docids of the documents to retrieve (optional)

try:
    api_response = api_instance_doc.get_doc_display(dataset, doc_titles=doc_titles, doc_ids=doc_ids)
    pprint(api_response)
except ApiException as e:
    # Fixed: the message used to name `get_doc_display_api`, which is not the
    # method called above.
    print("Exception when calling DocumentsApi->get_doc_display: %s\n" % e)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'results': [{'attribute': {'author': None,
                            'source': None,
                            'time': '1534582020.0'},
              'content': ' financial gain is a Federal Gratuity Statute '
                         'Violation Bribery Statute Violation Honest Services '
                         'Violation all Major Crimes because the DOJ is run by '
                         'BLANK Jeff Sessions ”  Gregg Jarrett. So when does '
                         'Mueller do what must be done? Probably never! '
                         '@FoxNews',
              'sourceid': '10',
              'title': 'D_Trump2018_8_18_1_47'},
             {'attribute': {'author': None,
                            'source': None,
                            'time': '1534581960.0'},
              'content': '“Bruce Ohr of DOJ is in legal jeopardy it’s '
                         'astonishing that he’s still employed. Bruce  Nelly '

## Get document recommendation

In [57]:
print('-------------------------------------------------------------')

# Request document recommendations, restricted by a full-text query.
dataset = dataset                       # dataset name, carried over from an earlier cell
# Full-text query in MySQL MATCH boolean format,
# e.g. ("word1" OR "word2") AND ("word3" OR "word4") (optional).
query = '("Trump" OR "president")'
custom_stop_words = ["real","hillary"]  # list of stop words (optional)
num_topics = 8      # number of topics to extract (optional, default 8)
num_keywords = 8    # keywords extracted per topic (optional, default 8)
# Document IDs to exclude, e.g. "docid1, docid2, ..., docidN" (optional).
# Assigned here but not passed to the call below.
excluded_docs = ''

# Optional arguments are collected once and expanded into keyword arguments.
recommend_options = dict(
    query=query,
    custom_stop_words=custom_stop_words,
    num_topics=num_topics,
    num_keywords=num_keywords,
)

try:
    api_response = api_instance_doc.get_doc_recommend_api(dataset, **recommend_options)
except ApiException as recommend_error:
    print("Exception when calling DocumentsApi->get_doc_recommend_api: %s\n" % recommend_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'results': [{'recommendations': [{'attribute': {'author': None,
                                                 'source': None,
                                                 'time': '1418220600.0'},
                                   'sourceid': '16389',
                                   'title': 'D_Trump2014_12_10_6_10'},
                                  {'attribute': {'author': None,
                                                 'source': None,
                                                 'time': '1434741000.0'},
                                   'sourceid': '12301',
                                   'title': 'D_Trump2015_6_19_12_10'},
                                  {'attribute': {'author': None,
                                                 'source': None,
                                                 'time': '1418248860.0'},
                                   'sourceid': '16388',
                

## Get document summary

In [58]:
print('-------------------------------------------------------------')

# Summarize a single document identified by its title.
dataset = dataset                       # dataset name, carried over from an earlier cell
doc_title = 'D_Trump2018_8_15_15_4'     # title of the document to be summarized
custom_stop_words = ["real","hillary"]  # list of stop words (optional)
summary_length = 6    # maximum number of bullet points in the summary (optional, default 6)
context_amount = 0    # sentences of context around each key summary sentence (optional, default 0)

# Optional arguments are collected once and expanded into keyword arguments.
doc_summary_options = dict(
    custom_stop_words=custom_stop_words,
    summary_length=summary_length,
    context_amount=context_amount,
)

try:
    api_response = api_instance_doc.get_doc_summary_api(dataset, doc_title, **doc_summary_options)
except ApiException as summary_error:
    print("Exception when calling DocumentsApi->get_doc_summary_api: %s\n" % summary_error)
else:
    pprint(api_response)

print('-------------------------------------------------------------')

-------------------------------------------------------------
{'doc_title': 'D_Trump2018_8_15_15_4',
 'summary': {'sentences': "['Our Country was built on Tariffs and Tariffs are "
                          'now leading us to great new Trade Deals - as '
                          'opposed to the horrible and unfair Trade Deals that '
                          "I inherited as your President.', 'Other Countries "
                          'should not be allowed to come in and steal the '
                          "wealth of our great U.S.A. No longer!']",
             'sourceid': '50'}}
-------------------------------------------------------------
