# Search and Analyze with Open Collections API

### Import and set up all the things (again)

In [1]:
import json, requests, math, re, string, nltk

# allow matplotlib to run in-line
% matplotlib inline 

nltk.download("punkt") # Word tokenizer
nltk.download("stopwords") # Stop words
from nltk import word_tokenize

ocUrl = 'https://open.library.ubc.ca/'
ocApiUrl = 'https://oc-index.library.ubc.ca' # APPY URL

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Set our API Key

You can get your own API key at https://open.library.ubc.ca/research

In [2]:
# Key appended to the search URL when the request is POSTed.
# NOTE(review): this looks like a real key stored in the notebook; consider
# substituting your own and keeping it out of version control.
apiKey = 'ac40e6c2cb345593ed1691e0a8b601bba398e42d85f81f893c5ab709cec63c6c'

### Search Query

In [3]:
# The inner double quotes are part of the query text itself
# (presumably to request an exact phrase match; confirm against the API docs).
query = '"Master of Journalism"'

### Repositories

__Available repositories__

* __oc__ - all repositories
* __dsp__ - only DSpace / cIRcle
* __cdm__ - only ContentDM
* __atm__ - only AtoM

In [4]:
# 'dsp' restricts the search to DSpace / cIRcle (see the list above).
repo = 'dsp'

### Build the search

Use the query builder at https://open.library.ubc.ca/research to build a query.

In [5]:
# Request payload for the /search endpoint, written as one nested literal.
# Key order matches the order in which the keys were originally assigned.
search = {
    # Paging window (presumably Elasticsearch-style offset / page size)
    'from': 0,
    'size': 10,
    'type': 'object',
    'body': {
        # Sort settings: order hits by score, descending
        'sort': {
            '_score': {
                'order': 'desc',
            },
        },
        # Fields to return
        'fields': [
            'title',
            'ubc.transcript',
            'description',
            'ubc.internal.provenance.nick',
        ],
        # Query String
        'query': {
            'query_string': {
                'query': query,
            },
        },
    },
    # Set the repo
    'index': repo,
}

# Serialized form of the payload. This used to be json.JSONEncoder(search),
# which passes the dict as a positional argument; the JSONEncoder constructor
# is keyword-only on Python 3.6+, so that call raises TypeError.
jsonSearch = json.dumps(search)

print(json.dumps(search, indent=4, sort_keys=True))

{
    "body": {
        "fields": [
            "title",
            "ubc.transcript",
            "description",
            "ubc.internal.provenance.nick"
        ],
        "query": {
            "query_string": {
                "query": "\"Master of Journalism\""
            }
        },
        "sort": {
            "_score": {
                "order": "desc"
            }
        }
    },
    "from": 0,
    "index": "dsp",
    "size": 10,
    "type": "object"
}


### POST the search

In [None]:
# POST the search payload to the /search endpoint and decode the JSON reply.
searchUrl = ocApiUrl+'/search?apiKey='+apiKey

# A timeout keeps the cell from hanging forever on an unresponsive server.
httpResponse = requests.post(searchUrl, json=search, timeout=30)
# Fail loudly on an HTTP error status instead of trying to parse an error
# page as if it were a search result.
httpResponse.raise_for_status()
apiResponse = httpResponse.json()

print(json.dumps(apiResponse, indent=4, sort_keys=True))

{
    "api_code": 3000,
    "api_text": "Endpoint: /search | API Version: 1",
    "data": {
        "data": {
            "_shards": {
                "failed": 0,
                "successful": 2540,
                "total": 2540
            },
            "hits": {
                "hits": [
                    {
                        "_id": "1.0071717",
                        "_index": "dsp.24-2015-09-29",
                        "_score": 0.07468312,
                        "_type": "object",
                        "fields": {
                            "description": [
                                "Academic sources are among the most potent sources a journalist can bring to bear on a subject, carrying auras of both authority and objectivity. Yet as serious flaws in the media\u2019s coverage of issues like climate change and the health consequences of cigarettes show, they are not always well used. This project is based in the belief that who sources are and how they are port

### Get just the Items

In [None]:
# The list of hits sits under data -> data -> hits -> hits in the response.
apiItems = apiResponse['data']['data']['hits']['hits']
print(apiItems)



### Parse items and clean full text

In [None]:
# Turn each raw hit into a flat record with cleaned, tokenized full text.

# Matches runs of non-word characters and underscores. Written as a raw
# string so "\W" is not read as a string escape, and compiled once rather
# than on every loop iteration.
pattern = re.compile(r'[\W_]+')

items = []
for apiItem in apiItems:
    # A hit may lack a requested field (presumably when the document has no
    # value for it), so fall back to an empty string instead of raising.
    fields = apiItem.get('fields', {})

    item = dict()
    item['id'] = apiItem['_id']
    item['title'] = fields.get('title', [''])[0]
    item['description'] = fields.get('description', [''])[0]
    item['collection'] = fields.get('ubc.internal.provenance.nick', [''])[0]

    # Clean Full Text: lower-case, then collapse punctuation/whitespace runs
    # into single spaces.
    cleanFullText = fields.get('ubc.transcript', [''])[0].lower()
    cleanFullText = pattern.sub(' ', cleanFullText)

    item['fullText'] = cleanFullText
    item['words'] = word_tokenize(cleanFullText)
    items.append(item)

print(items)

### Item with most words?

In [None]:
# Find the item whose tokenized full text contains the most words.
mostWords = 0
winner = 0

wordCounts = [len(item['words']) for item in items]
if wordCounts:
    mostWords = max(wordCounts)
    # index() returns the first position holding the maximum, so ties go to
    # the earliest item, as with a strict ">" comparison in a loop.
    winner = wordCounts.index(mostWords)
    print("Winner is "+ocUrl+'collections/'+items[winner]['collection']+'/items/'+items[winner]['id'] +
          " with "+str(mostWords)+ " words!")
else:
    # With no search results, items[winner] would raise IndexError.
    print("No items to compare.")

### Item with most unique words?

In [None]:
# Find the item whose tokenized full text contains the most distinct words.
mostWords = 0
winner = 0

# Build each item's set of distinct words once.
uniqueCounts = [len(set(item['words'])) for item in items]
if uniqueCounts:
    mostWords = max(uniqueCounts)
    # index() returns the first position holding the maximum, so ties go to
    # the earliest item, as with a strict ">" comparison in a loop.
    winner = uniqueCounts.index(mostWords)
    print("Winner is "+ocUrl+'collections/'+items[winner]['collection']+'/items/'+items[winner]['id'] +
          " with "+str(mostWords)+ " unique words!")
else:
    # With no search results, items[winner] would raise IndexError.
    print("No items to compare.")

### Combining all the words

In [None]:
# Flatten every item's token list into a single list, keeping item order.
allWords = [word for entry in items for word in entry['words']]
print(str(len(allWords)) + " words in total")
# print(allWords)

### Searching within the full text

In [None]:
# Term to look for in the combined text.
# NOTE: this rebinds `search`, which previously held the request payload
# dict; re-run the "Build the search" cell before POSTing the search again.
search = "truth"
# search = "the"
# Wrap the flat token list in an nltk.Text, which the following cells query.
text = nltk.Text(allWords)
# Count of the term in the text (displayed as the cell's result).
text.count(search)

### Percentage of full text that the search takes up

In [None]:
# Share of all tokens that equal the search term, as a percentage.
# Guarded so an empty token list gives 0.0 instead of ZeroDivisionError.
100.0*allWords.count(search)/len(allWords) if allWords else 0.0

### Concordance search on the full text

In [None]:
# Print occurrences of the search term together with their surrounding context.
text.concordance(search)

### Lexical dispersion of search

In [None]:
import numpy
# allow visuals to show up in this interface-
% matplotlib inline 
text.dispersion_plot([search])

### Words used similarly to our search

In [None]:
# Print words that appear in the same contexts as the search term
# (NLTK's distributional similarity; see the nltk.Text.similar docs).
text.similar(search)

### Collocations

In [None]:
# Print collocations (word pairs that frequently occur together) found in the text.
text.collocations()

### Lexical dispersion of search