In [None]:
import json
import requests
from pprint import pprint

In [None]:
# Names of the four search resources this notebook creates; every later cell
# refers to them, so change them here only.
datasource_name, skillset_name = "ds-scans", "scan-ocr"
index_name, indexer_name = "scan-index", "scan-indexer"

Add the name and key of your search service: replace `<yourSearchInstance>` and `<yourAPIkey>` in the cell below.

In [None]:
#Setup the endpoint
# No trailing slash: every request below appends a path that starts with "/"
# (e.g. endpoint + "/indexes/..."), so a trailing slash here would yield "//".
endpoint = 'https://<yourSearchInstance>.search.windows.net'
# Sent with every request: JSON body plus the search service API key.
headers = {'Content-Type': 'application/json', 'api-key': '<yourAPIkey>' }
# Query-string parameters shared by every request (REST API version).
params = {'api-version': '2019-05-06'}

Add the full connection string to your storage account. This step assumes "cognitivesearch" as the container name and "scans" as the `query` value within that container. Replace those strings as well if your container or folder names are different.

In [None]:
# Register the Azure Blob data source that the indexer will read from.
# Paste the storage account connection string here before running the cell.
datasourceConnectionString = ""

# Container to read, plus the "query" value sent alongside it.
blob_container = {"name": "cognitivesearch", "query": "scans"}

datasource_payload = dict(
    name=datasource_name,
    description="Scanned pages from a book.",
    type="azureblob",
    credentials={"connectionString": datasourceConnectionString},
    container=blob_container,
)

# PUT the definition under the data source's own name and show the HTTP status.
datasource_url = f"{endpoint}/datasources/{datasource_name}"
r = requests.put(
    datasource_url,
    data=json.dumps(datasource_payload),
    headers=headers,
    params=params,
)
print(r.status_code)

In [None]:
#Create a skillset
# Skill chain, in order:
#   1. OCR each normalized image                      -> mitext (per image)
#   2. Merge the OCR text into the document content   -> /document/mergedText
#   3. Detect the language of the merged text         -> /document/languageCode
#   4. Extract key phrases from the merged text       -> /document/keyPhrases
#   5. Translate the merged text to German ("de")     -> /document/translatedText
skillset_payload = {
    "name": skillset_name,
    "description":
    "This skillset applies OCR on scanned book pages, extracts key phrases and the language from the text and eventually translates the text to a different language ",
    "skills": [
        {
            "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
            # NOTE(review): sent as an empty string - confirm the service accepts this value.
            "textExtractionAlgorithm": "",
            "lineEnding": "Space",
            "defaultLanguageCode": "en",
            # A real boolean so it serializes as JSON true rather than the string "true".
            "detectOrientation": True,
            "description": "scan-skill",
            "context": "/document/normalized_images/*",
            "inputs": [
                {
                    "name": "image",
                    "source": "/document/normalized_images/*"
                }
            ],
            "outputs": [
                {
                    "name": "text",
                    "targetName": "mitext"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
            "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field.",
            "context": "/document",
            "insertPreTag": " ",
            "insertPostTag": " ",
            "inputs": [
                {
                    "name": "text", "source": "/document/content"
                },
                {
                    "name": "itemsToInsert", "source": "/document/normalized_images/*/mitext"
                },
                {
                    "name": "offsets", "source": "/document/normalized_images/*/contentOffset"
                }
            ],
            "outputs": [
                {
                    "name": "mergedText", "targetName": "mergedText"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Text.LanguageDetectionSkill",
            "context": "/document",
            "inputs": [
                {
                    "name": "text",
                    "source": "/document/mergedText"
                }
            ],
            "outputs": [
                {
                    "name": "languageCode",
                    "targetName": "languageCode"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Text.KeyPhraseExtractionSkill",
            "context": "/document",
            "inputs": [
                {
                    "name": "text",
                    "source": "/document/mergedText"
                },
                {
                    "name": "languageCode",
                    "source": "/document/languageCode"
                }
            ],
            "outputs": [
                {
                    "name": "keyPhrases",
                    "targetName": "keyPhrases"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Text.TranslationSkill",
            "defaultToLanguageCode": "de",
            "context": "/document",
            "inputs": [
                {
                    "name": "text",
                    "source": "/document/mergedText"
                },
                {
                    "name": "fromLanguageCode",
                    "source": "/document/languageCode"
                }
            ],
            "outputs": [
                {
                    "name": "translatedText",
                    "targetName": "translatedText"
                }
            ]
        }
    ],

    # Cognitive Services resource attached to the skillset.
    # Never commit a real key or subscription/resource ID to the notebook:
    # fill in the placeholders locally before running this cell.
    "cognitiveServices": {
        "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
        "description": "<yourCognitiveServicesResourceId>",
        "key": "<yourCognitiveServicesKey>"
    },
}

# PUT the definition under the skillset's own name and show the HTTP status.
r = requests.put(endpoint + "/skillsets/" + skillset_name, data=json.dumps(skillset_payload), headers=headers, params=params)
print(r.status_code)

In [None]:
#Create an index
# Field attributes are real booleans so they serialize as JSON true/false
# rather than the strings "true"/"false".
index_payload = {
    "name": index_name,
    "fields": [
        # Document key; the indexer fills it from the base64-encoded blob path.
        {
            "name": "id",
            "type": "Edm.String",
            "key": True,
            "searchable": True,
            "filterable": False,
            "facetable": False,
            "sortable": True
        },
        # Content extracted from the blob itself.
        {
            "name": "content",
            "type": "Edm.String",
            "sortable": False,
            "searchable": True,
            "filterable": False,
            "facetable": False
        },
        # The remaining fields are filled by the indexer's output field mappings.
        {
            "name": "documentPath",
            "type": "Edm.String",
            "searchable": True,
            "filterable": False,
            "facetable": False
        },
        {
            "name": "extractedText",
            "type": "Edm.String",
            "searchable": True,
            "filterable": False,
            "facetable": False
        },
        {
            "name": "languageCode",
            "type": "Edm.String",
            "searchable": True,
            "filterable": False,
            "facetable": False
        },
        {
            "name": "keyPhrases",
            "type": "Collection(Edm.String)",
            "searchable": True,
            "filterable": False,
            "facetable": False
        },
        {
            "name": "translatedText",
            "type": "Edm.String",
            "searchable": True,
            "filterable": False,
            "facetable": False
        }
    ]
}

# PUT the definition under the index's own name and show the HTTP status.
r = requests.put(endpoint + "/indexes/" + index_name, data=json.dumps(index_payload), headers=headers, params=params)
print(r.status_code)

The next step, Create an indexer, is where all the deep processing occurs. This step takes several minutes to complete. 

In [None]:
# Create an indexer
# Ties the data source, skillset and index together.
indexer_payload = {
    "name": indexer_name,
    "dataSourceName": datasource_name,
    "targetIndexName": index_name,
    "skillsetName": skillset_name,
    # Source-document fields copied straight into the index.
    "fieldMappings": [
        {
            # The blob path becomes the document key, base64-encoded.
            "sourceFieldName": "metadata_storage_path",
            "targetFieldName": "id",
            "mappingFunction": {"name": "base64Encode"}
        },
        {
            "sourceFieldName": "content",
            "targetFieldName": "content"
        }
    ],
    # Enriched values copied into the index.
    "outputFieldMappings": [
        {
            # Leading slash added so this path has the same "/document/..." form
            # as the other output mappings.
            "sourceFieldName": "/document/metadata_storage_path",
            "targetFieldName": "documentPath"
        },
        {
            "sourceFieldName": "/document/mergedText",
            "targetFieldName": "extractedText"
        },
        {
            "sourceFieldName": "/document/languageCode",
            "targetFieldName": "languageCode"
        },
        {
            "sourceFieldName": "/document/keyPhrases",
            "targetFieldName": "keyPhrases"
        },
        {
            "sourceFieldName": "/document/translatedText",
            "targetFieldName": "translatedText"
        }
    ],
    "parameters": {
        # NOTE(review): -1 presumably means "no limit on failed items" - confirm against the API docs.
        "maxFailedItems": -1,
        "maxFailedItemsPerBatch": -1,
        "configuration": {
            "dataToExtract": "contentAndMetadata",
            # Provides the /document/normalized_images/* inputs used by the OCR skill.
            "imageAction": "generateNormalizedImages"
        }
    }
}

# PUT the definition under the indexer's own name; print the full response body
# so any error message returned by the service is visible.
r = requests.put(endpoint + "/indexers/" + indexer_name, data=json.dumps(indexer_payload), headers=headers, params=params)
print(r.content)


In [None]:
#Get indexer status
r = requests.get(endpoint + "/indexers/" + indexer_name + "/status", headers=headers,params=params)
# print, not pprint: json.dumps already returns an indented string, and pprint
# would show that string's quoted repr with escaped newlines instead.
print(json.dumps(r.json(), indent=1))

In [None]:
# Fetch the index definition (GET /indexes/<name>) and show it as indented JSON.
index_url = f"{endpoint}/indexes/{index_name}"
r = requests.get(index_url, headers=headers, params=params)
index_definition = r.json()
print(json.dumps(index_definition, indent=1))

In [None]:
#Query the index to return the key phrases extracted from each document
#Note: Index creation may take time. If this step returns no data, wait a few minutes
#      and then try again
# $select must name a field that exists in the index; "keyPhrases" is one of the
# fields defined in the index created above.
r = requests.get(endpoint + "/indexes/" + index_name + "/docs?search=*&$select=keyPhrases", headers=headers, params=params)
pprint(r.json())