# Speech recognition with IBM Watson

**Objectives:**
* Getting hands-on experience with the speech-to-text service developed by IBM Watson
* Building a speech recognition application using Watson's API

## Speech recognition using Speech to Text API

In [3]:
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

In [4]:
import os

# Credentials are read from the environment so that no secret is stored in the notebook.
# NOTE(review): the API key that used to be hardcoded on this line has been exposed
# and should be rotated in the IBM Cloud console.
_api_key = os.environ.get('SPEECH_TO_TEXT_APIKEY')
if not _api_key:
    raise RuntimeError(
        'Set the SPEECH_TO_TEXT_APIKEY environment variable to your IBM Cloud API key '
        'before running this notebook.'
    )

authenticator = IAMAuthenticator(_api_key)
speech_to_text = SpeechToTextV1(authenticator=authenticator)

# The service URL is instance-specific; set SPEECH_TO_TEXT_URL to point at another instance.
speech_to_text.set_service_url(
    os.environ.get(
        'SPEECH_TO_TEXT_URL',
        'https://api.eu-gb.speech-to-text.watson.cloud.ibm.com/instances/22432170-bcfc-4a08-8e41-ae7de1da6b17',
    )
)

### We can test the Speech to Text service with a speech clip, e.g., arctic_a0005.wav, as follows:

In [5]:
import json

# Send the WAV clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/arctic_a0005.wav', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/wav')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "will we ever forget it ",
          "confidence": 0.95
        }
      ]
    }
  ]
}


### The speech recognition result can be saved to a file (in JSON format) as follows:

In [6]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/arctic_a0005.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

### To load the result from file, we can use the following code:

In [7]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/arctic_a0005.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'will we ever forget it ', 'confidence': 0.95}]}]}


### (a). Using speech clip- "367-130732-0000": 

In [8]:
# Send the FLAC clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/367-130732-0000.flac', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/flac')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "lobsters and law officers ",
          "confidence": 0.74
        }
      ]
    }
  ]
}


In [9]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/367-130732-0000.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [10]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/367-130732-0000.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'lobsters and law officers ', 'confidence': 0.74}]}]}


### (b). Using speech clip- "367-130732-0001": 

In [11]:
# Send the FLAC clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/367-130732-0001.flac', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/flac')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "when is a lobster not a lobster when it is a crayfish ",
          "confidence": 0.87
        }
      ]
    }
  ]
}


In [12]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/367-130732-0001.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [13]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/367-130732-0001.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'when is a lobster not a lobster when it is a crayfish ', 'confidence': 0.87}]}]}


### (c). Using speech clip- "367-130732-0004": 

In [14]:
# Send the FLAC clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/367-130732-0004.flac', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/flac')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "a book could be written about this restaurant and then all would not be told for all its secrets can never be known ",
          "confidence": 0.92
        }
      ]
    }
  ]
}


In [15]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/367-130732-0004.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [16]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/367-130732-0004.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'a book could be written about this restaurant and then all would not be told for all its secrets can never be known ', 'confidence': 0.92}]}]}


### (d). Using speech clip- "arctic_a0001": 

In [17]:
# Send the WAV clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/arctic_a0001.wav', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/wav')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "author of the danger trail Phillips steals etcetera ",
          "confidence": 0.78
        }
      ]
    }
  ]
}


In [18]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/arctic_a0001.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [19]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/arctic_a0001.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'author of the danger trail Phillips steals etcetera ', 'confidence': 0.78}]}]}


### (e). Using speech clip- "arctic_a0003": 

In [20]:
# Send the WAV clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/arctic_a0003.wav', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/wav')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "for the twentieth time that evening the two men shook hands ",
          "confidence": 0.86
        }
      ]
    }
  ]
}


In [21]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/arctic_a0003.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [22]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/arctic_a0003.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'for the twentieth time that evening the two men shook hands ', 'confidence': 0.86}]}]}


### (f). Using speech clip- "p232_009": 

In [23]:
# Send the WAV clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/p232_009.wav', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/wav')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "there is according to legend a boiling pot of gold at one end ",
          "confidence": 0.98
        }
      ]
    }
  ]
}


In [24]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/p232_009.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [25]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/p232_009.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'there is according to legend a boiling pot of gold at one end ', 'confidence': 0.98}]}]}


### (g). Using speech clip- "p232_010": 

In [26]:
# Send the WAV clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/p232_010.wav', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/wav')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "no one ever finds ",
          "confidence": 0.19
        }
      ]
    }
  ]
}


In [27]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/p232_010.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [28]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/p232_010.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'no one ever finds ', 'confidence': 0.19}]}]}


### (h). Using speech clip- "p232_014": 

In [29]:
# Send the WAV clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/p232_014.wav', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/wav')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "the Hebrews it was a token that there would be no more universal floods ",
          "confidence": 0.92
        }
      ]
    }
  ]
}


In [30]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/p232_014.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [31]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/p232_014.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'the Hebrews it was a token that there would be no more universal floods ', 'confidence': 0.92}]}]}


### (i). Using speech clip- "p232_030": 

In [32]:
# Send the WAV clip to the Speech to Text service and pretty-print the returned result.
with open('SpeechtoTextData/p232_030.wav', 'rb') as clip:
    response = speech_to_text.recognize(audio=clip, content_type='audio/wav')
    speech_recognition_results = response.get_result()

print(json.dumps(speech_recognition_results, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "they were described as being in a serious but stable condition yesterday ",
          "confidence": 0.94
        }
      ]
    }
  ]
}


In [33]:
# Write the current `speech_recognition_results` to a JSON file.
with open('SpeechtoTextData/p232_030.json', 'w') as sink:
    sink.write(json.dumps(speech_recognition_results))

In [34]:
# Read the saved recognition result back from the JSON file and print it.
with open('SpeechtoTextData/p232_030.json') as saved:
    data = json.loads(saved.read())

print(data)

{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'they were described as being in a serious but stable condition yesterday ', 'confidence': 0.94}]}]}


**Using IBM Watson's Speech to Text API, we transcribed speech clips in two audio formats (WAV and FLAC) and obtained a transcript and a confidence score for each clip.**

**We then saved each result to a JSON file, loaded it back, and printed the data.**