# Task 1: Download and transcribe the given audio file using speech-to-text recognition.


In [None]:
#Q1
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Load the pre-trained Wav2Vec model and tokenizer
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-large-960h')
tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-large-960h')

# Load the audio file. torchaudio.load returns the waveform as a
# (channels, frames) tensor together with the file's sample rate; the rate
# is kept so the signal can be brought to what the model expects.
audio, sample_rate = torchaudio.load('sales_call_telephone_marketers.wav')

# Down-mix to a single 1-D mono signal. Passing the 2-D (channels, frames)
# array straight to the tokenizer would not be treated as one utterance.
audio = audio.mean(dim=0)

# The wav2vec2 960h checkpoints are trained on 16 kHz speech, so resample
# when the recording uses a different rate (telephone audio is often 8 kHz).
target_sample_rate = 16000
if sample_rate != target_sample_rate:
    audio = torchaudio.functional.resample(audio, sample_rate, target_sample_rate)

# Convert the audio to the required format (a batch of one utterance)
input_values = tokenizer(audio.numpy(), return_tensors='pt').input_values

# Transcribe the audio file; no gradients are needed for inference
with torch.no_grad():
    logits = model(input_values).logits
# Greedy CTC decoding: take the most likely token at every frame
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]
print(transcription)

# Task 2: Train an NLU model to classify intents and recognize entities.


In [None]:

import spacy
from spacy.tokens import DocBin

# Load the pre-trained spaCy English language model
nlp = spacy.load('en_core_web_sm')

# Create a DocBin to store the labeled data
doc_bin = DocBin()

# Example sentences used as (model-annotated) training data
example_texts = [
    'My name is Jeff and I am calling from Amazon.',
    'I am calling from Microsoft and my name is Satya.',
    'I am Sundar and this is call from Google.',
    'I am calling about your Microsoft Azure subscription.',
    'This is a call regarding your Google Cloud Platform account.',
]

# Add the labeled example sentences to the DocBin.
# The pipeline already populates doc.ents with Span objects, so the docs are
# stored as produced. Re-assigning doc.ents with (label, text) string tuples
# is rejected by spaCy and would abort the cell on the first example.
for example_text in example_texts:
    doc = nlp(example_text)
    doc_bin.add(doc)

# Notebook cell output: show the last processed Doc
doc

# Task 3: Separate the sentences in the output of task 1. On each sentence, apply the model trained in task 2 to classify its intent and recognize the entities present in it.

In [None]:
# task 3
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import spacy
from spacy.tokens import DocBin
import json

# Load the pre-trained Wav2Vec model and tokenizer
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-large-960h')
tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-large-960h')

# Load the pre-trained spaCy English language model
nlp = spacy.load('en_core_web_sm')

# Load the NLU model trained in Q2
# NOTE(review): this expects a pipeline saved under 'nlu_model' on disk and
# with a text-categorizer component (its .cats are read below) -- confirm
# that the Task 2 training step actually writes it.
nlp2 = spacy.load('nlu_model')

# Load the audio file, keeping its sample rate
# NOTE(review): Task 1 transcribes 'sales_call_telephone_marketers.wav';
# confirm that 'audio_file.wav' is the intended input here.
audio, sample_rate = torchaudio.load('audio_file.wav')

# Down-mix the (channels, frames) tensor to one 1-D mono signal so that the
# tokenizer sees a single utterance.
audio = audio.mean(dim=0)

# The wav2vec2 960h checkpoints are trained on 16 kHz speech; resample when
# the recording uses a different rate.
target_sample_rate = 16000
if sample_rate != target_sample_rate:
    audio = torchaudio.functional.resample(audio, sample_rate, target_sample_rate)

# Convert the audio to the required format
input_values = tokenizer(audio.numpy(), return_tensors='pt').input_values

# Transcribe the audio file and split the sentences
with torch.no_grad():
    logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]

# The CTC vocabulary of this checkpoint has no punctuation, so splitting on
# '. ' would leave the whole transcript as one "sentence". Use spaCy's
# sentence boundaries instead and drop empty fragments.
sentences = [
    span.text.strip()
    for span in nlp(transcription).sents
    if span.text.strip()
]

# Classify the intent and recognize the entities for each sentence
output = []
for sentence in sentences:
    doc = nlp(sentence)
    intent = nlp2(sentence).cats
    entities = [(ent.label_, ent.text) for ent in doc.ents]
    output.append({'sentence': sentence, 'intent': intent, 'entities': entities})

# Export the output in a JSON file
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(output, f)


# Task 4: For the whole text generated from the audio file, generate a summary report.

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter

# Clean the text
# `text` was never assigned in this file; the text to summarize is the
# transcript produced by the speech-to-text step (`transcription`).
text = transcription.lower() # convert to lowercase
text = text.replace('\n', ' ') # remove newlines
text = text.replace('\r', ' ') # remove carriage returns

# Tokenize the text
# NOTE: sent_tokenize/word_tokenize need the NLTK tokenizer data installed.
sentences = sent_tokenize(text)
words = word_tokenize(text)

# Count word frequencies
word_counts = Counter(words)

# Calculate sentence scores: a sentence's score is the sum of the
# whole-text frequencies of its tokens
sentence_scores = [
    (sentence, sum(word_counts[word] for word in word_tokenize(sentence)))
    for sentence in sentences
]

# Select top sentences
n = 3 # desired length of summary
# Rank sentence positions by score; the sort is stable, so ties keep their
# original relative order.
top_indices = sorted(
    range(len(sentence_scores)),
    key=lambda i: sentence_scores[i][1],
    reverse=True,
)[:n]
top_sentences = [sentence_scores[i] for i in top_indices]

# Concatenate selected sentences in the order they occur in the text, so the
# summary reads in the same sequence as the original call.
summary = ' '.join(sentence_scores[i][0] for i in sorted(top_indices))

print(summary)
