In [1]:
from transformers import pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the first CUDA GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Audio-classification pipeline that launch_fn uses to spot the wake word.
classifier = pipeline(
    "audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
    device=device,
)

# Show which label the model config stores under class id 27.
classifier.model.config.id2label[27]

Device set to use cpu


'marvin'

In [3]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live


def launch_fn(
    wake_word="marvin",
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):
    """Listen on the microphone until the wake word is the top prediction.

    Args:
        wake_word: Label to wait for; must be a key of
            ``classifier.model.config.label2id``.
        prob_threshold: Score the top prediction must strictly exceed for the
            wake word to count as detected.
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live``.
        debug: When true, print the top prediction of every classified chunk.

    Returns:
        ``True`` as soon as the wake word is detected, ``False`` if the
        prediction stream ends without a detection.

    Raises:
        ValueError: If ``wake_word`` is not one of the classifier's labels.
    """
    valid_labels = classifier.model.config.label2id.keys()
    if wake_word not in valid_labels:
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {valid_labels}."
        )

    # Record at the rate the classifier's feature extractor is configured for.
    mic = ffmpeg_microphone_live(
        sampling_rate=classifier.feature_extractor.sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    for predictions in classifier(mic):
        # Only the first entry of each result is inspected.
        top = predictions[0]
        if debug:
            print(top)
        if top["label"] == wake_word and top["score"] > prob_threshold:
            return True

    # The stream ended without a detection; report that explicitly instead
    # of falling off the end of the function with an implicit None.
    return False

In [4]:
# Try the wake-word loop; debug=True prints the top prediction for each chunk.
launch_fn(debug=True)

Listening for wake word...
Using microphone: Microphone (Realtek High Definition Audio)
{'score': 0.054580558091402054, 'label': 'no'}
{'score': 0.19254353642463684, 'label': 'bed'}
{'score': 0.13133899867534637, 'label': 'down'}
{'score': 0.12622086703777313, 'label': 'down'}
{'score': 0.12622086703777313, 'label': 'down'}
{'score': 0.12622086703777313, 'label': 'down'}
{'score': 0.12622086703777313, 'label': 'down'}
{'score': 0.1726192831993103, 'label': 'stop'}
{'score': 0.132925346493721, 'label': 'up'}
{'score': 0.132925346493721, 'label': 'up'}
{'score': 0.10188263654708862, 'label': 'bed'}
{'score': 0.11977922916412354, 'label': 'down'}
{'score': 0.11977922916412354, 'label': 'down'}
{'score': 0.11977922916412354, 'label': 'down'}
{'score': 0.11227765679359436, 'label': 'bed'}
{'score': 0.11328096687793732, 'label': 'down'}
{'score': 0.11328096687793732, 'label': 'down'}
{'score': 0.11328096687793732, 'label': 'down'}
{'score': 0.10661307722330093, 'label': 'down'}
{'score': 0.1

True

In [5]:
# Speech-recognition pipeline used by transcribe(); placed on the same `device` as the classifier.
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
)
# Bare expression so the notebook displays the pipeline object's repr.
transcriber

Device set to use cpu


<transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline at 0x13eae8571d0>

In [6]:
import sys


def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
    """Transcribe live microphone audio and return the last text produced.

    Intermediate transcriptions are echoed on a single console line while the
    stream is consumed. The loop stops at the first result whose
    ``item["partial"][0]`` is falsy.
    NOTE(review): presumably that marks the end of a full chunk — confirm
    against the ``ffmpeg_microphone_live`` documentation.

    Args:
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live``.

    Returns:
        The ``"text"`` of the last result seen, or ``""`` if the stream
        produced no results at all.
    """
    # Record at the rate the transcriber's feature extractor is configured for.
    mic = ffmpeg_microphone_live(
        sampling_rate=transcriber.feature_extractor.sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Start speaking...")
    # Default result so an empty stream returns "" instead of raising
    # UnboundLocalError on the loop variable.
    text = ""
    for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
        text = item["text"]
        # ANSI "erase to end of line", then rewrite the same line via "\r".
        sys.stdout.write("\033[K")
        print(text, end="\r")
        if not item["partial"][0]:
            break

    return text

In [16]:
# Try the transcriber on live microphone input with the default chunk settings.
transcribe()

Start speaking...
Using microphone: Microphone (Realtek High Definition Audio)
[K What is hugging face?

' What is hugging face?'

In [10]:
from huggingface_hub import InferenceClient


import os

# Read the Hugging Face token from the environment instead of embedding it in
# the notebook. NOTE: a token was previously hard-coded here; treat it as
# leaked and revoke it.
client = InferenceClient(
    provider="novita",
    api_key=os.environ["HF_TOKEN"],  # raises KeyError if the variable is unset
)

# Running chat transcript shared by every query_inference call; it starts
# with the system prompt only.
conversation_history = [
    {"role": "system", "content": "You are a helpful assistant."}
]

def query_inference(messages):
    """Send one user turn to the chat model and return the assistant's reply.

    The request contains the whole of ``conversation_history`` plus the new
    user turn. The shared history is updated only after the request succeeds,
    so a failed call does not leave an unanswered user turn behind.

    Args:
        messages: Content of the new user turn (the notebook passes a single
            string, despite the plural name).

    Returns:
        The content of the first choice in the completion.

    Raises:
        Whatever ``client.chat.completions.create`` raises; the history is
        left untouched in that case.
    """
    user_turn = {"role": "user", "content": messages}

    completion = client.chat.completions.create(
        model="deepseek-ai/DeepSeek-V3-0324",
        # Build a fresh list so the shared history is not mutated yet.
        messages=conversation_history + [user_turn],
    )

    assistant_message = completion.choices[0].message["content"]

    # Commit both sides of the exchange together.
    conversation_history.extend(
        [user_turn, {"role": "assistant", "content": assistant_message}]
    )

    return assistant_message

# Example usage: ask a first question and print the reply.
user_input = "What is Hugging Face library in simple terms?"
response = query_inference(user_input)
print("Assistant:", response)

# Each call adds to conversation_history, so this follow-up is sent together
# with the previous exchange.
user_input_2 = "How do I use it for NLP?"
response_2 = query_inference(user_input_2)
print("Assistant:", response_2)


Assistant: Hugging Face is a popular open-source library for **natural language processing (NLP)** that makes it easy to use and share state-of-the-art AI models for tasks like:

- Text generation (e.g., chatbots, story writing)
- Translation (e.g., English to French)
- Sentiment analysis (e.g., detecting positive/negative tone)
- Question answering (e.g., extracting answers from text)

### In Simple Terms:
Hugging Face is like a **"GitHub for AI models"** where you can:
1. **Download pre-trained models** (like ChatGPT alternatives) with just a few lines of code.
2. **Fine-tune models** on your own data (e.g., train a chatbot on medical texts).
3. **Share your models** with others in the AI community.

### Key Features:
- **Transformers Library**: Provides thousands of pre-trained models (e.g., BERT, GPT, T5).
- **Easy-to-use APIs**: Simple Python code to run complex AI tasks.
- **Model Hub**: A repository of free, ready-to-use models.
- **Datasets & Tokenizers**: Tools to preprocess t

In [11]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Processor that synthesise() calls to turn text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model and the vocoder handed to generate_speech, both moved to `device`.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

In [12]:
from datasets import load_dataset

# Load the validation split of the x-vector dataset and take entry 7306 as the
# speaker embedding; unsqueeze(0) adds a leading dimension of size 1.
# NOTE(review): which speaker index 7306 corresponds to is not shown here —
# confirm against the dataset card.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating validation split: 100%|██████████| 7931/7931 [00:00<00:00, 27399.64 examples/s]


In [18]:
import torch

def synthesise(text, chunk_size=500):
    """Convert text to a speech waveform, synthesising it chunk by chunk.

    The text is split on whitespace into pieces of at most ``chunk_size``
    characters, so words are not cut in half at chunk boundaries (a single
    word longer than ``chunk_size`` is still broken). Runs of whitespace,
    including newlines, are collapsed to single spaces by the wrapping step.

    Args:
        text: Text to speak.
        chunk_size: Maximum number of characters per synthesised chunk;
            must be positive.

    Returns:
        A CPU tensor holding the audio of all chunks concatenated along the
        last dimension, or an empty tensor when ``text`` contains nothing to
        speak.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    import textwrap  # local import keeps this cell self-contained

    text_chunks = textwrap.wrap(text, width=chunk_size)
    if not text_chunks:
        # torch.cat rejects an empty list, so return "no audio" explicitly.
        return torch.zeros(0)

    # Loop-invariant: move the speaker embedding to the device once.
    embeddings = speaker_embeddings.to(device)

    audios = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"].to(device),
                embeddings,
                vocoder=vocoder,
            )
        audios.append(speech.cpu())

    # Stitch the per-chunk waveforms into one continuous signal.
    return torch.cat(audios, dim=-1)


In [19]:
from IPython.display import Audio

# Synthesise a fixed sentence as a quick check of the text-to-speech step.
audio = synthesise(
    "Hugging Face is a company that provides natural language processing and machine learning tools for developers."
)

# Render an inline audio player; 16000 is the sample rate handed to IPython.
# NOTE(review): presumably this matches the vocoder's output rate — confirm.
Audio(audio, rate=16000)

In [20]:
# End-to-end run: wait for the wake word, transcribe the spoken query, ask the
# chat model, then synthesise the reply.
# NOTE(review): launch_fn's return value is ignored, so the following steps
# run whenever it returns.
launch_fn()
transcription = transcribe()
response = query_inference(transcription)
audio = synthesise(response)

# autoplay=True asks the player to start as soon as it is displayed.
Audio(audio, rate=16000, autoplay=True)

Listening for wake word...
Using microphone: Microphone (Realtek High Definition Audio)
Start speaking...
Using microphone: Microphone (Realtek High Definition Audio)
[K What is Hagging Face?