# Multimodal HCI (Speech and Gesture)

## Importing Libraries

In [1]:
import cv2
import numpy as np
import mediapipe as mp
import time
import autopy
import threading
from __future__ import division
import re
import sys
from google.cloud import speech
import pyaudio
from six.moves import queue
import os
import win32api, win32con
import keyboard
import json

## Speech Commands

In [2]:
def mouse_left_click():
    """Send a left-button press followed, 10 ms later, by its release.

    Both events are sent through ``win32api.mouse_event`` with zero x/y
    offsets.
    """
    press, release = win32con.MOUSEEVENTF_LEFTDOWN, win32con.MOUSEEVENTF_LEFTUP
    win32api.mouse_event(press, 0, 0)
    time.sleep(0.01)  # short hold between the down and up events
    win32api.mouse_event(release, 0, 0)

def mouse_right_click():
    """Send a right-button press followed, 10 ms later, by its release.

    Both events are sent through ``win32api.mouse_event`` with zero x/y
    offsets.
    """
    press, release = win32con.MOUSEEVENTF_RIGHTDOWN, win32con.MOUSEEVENTF_RIGHTUP
    win32api.mouse_event(press, 0, 0)
    time.sleep(0.01)  # short hold between the down and up events
    win32api.mouse_event(release, 0, 0)
    
def scroll(clicks=0, delta_x=0, delta_y=0):
    """Emit ``abs(clicks)`` mouse-wheel events, pausing 50 ms after each.

    Args:
        clicks: Number of wheel notches. A positive value sends
            ``+WHEEL_DELTA`` per event, anything else sends ``-WHEEL_DELTA``.
            Zero sends nothing.
        delta_x: Forwarded as the x argument of every wheel event.
        delta_y: Forwarded as the y argument of every wheel event.
    """
    direction = 1 if clicks > 0 else -1
    wheel_step = win32con.WHEEL_DELTA * direction

    for _notch in range(abs(clicks)):
        win32api.mouse_event(
            win32con.MOUSEEVENTF_WHEEL, delta_x, delta_y, wheel_step, 0
        )
        time.sleep(0.05)

def copy_item():
    """Left-click first, then send the Ctrl+C hotkey."""
    hotkey = "ctrl+c"
    mouse_left_click()
    keyboard.send(hotkey)

def cut_item():
    """Left-click first, then send the Ctrl+X hotkey."""
    hotkey = "ctrl+x"
    mouse_left_click()
    keyboard.send(hotkey)

def paste_item():
    """Left-click first, then send the Ctrl+V hotkey."""
    hotkey = "ctrl+v"
    mouse_left_click()
    keyboard.send(hotkey)

## Gesture Recognition System

In [3]:
def gesture_recognition():
    """Drive the mouse pointer from a hand tracked in the default webcam.

    Every captured frame is run through MediaPipe Hands (at most one hand).
    Landmark 9 of the detected hand is mapped from an inner rectangle of the
    camera image onto the screen, smoothed, mirrored horizontally and passed
    to ``autopy.mouse.move``. The annotated frame, with the per-frame FPS
    drawn on it, is shown in a window titled "Video".

    The loop ends when 'q' is pressed in that window, when the module-level
    ``stop_threads`` flag is truthy, or when the camera stops delivering
    frames. The capture device and the window are released even if the loop
    raises.
    """
    w_cam, h_cam = 640, 480  # requested webcam resolution
    w_screen, h_screen = autopy.screen.size()
    frame = 100  # margin (px) between the image border and the active rectangle
    smoothening = 7  # larger value -> pointer follows the hand more slowly
    prev_loc_x = prev_loc_y = 0

    cap = cv2.VideoCapture(0)  # 0 selects the default camera
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, w_cam)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, h_cam)
    cap.set(cv2.CAP_PROP_BRIGHTNESS, 100)

    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1)
    mp_draw = mp.solutions.drawing_utils

    # Accumulators for the average frame rate reported on exit.
    count = 0
    fps_sum = 0

    try:
        while True:
            previous_time = time.time()
            success, img = cap.read()
            if not success or img is None:
                # Without this check a failed read would crash in cvtColor.
                print("Could not read a frame from the webcam, stopping gesture recognition.")
                break
            count += 1

            # MediaPipe expects RGB input while OpenCV delivers BGR.
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            results = hands.process(img_rgb)

            if results.multi_hand_landmarks:
                # Only one hand is tracked, so the first entry is the hand.
                hand_lms = results.multi_hand_landmarks[0]
                # Landmark coordinates are ratios of the image size; index 9
                # is MIDDLE_FINGER_MCP in MediaPipe's 21-point hand model.
                lm = hand_lms.landmark[9]

                h, w = img.shape[:2]
                cix, ciy = int(lm.x * w), int(lm.y * h)  # landmark in image pixels

                cv2.circle(img, (cix, ciy), 15, (255, 0, 255), cv2.FILLED)
                # Rectangle inside which hand movement spans the whole screen.
                cv2.rectangle(img, (frame, frame), (w_cam - frame, h_cam - frame), (255, 0, 255), 2)

                # Map the active rectangle onto the screen; np.interp clamps
                # positions that fall outside the rectangle.
                csx = np.interp(cix, (frame, w_cam - frame), (0, w_screen))
                csy = np.interp(ciy, (frame, h_cam - frame), (0, h_screen))

                # Move only a fraction of the remaining distance to damp jitter.
                current_loc_x = prev_loc_x + (csx - prev_loc_x) / smoothening
                current_loc_y = prev_loc_y + (csy - prev_loc_y) / smoothening

                # The x axis is mirrored. autopy rejects points on or beyond
                # the screen edge, so clamp to the last valid coordinate.
                pointer_x = min(max(w_screen - current_loc_x, 0), w_screen - 1)
                pointer_y = min(max(current_loc_y, 0), h_screen - 1)
                autopy.mouse.move(pointer_x, pointer_y)

                mp_draw.draw_landmarks(img, hand_lms, mp_hands.HAND_CONNECTIONS)

                prev_loc_x, prev_loc_y = current_loc_x, current_loc_y

            elapsed = time.time() - previous_time
            # A coarse clock can report zero elapsed time; avoid dividing by it.
            fps = 1 / elapsed if elapsed > 0 else 0
            fps_sum += fps

            cv2.putText(img, str(int(fps)), (10, 70), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 3)
            cv2.imshow("Video", img)

            # waitKey also gives the window time to refresh. The flag is read
            # through globals() because another thread may not have created
            # it yet.
            if (cv2.waitKey(1) & 0xFF == ord('q')) or globals().get("stop_threads", False):
                print("Average Framerate is:", fps_sum / count)
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

## Speech Recognition System

In [4]:
# Path of the service-account key file, published through the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "SERVICE_KEY.JSON"

# Microphone capture settings.
RATE = 16000  # samples per second
CHUNK = RATE // 10  # samples per buffer, i.e. 100 ms of audio

In [5]:
class MicrophoneStream(object):
    """Context manager that records from the microphone.

    Entering the context opens a PyAudio input stream whose callback pushes
    raw audio into an internal queue; ``generator()`` drains that queue and
    yields the audio as ``bytes``.
    """

    def __init__(self, rate, chunk):
        """Store the sample rate and buffer size; no device is opened yet."""
        # Queue filled by the PyAudio callback and drained by generator().
        self._buff = queue.Queue()
        self._rate, self._chunk = rate, chunk
        # Remains True until __enter__ has opened the stream.
        self.closed = True

    def __enter__(self):
        """Open the audio stream and return this object."""
        self._audio_interface = pyaudio.PyAudio()
        stream_options = dict(
            format=pyaudio.paInt16,
            # Single-channel (mono) capture.
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            # With a callback PyAudio fills the queue on its own, so the
            # device buffer does not overflow while the caller is busy.
            stream_callback=self._fill_buffer,
        )
        self._audio_stream = self._audio_interface.open(**stream_options)
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        """Stop and close the stream, then unblock any waiting consumer."""
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # The None sentinel makes generator() return, so a consumer blocked
        # on the queue does not hang.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Stream callback: enqueue the captured audio and keep recording."""
        self._buff.put(in_data)
        return (None, pyaudio.paContinue)

    def generator(self):
        """Yield all currently buffered audio joined into one bytes object.

        Blocks until at least one chunk is available. Returns, without
        yielding what was collected in that round, as soon as the ``None``
        sentinel is read.
        """
        while not self.closed:
            # Blocking read guarantees every yielded item has some audio.
            first = self._buff.get()
            if first is None:
                return
            pending = [first]

            # Collect whatever else is already queued, without blocking.
            drained = False
            while not drained:
                try:
                    extra = self._buff.get(block=False)
                except queue.Empty:
                    drained = True
                else:
                    if extra is None:
                        return
                    pending.append(extra)

            yield b"".join(pending)

In [6]:
def _double_click():
    """Issue two left clicks back to back."""
    mouse_left_click()
    mouse_left_click()


# Ordered voice-command table; the first pattern matching the start of the
# transcript wins. Each entry is (pattern, action, label); a label of None
# means the title-cased transcript is printed. Actions are wrapped in lambdas
# so the helper functions are looked up only when a command fires.
_VOICE_COMMANDS = (
    (re.compile(r"^(double[- ]?click|open)", re.I), _double_click, None),
    (re.compile(r"^(click|tap|left[- ]?click)", re.I), lambda: mouse_left_click(), None),
    (re.compile(r"^right[- ]?click", re.I), lambda: mouse_right_click(), None),
    (re.compile(r"^copy", re.I), lambda: copy_item(), None),
    (re.compile(r"^(cut|move)", re.I), lambda: cut_item(), "Cut"),
    (re.compile(r"^paste", re.I), lambda: paste_item(), None),
    (re.compile(r"^scroll[- ]?up", re.I), lambda: scroll(4), None),
    (re.compile(r"^scroll[- ]?down", re.I), lambda: scroll(-4), None),
    (re.compile(r"^zoom[- ]?in", re.I), lambda: scroll(8), None),
    (re.compile(r"^zoom[- ]?out", re.I), lambda: scroll(-8), None),
)

# Phrases that end the session.
_EXIT_COMMAND = re.compile(r"^(exit|quit|close|done)", re.I)


def listen_print_loop(responses):
    """Turn finalized speech transcripts into mouse and keyboard commands.

    Iterates over ``responses`` (an iterable that may block until the server
    provides the next response). Only the first result of each response and
    its top alternative are considered, and interim (non-final) results are
    ignored. The stripped transcript is matched case-insensitively, by
    prefix, against the command table; the first match runs its action and
    prints the recognized command. After each final transcript the time spent
    handling that response is printed as "Latency".

    An exit phrase (exit, quit, close, done) sets the module-level
    ``stop_threads`` flag to True and ends the loop.

    Args:
        responses: Iterable of streaming-recognition responses exposing
            ``results[i].alternatives[j].transcript`` and
            ``results[i].is_final``.
    """
    global stop_threads

    for response in responses:
        started = time.time()
        if not response.results:
            continue

        # Results are consecutive; once the first one is final the server
        # moves on to the next utterance, so only results[0] matters.
        result = response.results[0]
        if not result.alternatives or not result.is_final:
            continue

        transcript = result.alternatives[0].transcript.strip()

        for pattern, action, label in _VOICE_COMMANDS:
            if pattern.search(transcript):
                action()
                print(transcript.title() if label is None else label)
                break
        else:
            # No regular command matched; check for an exit phrase.
            if _EXIT_COMMAND.search(transcript):
                stop_threads = True
                print("Exiting..")
                print("Latency =", time.time() - started)
                return

        print("Latency =", time.time() - started)

In [7]:
def speech_recognition():
    """Stream microphone audio to Google Cloud Speech and act on the results.

    Resets the module-level ``stop_threads`` flag, builds a streaming
    recognition request biased towards the supported command phrases, and
    hands the server responses to ``listen_print_loop`` until it returns.
    """
    global stop_threads
    stop_threads = False

    # Phrases passed to the recognizer as a speech context.
    command_phrases = [
        'double click', 'open', 'double-click', 'click', 'tap', 'left click',
        'left-click', 'right click', 'right-click',
        'copy', 'cut', 'move', 'paste', 'scroll up',
        'scrollup', 'scroll-up',
        'scroll down', 'scrolldown',
        'scroll-down', 'zoom in', 'zoomin',
        'zoom-in', 'zoom out', 'zoomout',
        'zoom-out', 'close', 'done',
    ]
    speech_context = speech.SpeechContext(phrases=command_phrases)

    # BCP-47 language tag; see http://g.co/cloud/speech/docs/languages
    language_code = "en-US"

    client = speech.SpeechClient()
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        speech_contexts=[speech_context],
    )
    streaming_config = speech.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True,
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        # Lazily wrap every audio chunk in a streaming request.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=audio_chunk)
            for audio_chunk in stream.generator()
        )
        responses = client.streaming_recognize(streaming_config, requests)

        print("up and running")
        # Consume the transcription responses until an exit phrase is heard
        # or the response stream ends.
        listen_print_loop(responses)

## Multithreading

In [8]:
def multithreading():
    """Run gesture and speech recognition concurrently and wait for both.

    The module-level ``stop_threads`` flag is reset to False before either
    thread starts. The gesture loop reads it, so this stops it from seeing
    the flag undefined or still True from an earlier run.
    """
    global stop_threads
    stop_threads = False

    workers = [
        threading.Thread(target=gesture_recognition),
        threading.Thread(target=speech_recognition),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

In [None]:
# Start gesture and speech recognition; this call returns only after both
# worker threads have finished.
multithreading()