In [1]:
import numpy as np
import time
import cv2
import os
import imutils
#import argparse
from gtts import gTTS
import matplotlib.pyplot as plt
from translate import Translator
from playsound import playsound

In [2]:
#load in the weights

net = cv2.dnn.readNetFromDarknet("yolov4-tiny.cfg", "yolov4-tiny.weights")
#make open_cv use GPU
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)

net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

#create a container to store all the label
classes = []

with open("coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]
    
# to determine the output layer names that we need
layer_names = net.getLayerNames()
output_layers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
# pick a  sample from uniform distribution  from 0 -255 using size len(classes) times 3  so each color
#represent each possible class labels
colors = np.random.uniform(0, 255, size=(len(classes), 3))
 
cap = cv2.VideoCapture(0)  #return video from first webcam on computer
font = cv2.FONT_HERSHEY_PLAIN  #select a font to display text later
starting_time = time.time()
frame_id = 0  #to assign id to each frame processed
while True:
    grabbed,frame = cap.read()    #take in one frame at a time
    frame=cv2.flip(frame,1)  #flip the frame around y -axis
    frame_id += 1  #add 1 to frame_id
    (height, width) = frame.shape[:2]  #get height and weidth of the frame
# Detecting objects
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)  #create blob from image
    net.setInput(blob)  #give the blob as input to the model
    outs = net.forward(output_layers) #perform a forward pass giving us bounding boxes and probabilties
    # Showing informations on the screen  since it is a video 
    class_ids = []
    confidences = []
    boxes = []
    centers =[]
    for out in outs:  #for each of the layer output
        for detection in out:  #for all detection
            scores = detection[5:]  # extract class id and confidence of current object
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.2:  #if higher than 0.2% then scale the bounding box relative to input
                # Object detected
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Rectangle coordinates
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])   #update bounding boxes, confidence and class_ids
                confidences.append(float(confidence))
                class_ids.append(class_id)
                centers.append((center_x,center_y))  
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.4, 0.3)  #apply non maxima supression  passed confidence and the threshold
    for i in range(len(boxes)):   
        if i in indexes:
            x, y, w, h = boxes[i]
            label = str(classes[class_ids[i]])
            confidence = confidences[i]
            color = colors[class_ids[i]]
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.rectangle(frame, (x, y), (x + w, y + 30), color, -1)
            cv2.putText(frame, label + " " + str(round(confidence, 2)), (x, y + 30), font, 3, (255,255,255), 3)
            elapsed_time = time.time() - starting_time
            fps = frame_id / elapsed_time
            cv2.putText(frame, "FPS: " + str(round(fps, 2)), (10, 50), font, 3, (0, 0, 0), 3)
            cv2.imshow("Image", cv2.resize(frame,(900,900)))
            texts = []

            # ensure at least one detection exists
            if len(indexes) > 0:# loop over the indexes we are keeping
                for i in indexes.flatten():# find positionscenter
                    centerX, centerY = centers[i][0], centers[i][1]  #extract bounding boxes coordinate
                    if centerX <= w/3:
                        W_pos = "left "
                    elif centerX <= (w/3 * 2):
                        W_pos = "center "
                    else:
                        W_pos = "right "
                    if centerY <= h/3:
                        H_pos = "top "
                    elif centerY <= (h/3 * 2):
                        H_pos = "mid "
                    else:
                        H_pos = "bottom "
                    texts.append(H_pos + W_pos + classes[class_ids[i]])
                
        
            if texts:
                description = ', '.join(texts)
                translator =Translator(to_lang='fr')
                translation =translator.translate(description)

            
                tts=gTTS(translation, lang='fr')
                tts.save('tts.mp3')
                tts = playsound('tts.mp3')
                os.remove('tts.mp3')
            
            key = cv2.waitKey(1)
            if key == 27:
                print("[button pressed] ///// [esc].")
                print("[feedback] ///// Videocapturing succesfully stopped")
                cap.release()
                cv2.destroyAllWindows()
                
            break


[button pressed] ///// [esc].
[feedback] ///// Videocapturing succesfully stopped


AttributeError: 'NoneType' object has no attribute 'shape'