In [None]:
#Enabling Tensorflow using TPU engine

%tensorflow_version 1.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)


Tensorflow version 1.15.2
Running on TPU  ['10.98.133.42:8470']
INFO:tensorflow:Initializing the TPU system: 10.98.133.42:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.98.133.42:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 17187712677345604422)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 4319665852127281055)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 3214836817000303880)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 15235130718227971614)
INFO:tensorflow:*** Avail

In [None]:
#importing necessary modules

# `tf` is imported by the TPU setup cell earlier in this notebook, so that cell must run first.
print(tf.__version__)
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# TF Hub URL of the Universal Sentence Encoder (version 1, compressed format).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/1?tf-hub-format=compressed"

# Import the Universal Sentence Encoder's TF Hub module
# `embed` is the module-level handle that semantic_similarity() applies to its input sentences.
embed = hub.Module(module_url)

1.15.2


In [None]:
#Installing necessary packages

# ffmpeg-python is imported as `ffmpeg` by the audio-capture cell, which uses it to convert the recording to WAV.
!pip install ffmpeg-python
# SpeechRecognition is imported as `speech_recognition` by the speech-to-text cell.
!pip3 install --upgrade speechrecognition

Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0
Collecting speechrecognition
[?25l  Downloading https://files.pythonhosted.org/packages/26/e1/7f5678cd94ec1234269d23756dbdaa4c8cfaed973412f88ae8adf7893a50/SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8MB)
[K     |████████████████████████████████| 32.8MB 133kB/s 
[?25hInstalling collected packages: speechrecognition
Successfully installed speechrecognition-3.8.1


In [None]:
#Speech Recognition Code to enable microphone

from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript snippet that get_audio() renders in the cell output.
# What the script does, as written below:
#   * creates a button and appends it to the page body;
#   * immediately labels it "Recording... press to stop" and requests the
#     microphone with navigator.mediaDevices.getUserMedia({audio: true});
#   * once a stream is granted, starts a MediaRecorder on it (the `options`
#     object is built but not passed, so the browser's default format is used);
#   * when recorded data becomes available, appends an <audio> preview and
#     reads the blob as a data URL into `base64data`;
#   * clicking the button stops the recorder and, after a fixed 2000 ms wait,
#     resolves the global promise `data` with `base64data.toString()`.
# Caveat: `base64data` starts as 0, so `data` resolves to "0" if the FileReader
# has not finished within those 2000 ms (or nothing was ever recorded).
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio():
  """Record audio in the browser and return it as samples plus sample rate.

  Renders ``AUDIO_HTML`` in the cell output, evaluates the page's ``data``
  promise through ``eval_js`` to obtain the recording as a base64 data URL,
  converts the decoded bytes to WAV by piping them through ffmpeg, patches
  the RIFF size field of the result and parses it with
  ``scipy.io.wavfile.read``.

  Returns:
    tuple: ``(audio, sr)`` -- the sample data and the sample rate, in the
    (swapped) order of what ``scipy.io.wavfile.read`` returns.

  Raises:
    RuntimeError: if the browser delivered no recording, or ffmpeg failed to
      produce any WAV output.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A data URL looks like "<header>,<base64 payload>". The script resolves to
  # "0" when no recording was captured in time, which has no comma at all.
  _header, comma, payload = str(data).partition(',')
  if not comma or not payload:
    raise RuntimeError("No audio was captured from the browser; please record again.")
  binary = b64decode(payload)

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  if process.returncode != 0 or not output:
    # Surface ffmpeg's own diagnostics instead of failing later in wav_read.
    details = err.decode('utf-8', errors='replace') if err else ''
    raise RuntimeError("ffmpeg could not convert the recording to WAV: " + details)

  # Replace bytes 4:8 of the ffmpeg output with the actual RIFF chunk size
  # (total length minus the 8-byte "RIFF"+size prefix), as 4 little-endian
  # bytes. The mask keeps the value within 32 bits, like the original
  # byte-by-byte loop did.
  riff_chunk_size = len(output) - 8
  size_field = (riff_chunk_size & 0xFFFFFFFF).to_bytes(4, 'little')
  riff = output[:4] + size_field + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [None]:
#Function to find semantic similarity of two sentences using Google's Universal Sentence Encoder

def semantic_similarity(cap1,cap2):
    """Cosine similarity between the sentence embeddings of two strings.

    Both strings are encoded with the module-level TF Hub module ``embed`` in
    a fresh ``tf.Session`` and compared with sklearn's ``cosine_similarity``.

    The placeholder, the encoding tensor and the initializer ops are built
    once and cached on the function object. Building them on every call would
    keep adding ops to the default graph for each pair compared. The cache is
    rebuilt if ``embed`` or the default graph has been replaced since.

    Args:
        cap1: first sentence.
        cap2: second sentence.

    Returns:
        The 1x1 array returned by ``cosine_similarity`` for the two embeddings.
    """
    graph = tf.get_default_graph()
    cached = getattr(semantic_similarity, "_cached_ops", None)
    if cached is None or cached["module"] is not embed or cached["graph"] is not graph:
        input_placeholder = tf.placeholder(tf.string, shape=(None))
        cached = {
            "module": embed,
            "graph": graph,
            "placeholder": input_placeholder,
            "encodings": embed(input_placeholder),
            "init_variables": tf.global_variables_initializer(),
            "init_tables": tf.tables_initializer(),
        }
        semantic_similarity._cached_ops = cached

    messages=[cap1,cap2]
    with tf.Session() as session:
        # Same initialisation order as before: variables first, then tables.
        session.run(cached["init_variables"])
        session.run(cached["init_tables"])
        message_embeddings_ = session.run(cached["encodings"], feed_dict={cached["placeholder"]: messages})
    return cosine_similarity(message_embeddings_[0].reshape(1,-1),message_embeddings_[1].reshape(1,-1))

In [None]:
#Speech to Text conversion to obtain the voice query as text

# `text` is the module-level query read by the retrieval cell; it stays 0 if
# recognition fails below.
text=0
import speech_recognition
r = speech_recognition.Recognizer()
print("Speak Anything :")
audio, sr = get_audio()
import numpy as np
from scipy.io.wavfile import write


# Write the WAV with the sample rate reported for the recording. Hard-coding
# 44100 mislabels audio captured at another rate (the recorder script notes
# Chrome records at 48k), which changes its speed and pitch for the recognizer.
write('test.wav', sr, audio)
with speech_recognition.AudioFile('test.wav') as source:
        audio = r.record(source)  # read the entire audio file
        try:
            text = r.recognize_google(audio)
            print("You said : {}".format(text))
        except speech_recognition.UnknownValueError:
            # The service could not make out any speech in the recording.
            print("Sorry could not recognize what you said")
        except speech_recognition.RequestError as err:
            # The request to the recognition service itself failed.
            print("Sorry could not recognize what you said")
            print("Speech recognition request failed: {}".format(err))

Speak Anything :


You said : woman is skiing


In [None]:
#Final set of tracks for the input video: each entry is [caption, [indices of the frames assigned to that caption]]

final=[['a person holding a pair of scissors on a table',
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]],
 ['a woman holding a wii remote in her hand', [19, 20, 21, 22]],
 ['a person holding a pair of scissors in front of their face',
  [23, 24, 25, 26, 27, 28, 30, 32, 33, 34, 36, 38]],
 ['a view of a mountain range from a plane',
  [29, 30, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44]],
 ['a busy city street filled with lots of traffic', [31, 32, 33, 34, 36]],
 ['a bus is parked on the side of the street', [45, 46, 48, 49]],
 ['a sign that says UNK UNK on the side of it', [50, 51, 52]],
 ['a view of a city street with a traffic light on it',
  [53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   76,
   77,
   78,
   79,
   80,
   81,
   82,
   83,
   84,
   85,
   86,
   88,
   89,
   90,
   91,
   92,
   93,
   94,
   95,
   96,
   97,
   98,
   99,
   100,
   101,
   102,
   103,
   104,
   105,
   106]],
 ['a little girl holding a cat in her hand',
  [107,
   108,
   109,
   110,
   111,
   112,
   113,
   114,
   115,
   116,
   117,
   118,
   119,
   120,
   121,
   122,
   123,
   124,
   125,
   126,
   127,
   128,
   129,
   130]],
 ['a stop sign is on a pole on a street',
  [131,
   132,
   133,
   134,
   135,
   136,
   137,
   138,
   139,
   140,
   141,
   142,
   143,
   144,
   145,
   146,
   147,
   149]],
 ['a picture of a plane flying in the sky',
  [148, 149, 150, 151, 152, 153, 154]],
 ['a large building with a large clock tower',
  [155,
   156,
   157,
   158,
   159,
   160,
   161,
   162,
   163,
   164,
   165,
   166,
   167,
   168,
   169,
   171,
   172,
   173,
   174,
   175,
   176,
   177,
   178,
   179,
   180,
   181,
   182,
   183,
   184,
   185,
   186,
   187,
   188,
   189,
   190,
   191,
   192,
   193,
   194,
   195,
   196,
   197,
   198,
   199,
   200,
   201,
   202,
   203]],
 ['a person holding a cell phone in their hand', [205, 207, 209, 211]],
 ['a woman is holding a toothbrush in her mouth',
  [208, 209, 210, 212, 213, 214, 215, 216, 217, 218, 219, 220, 222, 223, 224]],
 ['a man and woman are holding a cake', [221, 222, 223, 224]],
 ['a street sign on a pole on a city street',
  [225,
   226,
   227,
   228,
   229,
   230,
   231,
   232,
   233,
   234,
   235,
   236,
   237,
   238,
   239,
   240,
   241,
   242,
   243,
   244,
   245,
   246]],
 ['a man riding skis down a snow covered slope',
  [247,
   248,
   249,
   250,
   251,
   252,
   253,
   254,
   255,
   256,
   257,
   258,
   259,
   260,
   261,
   262,
   263,
   264,
   265,
   266,
   267,
   268,
   269,
   270,
   271,
   272,
   273,
   274,
   275]],
 ['a person standing on a snow covered ski slope',
  [280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291]],
 ['a close up of a person in a mirror', [292, 294, 296, 298, 299, 300, 301]],
 ['a woman is talking on a cell phone', [295, 296, 298, 299, 300, 301]],
 ['a man is holding a hot dog in his hand',
  [297, 298, 299, 300, 301, 302, 303, 304, 305, 306]],
 ['a person standing in a room with luggage', [308, 309, 310]],
 ['a man holding a cake with a candle on it',
  [311, 312, 313, 314, 315, 316, 317, 319, 320, 321, 323, 324, 325, 326]],
 ['a man is eating a sandwich in a restaurant',
  [318,
   319,
   320,
   321,
   322,
   323,
   324,
   325,
   326,
   327,
   328,
   329,
   330,
   331,
   333,
   334]],
 ['a woman sitting at a table with a glass of wine',
  [332,
   333,
   334,
   335,
   336,
   337,
   338,
   339,
   340,
   341,
   341,
   341,
   342,
   343,
   344,
   345,
   346,
   347,
   348,
   349,
   350,
   351,
   352]],
 ['a man and a woman eating a hot dog', [353, 354, 355]],
 ['a woman is taking a picture of herself in a mirror', [360, 362, 364]],
 ['a woman is holding a baby in her hand',
  [356, 357, 359, 361, 363, 364, 365, 366, 368, 369]],
 ['a traffic light with a red light on it',
  [371, 372, 374, 375, 376, 377, 378, 379]],
 ['a person is standing on a street corner',
  [373, 374, 375, 376, 377, 378, 379]],
 ['a bottle of wine sitting on top of a counter',
  [380,
   381,
   382,
   383,
   384,
   385,
   386,
   387,
   388,
   389,
   390,
   391,
   392,
   393,
   394,
   395,
   396,
   397,
   399,
   401,
   402,
   403,
   404,
   405,
   406,
   407,
   408]],
 ['a man is eating a sandwich on a plate',
  [398, 399, 400, 401, 403, 405, 406, 407, 408, 410, 411, 412, 414]],
 ['a group of people standing around a table with a laptop', [424, 425, 426]],
 ['a man is standing in front of a car',
  [413,
   414,
   415,
   416,
   417,
   418,
   419,
   420,
   421,
   422,
   423,
   425,
   426,
   427,
   428,
   429,
   430,
   431,
   432,
   433,
   434,
   435,
   436,
   437,
   438]],
 ['a woman is holding a glass of wine', [454, 455, 456, 457]],
 ['a close up of a person holding a pair of scissors',
  [440,
   441,
   443,
   444,
   445,
   446,
   447,
   448,
   449,
   450,
   451,
   452,
   453,
   455,
   457,
   459,
   460]],
 ['a woman is holding a glass of wine', [461, 462, 463, 464, 465, 466]],
 ['a man is holding a cell phone in his hand',
  [469, 471, 472, 473, 474, 475, 477]],
 ['a person is holding a black and white photo', [480, 481, 482]],
 ['a view of a mountain range from the ocean', [483, 484, 485]],
 ['a man in a suit and tie standing in front of a mirror', [488, 490, 491]],
 ['a person is standing in front of a window', [486, 487, 489, 490, 492]],
 ['a person is holding a teddy bear in the dark', [493, 494, 495]],
 ['a close up of a person holding a pair of scissors',
  [496, 497, 498, 499, 501]],
 ['a traffic light with a red light on it', [502, 503, 504]],
 ['a man holding a cell phone in his hand', [506, 507, 508]],
 ['a sign that says UNK UNK UNK on it', [512, 513, 514, 515, 516, 517]]]

In [None]:
#Generating tracks and their duration

video_length=52 #seconds
num_frames=520
# Seconds covered by one frame index.
frame_len=video_length/num_frames
print(frame_len)

# One [caption, [start_seconds, end_seconds]] entry per track, taken from the
# track's first and last frame index and rounded to two decimals.
start_end=[
    [
        track[0],
        [
            round(int(track[1][0])*frame_len,2),
            round(int(track[1][-1])*frame_len,2),
        ],
    ]
    for track in final
]
start_end

0.1


[['a person holding a pair of scissors on a table', [0.0, 1.8]],
 ['a woman holding a wii remote in her hand', [1.9, 2.2]],
 ['a person holding a pair of scissors in front of their face', [2.3, 3.8]],
 ['a view of a mountain range from a plane', [2.9, 4.4]],
 ['a busy city street filled with lots of traffic', [3.1, 3.6]],
 ['a bus is parked on the side of the street', [4.5, 4.9]],
 ['a sign that says UNK UNK on the side of it', [5.0, 5.2]],
 ['a view of a city street with a traffic light on it', [5.3, 10.6]],
 ['a little girl holding a cat in her hand', [10.7, 13.0]],
 ['a stop sign is on a pole on a street', [13.1, 14.9]],
 ['a picture of a plane flying in the sky', [14.8, 15.4]],
 ['a large building with a large clock tower', [15.5, 20.3]],
 ['a person holding a cell phone in their hand', [20.5, 21.1]],
 ['a woman is holding a toothbrush in her mouth', [20.8, 22.4]],
 ['a man and woman are holding a cake', [22.1, 22.4]],
 ['a street sign on a pole on a city street', [22.5, 24.6]],
 [

In [None]:
from IPython.display import HTML
from base64 import b64encode
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

VIDEO_PATH = "RUSSIA_ 1 MINUTE TRAVEL VLOG.mp4"  # source video the clips are cut from
SIMILARITY_THRESHOLD = 0.5  # minimum caption/query similarity counted as a match
SHORT_CLIP_SEC = 1.5        # tracks shorter than this are padded
PAD_SEC = 0.5               # padding added on each side of a short track


def _padded_span(start, end, video_end):
    """Return (clip_start, clip_end) in seconds for a track spanning start..end.

    Tracks of SHORT_CLIP_SEC or longer are returned unchanged. Shorter tracks
    are widened by 2*PAD_SEC in total: PAD_SEC on each side when both sides
    have room, otherwise all of it on the side that has room. The result is
    clamped to [0, video_end]. The room checks are inclusive, so a track whose
    padded edge lands exactly on 0 or video_end is still padded.
    """
    if end - start >= SHORT_CLIP_SEC:
        return start, end
    room_before = start - PAD_SEC >= 0
    room_after = end + PAD_SEC <= video_end
    if room_before and room_after:
        return start - PAD_SEC, end + PAD_SEC
    if room_after:
        # Too close to the beginning: put the whole padding after the track.
        return start, min(end + 2 * PAD_SEC, video_end)
    if room_before:
        # Too close to the end: put the whole padding before the track.
        return max(start - 2 * PAD_SEC, 0), end
    return start, end


# `text` is left at its initial non-string value when speech recognition failed.
if not isinstance(text, str) or not text.strip():
  raise ValueError("No voice query available: run the speech-to-text cell successfully before retrieving clips.")

# `c` counts the clips written so far and numbers the output files video0.mp4, video1.mp4, ...
c=0
for caption, (start, end) in start_end:
  print(caption)
  #Semantic similarity checked between the text and each entry in the tracklist and video segment retrieved
  if(semantic_similarity(caption,text)>=SIMILARITY_THRESHOLD):
    #Padding performed if required
    print("match")
    # `video_length` comes from the cell that builds `start_end`.
    clip_start, clip_end = _padded_span(start, end, video_length)
    ffmpeg_extract_subclip(VIDEO_PATH, clip_start, clip_end, targetname="video"+str(c)+".mp4")
    c+=1

a person holding a pair of scissors on a table
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a woman holding a wii remote in her hand
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a person holding a pair of scissors in front of their face
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a view of a mountain range from a plane
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a busy city street filled with lots of traffic
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a bus is parked on the side of the street
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a sign that says UNK UNK on the side of it
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a view of a city street with a traffic light on it
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a little girl holding a cat in her hand
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a stop sign is on a pole on a street
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a picture of a plane flying in the sky
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a large building with a large clock tower
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a person holding a cell phone in their hand
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


a woman is holding a toothbrush in her mouth
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
#Playing of results
# The number of clips is `c`, as counted by the retrieval cell. It is no longer
# overwritten with a hard-coded 3 here, so playback cycles through exactly the
# clips that were written.
import ipywidgets as widgets
from IPython.display import display,clear_output
from IPython.display import YouTubeVideo,HTML
import base64
import io
button = widgets.Button(description="Play Next")
output = widgets.Output()
# Index of the next clip to show; advanced on every click and wrapped at `c`.
x=0
def on_button_clicked(b):
  """Show clip number `x` inside the output widget and advance `x`.

  Reads 'video<x>.mp4', embeds it as a base64 <video> element, prints its
  1-based result number, then moves `x` on to the next clip, wrapping around
  after the last one.

  Args:
    b: the clicked button, as passed by ipywidgets (unused).
  """
  global x
  # Display the message within the output widget.
  with output:
    clear_output()
    if c <= 0:
      # Nothing was retrieved; this also avoids dividing by zero in `x %= c`.
      print("No matching clips to play")
      return
    # Read-only binary mode, and close the handle once the bytes are read.
    with io.open('video'+str(x)+'.mp4', 'rb') as clip_file:
      video = clip_file.read()
    encoded = base64.b64encode(video)
    display(HTML(data='''<video width="320" height="240" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''.format(encoded.decode('ascii'))))
    print("Result ",x+1)
    x+=1
    x%=c

button.on_click(on_button_clicked)
display(button, output)

Button(description='Play Next', style=ButtonStyle())

Output()