In [None]:
#Enabling Tensorflow using TPU engine

%tensorflow_version 1.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)


TensorFlow 1.x selected.
Tensorflow version 1.15.2
Running on TPU  ['10.77.115.242:8470']
INFO:tensorflow:Initializing the TPU system: 10.77.115.242:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.77.115.242:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 5369540710660063690)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 17206099995930652858)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 13941344766462294965)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 361922778064283026

In [None]:
# Import the libraries used by the rest of the notebook and load the sentence encoder.
# NOTE(review): `tf` is not imported in this cell; it relies on the TPU setup
# cell (which does `import tensorflow as tf`) having been run first.

print(tf.__version__)
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Compressed TF Hub archive of Universal Sentence Encoder, version 1.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/1?tf-hub-format=compressed"

# Import the Universal Sentence Encoder's TF Hub module
# `embed` is the module-level handle that semantic_similarity() applies to its inputs.
embed = hub.Module(module_url)

1.15.2


In [None]:
#Installing necessary packages

!pip install ffmpeg-python
!pip3 install --upgrade speechrecognition

Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0
Collecting speechrecognition
[?25l  Downloading https://files.pythonhosted.org/packages/26/e1/7f5678cd94ec1234269d23756dbdaa4c8cfaed973412f88ae8adf7893a50/SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8MB)
[K     |████████████████████████████████| 32.8MB 110kB/s 
[?25hInstalling collected packages: speechrecognition
Successfully installed speechrecognition-3.8.1


In [None]:
#Speech Recognition Code to enable microphone

from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript snippet rendered into the cell output by get_audio().
# It starts a MediaRecorder on the microphone, stops it when the button is
# pressed, and exposes a global promise named `data` that get_audio() reads
# via eval_js("data"). The promise resolves with the recording as a base64
# data URL as soon as the FileReader has finished encoding it (instead of
# after a fixed 2 second wait, which could return the placeholder 0 when
# encoding took longer).
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;
var resolveData;

// Read from Python with eval_js("data").
var data = new Promise(resolve => { resolveData = resolve; });

var handleSuccess = function(stream) {
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.onloadend = function() {
      base64data = reader.result;
      // Hand the finished recording to Python only once it is fully encoded.
      resolveData(base64data.toString());
    };
    reader.readAsDataURL(e.data);
  };
  recorder.start();
};

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

recordButton.onclick = () => {
  if (recorder && recorder.state == "recording") {
    // Prevent a second click while the recording is being encoded.
    recordButton.disabled = true;
    toggleRecording();
  } else if (!recorder) {
    // No recorder was ever created (microphone not available); resolve with
    // the placeholder so the Python side is not left waiting forever.
    resolveData(base64data.toString());
  }
};
</script>
"""

def get_audio():
  """Record audio in the browser and return it as decoded WAV samples.

  Renders the AUDIO_HTML recorder, waits for its JavaScript `data` promise,
  base64-decodes the payload, converts it to WAV by piping it through ffmpeg,
  patches the RIFF chunk size in the header, and parses the result with
  scipy's WAV reader.

  Returns:
    (audio, sr): the sample data and the sample rate, in that order, as
    produced by scipy.io.wavfile.read.

  Raises:
    RuntimeError: if the browser returned no recording, or ffmpeg produced
      no usable WAV output.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # The recorder returns a data URL: "<header>,<base64 payload>". When nothing
  # was recorded the JavaScript side returns its placeholder ("0") instead,
  # which has no comma; report that explicitly rather than via an IndexError.
  _, separator, payload = str(data).partition(',')
  if not separator or not payload:
    raise RuntimeError('No audio was recorded; allow microphone access and try again.')
  binary = b64decode(payload)

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  # Fewer than 8 bytes cannot even hold the RIFF tag and its size field.
  if len(output) < 8:
    details = (err or b'').decode('utf-8', errors='replace')
    raise RuntimeError('ffmpeg did not produce WAV output: ' + details)

  # Replace bytes 4:8 of the ffmpeg output with the actual RIFF chunk size
  # (total length minus the 8-byte RIFF tag + size field), little-endian.
  riff_chunk_size = len(output) - 8
  size_field = (riff_chunk_size & 0xFFFFFFFF).to_bytes(4, 'little')
  riff = output[:4] + size_field + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [None]:
#Function to find semantic similarity of two sentences using Google's Universal Sentence Encoder

def semantic_similarity(cap1,cap2):
    """Return the cosine similarity of two sentences' encoder embeddings.

    Both sentences are fed through the module-level `embed` TF Hub module and
    the two resulting embedding vectors are compared with scikit-learn's
    cosine_similarity.

    The placeholder and the encoding op are built once and cached on the
    function. Building them on every call (as before) re-applied the hub
    module each time and kept adding ops to the default graph, so memory use
    and per-call cost grew with the number of comparisons.

    Args:
        cap1: first sentence.
        cap2: second sentence.

    Returns:
        The result of cosine_similarity on the two embeddings, each reshaped
        to a single row (so a 1x1 array holding the similarity).
    """
    cached = getattr(semantic_similarity, "_encoder", None)
    # Rebuild if nothing is cached yet, if `embed` was re-created, or if the
    # cached ops belong to a graph that is no longer the default one.
    if (cached is None
            or cached[0] is not embed
            or cached[1].graph is not tf.get_default_graph()):
        similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
        similarity_message_encodings = embed(similarity_input_placeholder)
        cached = (embed, similarity_input_placeholder, similarity_message_encodings)
        semantic_similarity._encoder = cached
    _, similarity_input_placeholder, similarity_message_encodings = cached

    messages=[cap1,cap2]
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        message_embeddings_ = session.run(similarity_message_encodings, feed_dict={similarity_input_placeholder: messages})
    return cosine_similarity(message_embeddings_[0].reshape(1,-1),message_embeddings_[1].reshape(1,-1))

In [None]:
#Speech to Text conversion to obtain the voice query as text

# `text` stays 0 when nothing could be recognised; later cells read it.
text=0
import speech_recognition
r = speech_recognition.Recognizer()
print("Speak Anything :")
audio, sr = get_audio()
import numpy as np
from scipy.io.wavfile import write

# Write the samples with the sample rate get_audio() actually reported.
# Hard-coding 44100 labels the file with the wrong rate whenever the browser
# records at a different one, which distorts the audio sent for recognition.
write('test.wav', sr, audio)
with speech_recognition.AudioFile('test.wav') as source:
        audio = r.record(source)  # read the entire audio file

# Catch only the recogniser's own failures so that unrelated errors (and
# KeyboardInterrupt) are no longer silently swallowed.
try:
    text = r.recognize_google(audio)
    print("You said : {}".format(text))
except speech_recognition.UnknownValueError:
    print("Sorry could not recognize what you said")
except speech_recognition.RequestError as err:
    print("Sorry could not recognize what you said (speech service error: {})".format(err))

Speak Anything :


You said : woman is skiing


In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted — confirm against the google.colab documentation.

from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
#Reading Final tracks

# Only the first line of the file is used, so read just that line, and use a
# context manager so the file handle is closed again.
with open(r'drive/My Drive/Mini-Project/final_tracks.txt') as f:
    first_line = f.readline()

if not first_line:
    raise ValueError('final_tracks.txt is empty; expected the track list on its first line')

# The first 15 characters of the line are skipped before parsing
# (presumably a label in front of the list — verify against the file).
# NOTE(security): eval() executes whatever expression the file contains. Only
# use it on a file you created yourself; if the line holds plain Python
# literals, ast.literal_eval is the safe replacement.
final=eval(first_line[15:])

In [None]:
# Show how many entries were loaded into `final`.
len(final)

156

In [None]:
# Build, for every track, [caption, [start_seconds, end_seconds], track_index].

video_length = 52  # seconds
num_frames = 520
# Seconds covered by a single frame.
frame_len = video_length / num_frames
print(frame_len)

start_end = []
for track_id in range(len(final)):
    entry = final[track_id]
    frame_names = entry[2]
    # The frame number is what remains of a frame name after dropping its
    # first five and last five characters.
    first_frame = int(frame_names[0][5:-5])
    last_frame = int(frame_names[-1][5:-5])
    time_span = [round(first_frame * frame_len, 2), round(last_frame * frame_len, 2)]
    start_end.append([entry[0], time_span, track_id])
start_end

0.1


[['the hand of a person', [0.0, 2.0], 0],
 ['a pink and white sign', [0.0, 2.6], 1],
 ['white tile on the floor', [0.0, 0.7], 2],
 ['a brown tile floor', [0.8, 1.2], 3],
 ['the girl is smiling', [1.8, 2.6], 4],
 ['a man holding a cake', [1.7, 2.7], 5],
 ['hand holding a pizza', [2.3, 2.7], 6],
 ['a cloudy sky', [2.9, 3.6], 7],
 ['buildings in the background', [3.0, 3.6], 8],
 ['a building with many windows', [3.0, 3.7], 9],
 ['tall building with many windows', [3.4, 4.1], 10],
 ['a city scene', [3.6, 4.3], 11],
 ['white clouds in blue sky', [4.2, 4.6], 12],
 ['a building in the background', [3.0, 5.3], 13],
 ['a large blue and white sign', [5.0, 5.4], 14],
 ['cars parked on the street', [5.7, 7.4], 15],
 ['a building with many windows', [6.0, 7.4], 16],
 ['a building with many windows', [6.1, 6.5], 17],
 ['a building with a red roof', [7.1, 7.5], 18],
 ['power lines in the sky', [7.5, 8.1], 19],
 ['street light on a pole', [8.2, 8.5], 20],
 ['power lines in the sky', [7.5, 8.6], 21],
 

In [None]:
from IPython.display import HTML
from base64 import b64encode
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

SIMILARITY_THRESHOLD = 0.6   # minimum cosine similarity for a caption to count as a match
SOURCE_VIDEO = "RUSSIA_ 1 MINUTE TRAVEL VLOG.mp4"
CLIP_PAD = 0.5               # seconds of padding added on each side of a short clip
MIN_CLIP_LEN = 1.5           # clips at least this long (seconds) are not padded


def _padded_bounds(start, end, total_len, pad=CLIP_PAD, min_len=MIN_CLIP_LEN):
  """Return (clip_start, clip_end) for a track, padding clips shorter than min_len.

  A short clip gets `pad` seconds on both sides when both fit inside
  [0, total_len]; when only one side has room, that side gets 2 * pad.
  The result is clamped to [0, total_len]. Boundary cases where the padded
  edge lands exactly on 0 or total_len are padded too (the previous strict
  comparisons left those clips unpadded).
  """
  if end - start >= min_len:
    return start, end
  room_before = start - pad >= 0
  room_after = end + pad <= total_len
  if room_before and room_after:
    return start - pad, end + pad
  if room_after:
    return start, min(end + 2 * pad, total_len)
  if room_before:
    return max(start - 2 * pad, 0), end
  return start, end


# `text` is still the numeric placeholder when speech recognition failed;
# stop here with a clear message instead of failing inside TensorFlow.
if not isinstance(text, str):
  raise ValueError("No voice query available; re-run the speech recognition cell first.")

# Number of clips written so far; clips are saved as video0.mp4, video1.mp4, ...
c=0
#text="a woman is skiing"

#Semantic similarity checked between the text and each entry in the tracklist and video segment retrieved
for i in start_end:
  print("xxxxxxxx",i[2],"xxxxxxxx")
  x=semantic_similarity(i[0],text)
  if x >= SIMILARITY_THRESHOLD:
    print("match")
    print("Tracklet caption - ",i[0],", Similarity - ",x)
    print(final[i[2]])

    #Padding performed if required (bounded by the video length set with start_end)
    clip_start, clip_end = _padded_bounds(i[1][0], i[1][1], video_length)
    ffmpeg_extract_subclip(SOURCE_VIDEO, clip_start, clip_end, targetname="video"+str(c)+".mp4")

    c+=1

xxxxxxxx 0 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 1 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 2 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 3 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 4 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 5 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 6 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 7 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 8 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 9 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 10 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 11 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 12 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 13 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 14 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


xxxxxxxx 15 xxxxxxxx
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
#Playing of results
# `c` is the number of clips written by the extraction cell
# (video0.mp4 ... video<c-1>.mp4). It is deliberately not overwritten here:
# a hard-coded count fails when fewer clips matched and hides clips when
# more matched.
import ipywidgets as widgets
from IPython.display import display,clear_output,HTML
from IPython.display import YouTubeVideo
import base64
import io
button = widgets.Button(description="Play Next")
output = widgets.Output()
# Index of the clip the next button press will show.
x=0
def on_button_clicked(b):
  """Show the next extracted clip inside the output widget, cycling through all clips.

  Args:
    b: the button instance passed in by ipywidgets (unused).
  """
  global x
  # Display the message within the output widget.
  with output:
    clear_output()
    if c <= 0:
      # Nothing was extracted; also avoids taking a modulo by zero below.
      print("No matching clips to play")
      return
    # Read-only access is enough, and the context manager closes the file.
    with io.open('video'+str(x)+'.mp4', 'rb') as clip_file:
      video = clip_file.read()
    encoded = base64.b64encode(video)
    display(HTML(data='''<video width="320" height="240" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''.format(encoded.decode('ascii'))))
    print("Result ",x+1)
    x+=1
    x%=c

button.on_click(on_button_clicked)
display(button, output)

Button(description='Play Next', style=ButtonStyle())

Output()