## Video to Audio Conversion

### Install necessary libraries

In [2]:
%pip install moviepy
%pip install -U sentence-transformers
%pip install pvleopard
%pip install pydub

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
#Imports
import moviepy.editor as mp
import pandas as pd
import numpy as np
import pvleopard
import os
import shutil
from pydub import AudioSegment
from pydub.utils import make_chunks
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

### Video to Audio Conversion

In [None]:
# Create the Picovoice Leopard speech-to-text engine.
# The AccessKey is a credential, so it is read from the environment rather than
# being committed to the notebook. Any key that was previously embedded here
# should be treated as leaked and rotated in the Picovoice console.
_access_key = os.environ.get("PICOVOICE_ACCESS_KEY")
if not _access_key:
    raise RuntimeError(
        "Set the PICOVOICE_ACCESS_KEY environment variable to your Picovoice AccessKey."
    )
leopard = pvleopard.create(access_key=_access_key)

In [2]:
def audio_file(link, output="converted.wav"):
    """Extract the audio track of a video into a WAV file.

    Parameters
    ----------
    link : str
        Path (or anything ``moviepy.editor.VideoFileClip`` accepts) of the video.
    output : str, optional
        Destination audio file. Defaults to ``"converted.wav"``, the name the
        notebook has always used.

    Returns
    -------
    str
        The path of the written audio file (``output``).

    Raises
    ------
    ValueError
        If the video has no audio track.
    """
    clip = mp.VideoFileClip(link)
    try:
        # clip.audio is None for silent videos; fail with a clear message
        # instead of an AttributeError on None.
        if clip.audio is None:
            raise ValueError(f"No audio track found in {link!r}")
        clip.audio.write_audiofile(output)
    finally:
        # Always release the underlying reader, even if writing fails.
        clip.close()
    return output

MoviePy - Writing audio in converted.wav


                                                                        

MoviePy - Done.




# Making chunks

In [30]:
def chunks(file, chunk_length_ms=100000):
    """Split a WAV file into fixed-length pieces written under ``./chunks``.

    Any existing ``chunks`` directory is deleted first so that stale pieces
    from a previous run are never mixed with the new ones.

    Parameters
    ----------
    file : str
        Path of the WAV file to split.
    chunk_length_ms : int, optional
        Length of each piece in milliseconds (pydub works in milliseconds).
        Defaults to 100000, i.e. 100 seconds.

    Returns
    -------
    list of str
        Paths of the exported chunk files, in playback order.
    """
    # Start from an empty output directory. Only an existing directory is
    # removed; other mkdir failures (e.g. permissions) surface unchanged
    # instead of being swallowed by a bare except.
    if os.path.isdir('chunks'):
        shutil.rmtree('chunks')
    os.mkdir('chunks')

    audio = AudioSegment.from_file(file, "wav")
    pieces = make_chunks(audio, chunk_length_ms)

    # Export every piece as its own WAV file.
    chunks_list = []
    for i, piece in enumerate(pieces):
        chunk_name = "./chunks/{0}chunk.wav".format(i)
        print("exporting", chunk_name)
        piece.export(chunk_name, format="wav")
        chunks_list.append(chunk_name)
    return chunks_list

In [8]:
def transcript(chunks_list, engine=None):
    """Transcribe audio chunks and return per-chunk and joined text.

    Parameters
    ----------
    chunks_list : list of str
        Paths of the audio chunk files, in playback order.
    engine : object, optional
        Speech-to-text engine exposing ``process_file(path)`` that returns a
        ``(text, words)`` pair. Defaults to the module-level ``leopard``.

    Returns
    -------
    tuple
        ``(sentences, full_text)`` where ``sentences`` is the list of
        per-chunk transcripts and ``full_text`` is those transcripts joined
        with single spaces.
    """
    if engine is None:
        engine = leopard

    sentences = []
    for path in chunks_list:
        # process_file also returns word-level metadata, which is not needed here.
        text, _words = engine.process_file(path)
        sentences.append(text)

    full_text = " ".join(sentences)
    # The original returned the undefined name `sentence`, raising NameError.
    return sentences, full_text

# Audio to text conversion

# Decreasing the search space

### Employing TF-IDF 

In [14]:
# Score the ten most frequent non-stop-word terms of the transcript to suggest search keywords.
# NOTE(review): `transcript` must be bound to the transcript *string* here. In the
# visible notebook that name is the function defined above, so some cell that is
# not shown must have rebound it (e.g. by unpacking the function's return value)
# before this cell runs — confirm, and ideally use a distinct variable name.
vectorizer = TfidfVectorizer(max_features=10, stop_words=stopwords.words('english'))
corpus = [transcript]
# toarray() returns a plain ndarray; todense() returns np.matrix, which NumPy discourages.
X = vectorizer.fit_transform(corpus).toarray()
df = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,algorithm,class,kind,know,like,okay,one,problem,right,time
0,0.318677,0.171595,0.171595,0.165467,0.165467,0.226751,0.214494,0.239008,0.76605,0.208366


In [15]:
# Flatten the TF-IDF scores to 1-D and fetch the matching term names.
X1 = np.array(X).reshape(-1)
X2 = vectorizer.get_feature_names_out().reshape(-1)
X2

array(['algorithm', 'class', 'kind', 'know', 'like', 'okay', 'one',
       'problem', 'right', 'time'], dtype=object)

In [16]:
# Pair every score with its term (one [score, term] row per feature),
# then order the rows by score, highest first.
arr = sorted(np.array([X1, X2]).T, key=lambda row: row[0], reverse=True)
arr

[array([0.7660498805166968, 'right'], dtype=object),
 array([0.31867675029494585, 'algorithm'], dtype=object),
 array([0.23900756272120938, 'problem'], dtype=object),
 array([0.22675076463294225, 'okay'], dtype=object),
 array([0.2144939665446751, 'one'], dtype=object),
 array([0.20836556750054153, 'time'], dtype=object),
 array([0.17159517323574008, 'class'], dtype=object),
 array([0.17159517323574008, 'kind'], dtype=object),
 array([0.1654667741916065, 'know'], dtype=object),
 array([0.1654667741916065, 'like'], dtype=object)]

# Encoding the various chunks

In [17]:

# Load the sentence-embedding model used for both the transcript chunks and the query.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [18]:

# Embed every per-chunk transcript.
# NOTE(review): `sentences` is never assigned at top level in the visible notebook;
# presumably it comes from a cell, not shown, that unpacks the transcript
# function's return value — confirm.
sentence_embeddings = model.encode(sentences)

In [19]:
# Sanity check: the recorded output below is (28, 384) — presumably (chunks, embedding size).
sentence_embeddings.shape

(28, 384)

In [20]:
# Search phrase to match against the chunk transcripts.
query = ['algorithm problem']

# Embed the query with the same model as the chunks, kept as a single 2-D row.
query_embedding = np.reshape(model.encode(query), (1, -1))
query_embedding[0].shape

(384,)

# Applying cosine similarity

In [36]:
# Cosine similarity between each chunk embedding and the query embedding:
# dot product divided by the product of the two vector norms.
similarities = [
    np.dot(sentence_embeddings[i], query_embedding[0])
    / (norm(sentence_embeddings[i]) * norm(query_embedding[0]))
    for i in range(len(sentences))
]

In [37]:
# Indices of the three highest similarities; argsort is ascending, so the best match is last.
np.argsort(similarities)[-3:]

array([7, 5, 6])