In [1]:
import speech_recognition as sr 
import os 
from pydub import AudioSegment
from pydub.silence import split_on_silence 
from transformers import AutoTokenizer

# Load the tokenizer for the "paulowoicho/t5-podcast-summarisation" checkpoint.
# NOTE(review): `tokenizer` is not referenced by any later cell visible here —
# confirm it is still needed before keeping this load.
tokenizer = AutoTokenizer.from_pretrained("paulowoicho/t5-podcast-summarisation")



# text transcription (speech to text)

In [2]:
# Shared recognizer instance; get_large_audio_transcription reads this
# module-level name for both recording and recognition.
r = sr.Recognizer()

In [3]:
def get_large_audio_transcription(path, min_silence_len=500, silence_offset_db=14,
                                  keep_silence=500, folder_name="audio-chunks"):
    """
    Transcribe a large WAV file by splitting it on silence and running
    speech recognition on each resulting chunk.

    Parameters
    ----------
    path : str
        Path of the WAV file to transcribe.
    min_silence_len : int, optional
        Minimum length of silence, in milliseconds, that separates two
        chunks. Experiment with this value for your target audio file.
    silence_offset_db : float, optional
        How far below the recording's average loudness (``sound.dBFS``)
        audio must fall to count as silence.
    keep_silence : int, optional
        Amount of silence, in milliseconds, kept around each chunk.
    folder_name : str, optional
        Directory the chunk files (``chunk1.wav``, ``chunk2.wav``, ...) are
        exported to. It is created when missing; existing chunk files with
        the same names are overwritten.

    Returns
    -------
    str
        The recognised text of every chunk, each capitalised and followed
        by ". ", concatenated in chunk order. Chunks in which no speech
        could be recognised contribute nothing.

    Notes
    -----
    Only ``sr.UnknownValueError`` is handled; any other exception raised
    while loading, exporting or recognising audio propagates to the caller.
    Uses the module-level recognizer ``r``.
    """
    # open the audio file using pydub
    sound = AudioSegment.from_wav(path)
    # split the audio wherever it stays quiet for at least `min_silence_len` ms
    chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=sound.dBFS - silence_offset_db,
        keep_silence=keep_silence,
    )
    # create the chunk directory; exist_ok avoids the check-then-create race
    os.makedirs(folder_name, exist_ok=True)

    transcribed = []
    for i, audio_chunk in enumerate(chunks, start=1):
        # export the chunk so speech_recognition can read it back from disk
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
        # the audio is in memory now, so the file is closed before recognition
        try:
            text = r.recognize_google(audio_listened)
        except sr.UnknownValueError:
            # nothing intelligible in this chunk: deliberately skip it
            continue
        transcribed.append(f"{text.capitalize()}. ")
    # joining once avoids rebuilding the string on every chunk
    return "".join(transcribed)

In [4]:
# Input source switch: True transcribes the audio recording, False loads an
# already prepared transcript from disk.
USE_AUDIO_INPUT = False

if USE_AUDIO_INPUT:
    text = get_large_audio_transcription("AudioFiles/Convo5.wav")
else:
    # os.path.join keeps the path portable; a literal with backslash
    # separators only resolves on Windows.
    with open(os.path.join("testTexts", "GA.txt"), "r", encoding="utf8") as file:
        text = file.read()
print(text)

I was given the task to use the idea of genetic algorithms to be able to solve a problem. The problem that I solved, granted it does not have real world applications, was to get it to figure out a string that the user put in. Since most of my stuff has been done with functional programming, I thought that this would be a good time to use object oriented programming to solve this problem. 

The main idea of genetic algorithms is to have many entities of something, in this case it would be the random strings, and allow the most ‘fit’ entities to survive and reproduce. Doing over many iterations allows desired traits to be passed over, and hopefully will create an entity that is the most efficient in its task. This imitates natural section in evolution proposed by Charles Darwin.

Coding this was split up into two parts, the first with coding the entity object followed by the population object. The entity object containes attributes relating to a single beings such as fitnessLevel, entity

# summarization - Text Ranking and RAKE

### cleaning

In [5]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources named 'punkt' and 'stopwords'; this cell imports
# sent_tokenize and the stopwords corpus from nltk for the cleaning steps below.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nigel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nigel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Split the transcript into a list of sentences; later cells clean, embed and
# rank these sentences.
sentences = sent_tokenize(text)

In [7]:
# Contraction rules used for filtering, applied in order. Every pattern accepts
# a straight (') or a typographic (\u2019) apostrophe. The irregular forms come
# first so the generic "n't" rule cannot turn them into "wo not" / "ca not".
_CONTRACTION_RULES = (
    (r"\bwon['\u2019]t\b", "will not"),
    (r"\bcan['\u2019]t\b", "can not"),
    (r"n['\u2019]t", " not"),
    (r"['\u2019]re", " are"),
    (r"['\u2019]s", " is"),
    (r"['\u2019]d", " would"),
    (r"['\u2019]ll", " will"),
    (r"['\u2019]t", " not"),
    (r"['\u2019]ve", " have"),
    (r"['\u2019]m", " am"),
)


def decontracted(phrase):
    """
    Expand common English contractions in `phrase`.

    Parameters
    ----------
    phrase : str
        Text to expand. Matching is case-sensitive and the patterns are
        lower-case, so callers are expected to lower-case the text first.

    Returns
    -------
    str
        `phrase` with contractions replaced by their expanded forms, e.g.
        "don't" -> "do not", "won't" -> "will not", "they're" -> "they are".
        Every "'s" is expanded to " is", including possessives.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [8]:
def sentCleaner(text):
    """
    Normalise a collection of sentences before they are embedded.

    Each sentence is lower-cased, has its contractions expanded with
    `decontracted`, has English stop words removed (whitespace-separated
    tokens only), and finally has every non-letter character replaced by a
    single space.

    Parameters
    ----------
    text : iterable of str
        The sentences to clean (despite the name, not one single string).

    Returns
    -------
    pandas.Series
        One cleaned string per input sentence, in input order.
    """
    # a set makes the membership test in the filter below O(1)
    stop_words = set(stopwords.words('english'))

    cleaned = []
    for sentence in text:
        expanded = decontracted(sentence.lower())
        kept = [word for word in expanded.split() if word not in stop_words]
        cleaned.append(" ".join(kept))

    # regex=True is required: recent pandas versions treat the pattern as a
    # literal string by default, which would leave punctuation untouched.
    # dtype=object keeps the .str accessor usable when `cleaned` is empty.
    return pd.Series(cleaned, dtype=object).str.replace("[^a-zA-Z]", " ", regex=True)

# Clean every tokenised sentence; the bare name on the last line displays the
# resulting Series as the cell output.
clean_sentences = sentCleaner(sentences)
clean_sentences

0     given task use idea genetic algorithms able so...
1     problem solved  granted real world application...
2     since stuff done functional programming  thoug...
3     main idea genetic algorithms many entities som...
4     many iterations allows desired traits passed o...
5     imitates natural section evolution proposed ch...
6     coding split two parts  first coding entity ob...
7     entity object containes attributes relating si...
8     population object contain attibutes targetstri...
9     entity string number characters populations ta...
10    ties fitness function  defined percentage char...
11    example would targetstring  cupcake  entity  j...
12    something note function case sensitive meaning...
13    attributes selectforbreeding mutationrate deci...
14    functions handle ideas done population class p...
15    entities selected put breed pop reproducted mu...
16    selection function gives entity generation fit...
17    followed setting entity selectforbreeding 

In [9]:
# Join the cleaned sentences into one space-separated string; the RAKE cell
# further down extracts keywords from this string.
clean_sentences_sentence = " ".join(clean_sentences)
clean_sentences_sentence

'given task use idea genetic algorithms able solve problem . problem solved  granted real world applications  get figure string user put in . since stuff done functional programming  thought would good time use object oriented programming solve problem . main idea genetic algorithms many entities something  case would random strings  allow  fit  entities survive reproduce . many iterations allows desired traits passed over  hopefully create entity efficient task . imitates natural section evolution proposed charles darwin . coding split two parts  first coding entity object followed population object . entity object containes attributes relating single beings fitnesslevel  entitystring  selectionforbreeding mutationrate . population object contain attibutes targetstring  topscore  topentity  vectors entitypopulation breedingpopulation . entity string number characters populations targetstring . ties fitness function  defined percentage characters correct possition . example would targe

### vector representation and ranking

In [10]:
import wget
import networkx as nx
from zipfile import ZipFile
from sklearn.metrics.pairwise import cosine_similarity
from math import ceil

In [12]:
#get files and extract them

# Download the archive only when no local copy exists.
if not os.path.isfile("./glove.6B.zip"):
    print("file doesnt exists, downloading it now")
    wget.download("http://nlp.stanford.edu/data/glove.6B.zip")

#extract to proper file
# Create the target directory when missing, then extract only if it is empty.
# Doing this explicitly (instead of a bare `except:` around os.listdir) lets
# genuine failures such as a corrupt archive surface unchanged.
os.makedirs("./vectorfile", exist_ok=True)
if len(os.listdir("./vectorfile")) == 0:
    with ZipFile("glove.6B.zip", 'r') as zip_ref:
        print("unzipping")
        zip_ref.extractall("vectorfile")

unzipping


In [13]:
# Extract word vectors
# Maps each token (first field of a line) to its float32 coefficient array
# (the remaining fields of that line).
word_embeddings = {}
# `with` guarantees the file is closed even if a line fails to parse
with open('vectorfile/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

len(word_embeddings)

400000

In [14]:
# Build one 100-dimensional vector per cleaned sentence by averaging the
# embeddings of its words; words missing from `word_embeddings` count as zeros.
sentence_vectors = []
for i in clean_sentences:
    words = i.split()
    if words:
        # the 0.001 in the denominator is kept so the averages stay unchanged
        v = sum(word_embeddings.get(w, np.zeros((100,))) for w in words) / (len(words) + 0.001)
    else:
        # Empty or whitespace-only sentence: use a zero vector. Testing `words`
        # rather than len(i) matters because a sentence of only spaces would
        # otherwise produce the scalar 0.0, which cannot be reshaped later.
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [15]:
# find similarities
# sim_mat[i][j] is the cosine similarity between sentence vectors i and j.
# A single vectorised call computes every pair at once instead of invoking
# cosine_similarity separately for each (i, j) combination.
if len(sentence_vectors) > 0:
    sim_mat = cosine_similarity(np.vstack(sentence_vectors).astype(np.float64))
    # a sentence is not compared with itself: the diagonal stays at zero
    np.fill_diagonal(sim_mat, 0.0)
else:
    # nothing to compare: keep an empty matrix
    sim_mat = np.zeros([0, 0])
    

In [16]:
# Build a graph from the sentence similarity matrix and run PageRank on it.
# The ranking cell looks scores up by sentence position (scores[i]).
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [17]:
# Pair every sentence with its score; sorting the (score, sentence) tuples in
# reverse order puts the highest-scoring sentences first.
ranked_sentences = sorted(
    zip((scores[position] for position in range(len(sentences))), sentences),
    reverse=True,
)
# display the top 10% of the sentences (rounded up)
top_count = ceil(len(ranked_sentences) * 0.1)
for ranked_entry in ranked_sentences[:top_count]:
    print(ranked_entry[1])

We need to first make the population an even number of entities so there isn't just one lone parent left.
The main idea of genetic algorithms is to have many entities of something, in this case it would be the random strings, and allow the most ‘fit’ entities to survive and reproduce.
With the children now created, they are put into the "new" generation (in code, when breed is called the entity_pop vector is cleared).
The problem that I solved, granted it does not have real world applications, was to get it to figure out a string that the user put in.


### RAKE (rapid automatic keyword extraction)

In [18]:
from rake_nltk import Rake

In [19]:
# NOTE(review): `filepath` is assigned but not used by any cell visible here —
# confirm whether it can be dropped.
filepath = ".\\testTexts\\GA.txt"
# Default-configured RAKE extractor, fed the joined *cleaned* sentences.
rake_obj = Rake()
rake_obj.extract_keywords_from_text(clean_sentences_sentence)

In [25]:
# Show the first ten ranked phrases extracted from the cleaned text.
rake_obj.get_ranked_phrases()[:10]

['selection function gives entity generation fitness score setting fittest entity score top score string top entity',
 'main idea genetic algorithms many entities something case would random strings allow fit entities survive reproduce',
 'since stuff done functional programming thought would good time use object oriented programming solve problem',
 'followed setting entity selectforbreeding either true false entities fitness meets threshold top entities score',
 'functions handle ideas done population class population consists multiple entities stored entity pop vector',
 'set like becuase none entities scored higher base threshold breeding population empty',
 'attributes selectforbreeding mutationrate decide entity going reproduce reproducting going mutation kind',
 'coding split two parts first coding entity object followed population object',
 'entity object containes attributes relating single beings fitnesslevel entitystring selectionforbreeding mutationrate',
 'problem solved g

In [26]:
# Second extractor, this time fed the raw `text` rather than the cleaned
# sentences. min_length/max_length presumably bound the number of words per
# phrase — TODO confirm against the rake_nltk documentation.
adv_rake_obj = Rake(min_length = 2, max_length=4)
adv_rake_obj.extract_keywords_from_text(text)
adv_rake_obj.get_ranked_phrases()[:10]

['one lone parent left',
 'use object oriented programming',
 '‘ fit ’ entities',
 '‘ c ’',
 '“ jupcake ”,',
 '“ cupcake ”',
 'real world applications',
 'new children must',
 'lowest scoring enetitiy',
 'imitates natural section']