## Load file

In [1]:
# Read the transcript text file into a list of lines (`obj`).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the transcript was saved as UTF-8 — confirm.
with open('Apple_sept.mp3.txt', 'r', encoding='utf-8') as ins:
    obj = ins.readlines()

In [10]:
from nltk.probability import FreqDist
from nltk import tokenize
import pandas as pd
import nltk 
import plotly.express as px

# Split the transcript into word and punctuation tokens.
# NOTE(review): only the first line of the file (obj[0]) is analysed — this
# assumes the whole transcript sits on a single line; confirm.
words = tokenize.word_tokenize(obj[0])

# NLTK's full English stopword list (nltk.corpus.stopwords) is deliberately
# not applied; only the short manual exclusion list below is removed.
remove = [',', '.', 'a', 'the', 'and', 'to']  # manual exclusion list (lower-case)

# Compare case-insensitively so capitalised forms such as "The" or "And"
# (e.g. at the start of a sentence) are excluded as well. Tokens that are kept
# retain their original casing.
words = [x for x in words if x.lower() not in remove]

In [37]:
# Count how often each token occurs.
word_freq = FreqDist(words)

# Build the two-column (words, count) table directly from the (token, count)
# pairs. This avoids materialising an N x N intermediate frame and also works
# when the token list is empty. Row labels are the tokens' first-seen order,
# and sorting keeps those labels attached to their rows.
distrib = (
    pd.DataFrame(list(word_freq.items()), columns=['words', 'count'])
    .sort_values(by='count', ascending=False)
)
distrib

Unnamed: 0,words,count
2,iPhone,37
40,Pro,30
46,with,25
8,in,21
66,of,20
...,...,...
257,ca,1
255,brand,1
253,gives,1
252,many,1


## Tokenize by word count and analyse

In [38]:
import numpy as np

# Treemap of the first 100 rows of the frequency table. Tile size and tile
# colour both encode the token count.
top_rows = distrib[0:100]

# The colour scale is centred on the mean count of the whole table,
# not just the plotted rows.
count_midpoint = np.average(distrib['count'])

fig = px.treemap(
    top_rows,
    path=[px.Constant("Plot keywords"), 'words'],
    values='count',
    color='count',
    color_continuous_scale='viridis',
    color_continuous_midpoint=count_midpoint,
)

fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})


## Generate sentence embeddings and cluster by marketing emotions


In [40]:
import spacy
from time import perf_counter

from sentence_transformers import SentenceTransformer


# Load the sentence-embedding model and the spaCy pipeline, with a
# 'sentencizer' component appended to the pipeline.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('sentencizer')

# Run the pipeline over the first line of the loaded file.
lines = nlp(obj[0])

# Column-oriented record: each list receives one entry per sentence.
doc = {'id': [], 'text': [], 'embedding': []}
count = 0

start = perf_counter()
for sentence_id, sentence in enumerate(lines.sents):
    doc['id'].append(sentence_id)
    doc['text'].append(sentence.text)

    # Sentences are encoded one at a time, inside the timed region.
    vector = model.encode(sentence.text)
    doc['embedding'].append(vector)
    # Guard: every embedding is expected to have 384 components.
    assert vector.shape[0] == 384

    count = sentence_id + 1
end = perf_counter()

print(f"There are {count} sentences in the document and it took {end-start} seconds to encode")

There are 84 sentences in the document and it took 0.8496515830001954 seconds to encode


In [47]:
# Display the collected sentences as a table: one row per sentence, with its
# running id, its text, and its embedding vector.
pd.DataFrame(doc)

Unnamed: 0,id,text,embedding
0,0,a pixel image.,"[-0.029540338, 0.051602565, 0.031634264, -0.05..."
1,1,iPhone can shoot higher resolution photos in 2...,"[0.05018772, 0.01515901, 0.034717936, -0.03159..."
2,2,They can now quickly switch between these new ...,"[-0.009930151, -0.0053128777, 0.038176753, -0...."
3,3,iPhone 15 Pro also gets next generation portra...,"[-0.077535085, 0.053243104, 0.047085665, -0.07..."
4,4,The smallest details of our expression are inc...,"[0.013330734, 0.09756808, 0.043980382, 0.01519..."
...,...,...,...
79,79,"They're with us all the time, and we use them ...","[0.011874347, -0.03783722, 0.048084874, -0.001..."
80,80,Thank you for joining us.,"[0.030471608, -0.07470791, 0.031317618, -0.011..."
81,81,Have a great day.,"[-0.0603752, 0.02640435, 0.06287647, -0.043353..."
82,82,Talk to you soon.,"[0.016188892, -0.105991006, 0.08225385, 0.0338..."


## Try to chunk by sections in the video


#Set up search and a Streamlit interface; maybe experiment with another embedding model.

#Website for songwriters.