In [222]:
from transcript_analyser import Transcript
from transcript_analyser.utils.utils import Utils

In [223]:
import json
with open('transcript_analyser/sample_data/sample_01.json', 'r') as f:
    json_obj = json.load(f)
    f.close()
transcript = Transcript(json_obj)

In [224]:
transcript.text

" Uh huh  mary  hi  hello, I'm Susan Thompson Resource manager.  Hi, I'm mary Hanson and I'm applying for one of your kitchen jobs.  Great,  here's a copy of my resume.  Great, have a seat mary.Thank you.  Mary, do you have any experience working in the kitchen?  No,  but I want to learn,  I work hard and  I cook a lot at home.  Okay,  well tell me about yourself.  Well  I love to learn new things.  I'm very  organized  and  I follow directions. Exactly.  That's why my boss at my last  job  made me a trainer  and the company actually gave me a special certificate  for coming to work  on time  every day for a year  and  I'm taking an  english class to  improve my writing skills.  That's.Great.  Why did you leave your last job?  It was  graveyard  and  I need to work  days.  Oh I see.  Well what hours can you.Work  from eight am until five pm.  Okay well do you have any questions for me mary?  Yes.  What  kind of training is needed?  Not a lot.  Most new workers can learn everything the 

In [225]:
from nltk.tokenize import sent_tokenize
stop_words = Utils.load_stop_words()
sents = sent_tokenize(transcript.text)
sents = [
    ' '.join([word.lower() for word in sent.split()])
    for sent in sents
]

sents = [
    sent for sent in sents
    if len(sent.split()) > 4
]

In [226]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from numpy.linalg import svd
import numpy as np

In [227]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words, min_df=2)
bows = vectorizer.fit_transform(sents)

In [228]:
bows = bows.todense().A.T
bows.shape

(12, 20)

In [229]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['company', 'day', 'great', 'job', 'kitchen', 'learn', 'lot',
       'mary', 'questions', 'things', 'time', 'work'], dtype=object)

In [230]:
u, sigma, vt = singular_value_decomposition(bows, full_matrices=False)

print(f'resulted array after using SVD on the data: {u.shape, sigma.shape, vt.shape}')

resulted array after using SVD on the data: ((12, 12), (12,), (12, 20))


In [231]:
n_concepts = 3
n_sentences = 5

Gong and Liu (2001)

In [232]:
vt_v1 = vt[:n_concepts,].copy()
indices = []
for i in range(n_concepts):
    sent_index = np.argmax(np.abs(vt_v1[i, :]))
    indices.append(sent_index)
for index in sorted(indices):
    print(sents[index])

uh huh mary hi hello, i'm susan thompson resource manager.
why did you leave your last job?
it was graveyard and i need to work days.


Steinberger and Jezek (2004)

In [233]:
vt_v2 = (vt.copy())[:n_concepts, ]
sigma_v2 = (sigma.copy())[:n_concepts]
indices = []

for i in range(vt_v2.shape[1]):
    vt_v2[:, i] = np.multiply(vt_v2[:, i], sigma_v2)

for i in range(vt_v2.shape[0]):
    sent_index = np.argmax(np.abs(vt_v2[i, :]))
    indices.append(sent_index)
for index in sorted(indices):
    print(sents[index])

uh huh mary hi hello, i'm susan thompson resource manager.
why did you leave your last job?
it was graveyard and i need to work days.


In [234]:
vt_v3 = (vt.copy())[:n_concepts, ]
sigma_v3 = (sigma.copy())[:n_concepts]

indices = []
sentence_scores_v3 = []
for sent in vt_v3.T:
    sentence_scores_v3.append(np.dot(sent, sigma_v3))
indices = np.argsort(sentence_scores_v3)[:n_sentences]
for index in sorted(indices):
    print(sents[index])

okay, well tell me about yourself.
why did you leave your last job?
what kind of training is needed?
well i have a few more interviews to do today but i will call you tomorrow if you get the job.
thank you so much for your time.


Cross method

In [235]:
vt_v4 = (vt.copy())[:n_concepts, ]
sigma_v4 = (sigma.copy())[:n_concepts]

concepts_mean_scores = np.mean(vt_v4, axis = 1)
for concept_id in range(vt_v4.shape[0]):
    vt_v4[concept_id, :] = np.multiply(vt_v4[concept_id, :], vt_v4[concept_id, :] > concepts_mean_scores[concept_id])
    
sentence_scores = np.sum(vt_v4, axis = 0)
sentence_indices = np.argsort(sentence_scores)[:n_sentences]
for index in sorted(sentence_indices):
    print(sents[index])

great, here's a copy of my resume.
okay, well tell me about yourself.
i'm very organized and i follow directions.
what kind of training is needed?
thank you so much for your time.


In [251]:
vt_v5 = (vt.copy())[:n_concepts, ]
sigma_v5 = (sigma.copy())[:n_concepts]

concepts_mean_scores = np.mean(vt_v5, axis = 1)
for concept_id in range(vt_v5.shape[0]):
    vt_v5[concept_id, :] = np.multiply(vt_v5[concept_id, :], vt_v5[concept_id, :] > concepts_mean_scores[concept_id])
    

concept_concept_matrix = np.zeros((n_concepts, n_concepts))
for i in range(n_concepts):
    for j in range(n_concepts):
        a = vt_v5[i, :]
        b = vt_v5[j, :]
        both_non_zero_mask = np.multiply(a != 0, b != 0)
        concept_concept_matrix[i, j] = np.sum(np.multiply(both_non_zero_mask, a)) + np.sum(np.multiply(both_non_zero_mask, b))

strength_values = np.sum(concept_concept_matrix, axis = 1)