# Third step in pipeline
# Get topics of predicted segments

In [7]:
import numpy as np
import pandas as pd

In [23]:
# For each segment, add probability desity vectors and find max index which matches with the topic 

def certainty_score(boundaries, proba_vectors, print_statment=False):

    try:
    
        certainty_ls = []

        for index, bound in enumerate(boundaries):
            if index == len(boundaries)-1:
                segment_before = proba_vectors[boundaries[index-1]:bound+1,:]
                segment_after = proba_vectors[bound+1:,:]
                segment_before_added = np.add.reduce(segment_before)
                segment_after_added = np.add.reduce(segment_after)
                max_topic_before = np.argmax(segment_before_added)  # index of largest element
                max_topic_after = np.argmax(segment_after_added)  # index of largest element
                norm_value_before = np.max(segment_before_added)/len(segment_before)  # value of largest element
                norm_value_after = np.max(segment_after_added)/len(segment_after)  # value of largest element
                certainty_ls.append([norm_value_before, len(segment_before)])
                certainty_ls.append([norm_value_after, len(segment_after)])
                if print_statment == True:
                    print(f'Segment before position {bound} has topic: {max_topic_before} and certainty score: {norm_value_before:.3f}')
                    print(f'Segment after position {bound} has topic: {max_topic_after} and certainty score: {norm_value_after:.3f}')
            elif index == 0:
                segment = proba_vectors[:bound+1,:]
                segment_added = np.add.reduce(segment)
                max_topic = np.argmax(segment_added)  # index of largest element
                norm_value = np.max(segment_added)/len(segment)  # value of largest element
                certainty_ls.append([norm_value, len(segment)])
                if print_statment == True:
                    print(f'Segment before position {bound} has topic: {max_topic} and certainty score: {norm_value:.3f}')
            else:
                segment = proba_vectors[boundaries[index-1]:bound+1,:]
                segment_added = np.add.reduce(segment)
                max_topic = np.argmax(segment_added)  # index of largest element
                norm_value = np.max(segment_added)/len(segment)  # value of largest element
                certainty_ls.append([norm_value, len(segment)])
                if print_statment == True:
                    print(f'Segment before position {bound} has topic: {max_topic} and certainty score: {norm_value:.3f}')

        # Calculate weighted average certainty
        certainty_arr = np.array(certainty_ls)
        weights = certainty_arr[:,1]/sum(certainty_arr[:,1])
        weighted_average = sum(certainty_arr[:,0] * weights)
        if print_statment == True:
            print(f'Weighted average certainty: {weighted_average:.1%}')

    except IndexError:
        return None

    return weighted_average

In [42]:
# Load boundaries and probability desiti vectors of corresponding transcript

# Inputs

# hdbscan_model_number = 50
# transcript_ref = '13NDTKL5ZGs8cb8dojW3bz'

model_list = [15, 50, 200]
transcript_list = ['13NDTKL5ZGs8cb8dojW3bz', '6preEOWrgR9eRr938upFgv',
       '2Bp5vd9GAmEpZzjEtGQBFD', '0ZGQ63222rqX5TD5ZrMmcN',
       '19W5dgUcFseQZBmcVF4coc', '3iydyD9rAb1f6rmvmgpwS4',
       '0bXWB28GwN8OiqC1ykRrRX', '28IWswylk2FvkebOehoCkL',
       '3DR5Qa40Mc17AiBYfmC29U', '3RT2j2BG8ILNYKjxsNhfvZ',
       '3p9FLEH5V5sCGHhGubaYZc', '5Sg6efUjypR4m6p9eYBXpm',
       '4y67J0Fmgm5L7TPPsUunwo', '1VBbCB6ja5pPdU2wrBy27N',
       '7A7swZJL0AtFghauiGLadV', '4pFaG2QLnDr95gqDQFEWoh',
       '53DrbE5nPJskpPT0PtOi9O', '5ts4p0QlyePWCgIB2W1wLf',
       '7mv5E2yb2yVQU34OiQ1vqv', '4DUIcbw3EZpeYUC2mcxV0D']

# get dataframe of outputs

wa_df = pd.DataFrame(columns=['episode_id', 'mpts', 'weighted_average'])

for transcript_ref in transcript_list:
       for hdbscan_model_number in model_list:
              bound = np.load(f'../Thesis/evaluation/boundaries_msize{hdbscan_model_number}_{transcript_ref}.npy')
              proba_matrix = np.load(f'../Thesis/annotated_probabilities_msize{hdbscan_model_number}/proba_dens_vec_msize{hdbscan_model_number}_{transcript_ref}.npy')
              weighted_avg = certainty_score(bound, proba_matrix)
              output = pd.DataFrame({'episode_id': [transcript_ref], 'mpts': [hdbscan_model_number], 'weighted_average': [weighted_avg]})
              wa_df = pd.concat([wa_df, output])

wa_df = wa_df.reset_index(drop=True)

In [47]:
# Group by model and get average
wa_df.groupby('mpts')['weighted_average'].mean().sort_values(ascending=False)

mpts
50     0.195989
200    0.119781
15     0.119617
Name: weighted_average, dtype: float64

In [46]:
# Group by transcript and get average
wa_df.groupby('episode_id')['weighted_average'].mean().sort_values(ascending=False)

episode_id
7mv5E2yb2yVQU34OiQ1vqv    0.210567
3iydyD9rAb1f6rmvmgpwS4    0.196039
5Sg6efUjypR4m6p9eYBXpm    0.168394
7A7swZJL0AtFghauiGLadV    0.160519
28IWswylk2FvkebOehoCkL    0.157704
0bXWB28GwN8OiqC1ykRrRX    0.154417
5ts4p0QlyePWCgIB2W1wLf    0.154240
3p9FLEH5V5sCGHhGubaYZc    0.152576
13NDTKL5ZGs8cb8dojW3bz    0.148709
3DR5Qa40Mc17AiBYfmC29U    0.147334
0ZGQ63222rqX5TD5ZrMmcN    0.141892
6preEOWrgR9eRr938upFgv    0.138632
1VBbCB6ja5pPdU2wrBy27N    0.136397
4DUIcbw3EZpeYUC2mcxV0D    0.134069
4y67J0Fmgm5L7TPPsUunwo    0.128439
3RT2j2BG8ILNYKjxsNhfvZ    0.125333
2Bp5vd9GAmEpZzjEtGQBFD    0.112319
19W5dgUcFseQZBmcVF4coc    0.111672
4pFaG2QLnDr95gqDQFEWoh    0.108432
53DrbE5nPJskpPT0PtOi9O    0.070612
Name: weighted_average, dtype: float64

In [44]:
# view full df to analyse transcripts
wa_df

Unnamed: 0,episode_id,mpts,weighted_average
0,13NDTKL5ZGs8cb8dojW3bz,15,0.145978
1,13NDTKL5ZGs8cb8dojW3bz,50,0.21651
2,13NDTKL5ZGs8cb8dojW3bz,200,0.083639
3,6preEOWrgR9eRr938upFgv,15,0.098644
4,6preEOWrgR9eRr938upFgv,50,0.185714
5,6preEOWrgR9eRr938upFgv,200,0.131537
6,2Bp5vd9GAmEpZzjEtGQBFD,15,0.140376
7,2Bp5vd9GAmEpZzjEtGQBFD,50,
8,2Bp5vd9GAmEpZzjEtGQBFD,200,0.084262
9,0ZGQ63222rqX5TD5ZrMmcN,15,0.093116
