In [None]:
class FeatureExtractor:
    """Turn a document into a single vector by averaging its sentence embeddings."""

    def __init__(self):
        # `clean` must be the boolean False: the string 'False' is non-empty and
        # therefore truthy, which would switch the segmenter's cleaning ON.
        self.segmenter = pysbd.Segmenter(language='en', clean=False)
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def extract_features(self, text):
        """
        Extract features from text as the mean of its sentence embeddings.
        Parameters:
            - text: string representing a document
        Returns:
            - element-wise mean (over sentences) of the sentence embeddings
        """
        sentences = list(self.segmenter.segment(text))
        if not sentences:
            # No sentence found (e.g. empty text): embed the raw text as one
            # sentence so the result is still a vector rather than NaN.
            sentences = [text]

        # Encode all sentences in one call instead of one call per sentence.
        embeddings = self.model.encode(sentences)

        return np.mean(embeddings, axis=0)

In [None]:
def isChunkUseful(chunk, summary, metric, threshold):
    """
    Decide whether a chunk is worth keeping by scoring it against the summary.
    Parameters:
        - chunk: part of the transcript
        - summary: summary of a transcript
        - metric: function of arity 2, called as metric(chunk, summary), that returns a score
        - threshold: score below which the chunk is rejected
    Returns:
        - False if the score is strictly below the threshold, True otherwise
    """
    # Expressed as "not below" (rather than ">=") so that a score which does not
    # compare as smaller than the threshold is always accepted.
    is_below = metric(chunk, summary) < threshold
    return not is_below

In [None]:
def bertscore_f1_score(reference, candidate):
    """
    BERTScore score, see https://github.com/huggingface/datasets/tree/master/metrics/bertscore for API
    Parameters:
        reference: reference text
        candidate: text to be scored against the reference
    Returns:
        BERTScore f1 score of the (candidate, reference) pair
    """
    # Load the metric only once and keep it on the function object: this
    # function is used as a per-chunk metric, so reloading on every call
    # would repeat the setup work for each chunk.
    bertscore = getattr(bertscore_f1_score, "_cached_metric", None)
    if bertscore is None:
        bertscore = load_metric("bertscore")
        bertscore_f1_score._cached_metric = bertscore

    result = bertscore.compute(
        predictions=[candidate],
        references=[reference],
        lang="en",
        rescale_with_baseline=True
    )
    # A single pair was scored, so the first entry is the only one.
    return result['f1'][0]

Chunks that are not meaningful with respect to their summary are removed: a chunk is discarded when its score against the summary, computed with the aforementioned metric, falls below a given threshold. Since the summary is not available at runtime, we train a classifier to predict whether a chunk would pass this test.

In [None]:
threshold = 0.5
metric = bertscore_f1_score

# creation of the dataset for chunk classification
# Labels and features are built in a single pass so that each transcript is
# segmented only once.
# NOTE(review): `model`, `metadata_train`, `semantic_segmentation` and
# `get_transcription` are expected to come from earlier cells.

extractor = FeatureExtractor()
targets = []
features = []

for _, episode in metadata_train.iterrows():
    # The episode description is used as the summary the chunk is scored against.
    summary = episode.description
    chunk = semantic_segmentation(get_transcription(episode), model)

    # label: 1 when the chunk passes the metric threshold, 0 otherwise
    targets.append(1 if isChunkUseful(chunk, summary, metric, threshold) else 0)

    # extraction of the features
    features.append(extractor.extract_features(chunk))

y = np.array(targets)
X = np.array(features)

# splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# training the model
catboost = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
catboost.fit(X_train, y_train)

# Test the model
y_pred = catboost.predict(X_test)
# Score against the held-out labels only (y_pred has one entry per test row),
# and store the result under a new name so the `accuracy` function is not
# overwritten and the cell can be re-run.
# NOTE(review): assumes `accuracy(y_true, y_pred)` is defined earlier — confirm.
chunk_accuracy = accuracy(y_test, y_pred)

print(f"Accuracy of chunk selection: {round(chunk_accuracy,2)}")


In [None]:
# creation of gold set

metadata_path_gold = os.path.join(dataset_path, '150gold.tsv')
metadata_gold = pd.read_csv(metadata_path_gold, sep='\t')
metadata_gold = pd.merge(metadata_gold, metadata_train, left_on='episode id', right_on='episode_uri')

# letter grade -> numeric quality (higher is better)
quality = {
    'B': 1,
    'F': 2,
    'G': 3,
    'E': 4
}

# convert egfb columns to a quality score
# Series.map leaves grades that are not in `quality` (e.g. missing values) as
# NaN instead of raising KeyError.
egfb_columns = ['EGFB', 'EGFB.1', 'EGFB.2', 'EGFB.3', 'EGFB.4', 'EGFB.5']
egfb_to_quality = metadata_gold[egfb_columns].apply(lambda grades: grades.map(quality))

# remove rows with no quality > 1
egfb_to_quality = egfb_to_quality[(egfb_to_quality > 1).any(axis=1)]

# select the best summary for each episode
# best_egfb holds, per row, the name of the EGFB column with the highest score
# (the first such column on ties).
best_egfb = egfb_to_quality.idxmax(axis=1)
# The value taken is the one in the column immediately to the left of the
# winning EGFB column (presumably the summary that grade refers to — verify
# against the TSV layout). Rows are looked up by label, columns by position.
best_summary = [
    metadata_gold.loc[row_label].iloc[metadata_gold.columns.get_loc(egfb) - 1]
    for row_label, egfb in best_egfb.items()
]

# .copy() so the column assignments below write to an independent frame
metadata_gold = metadata_gold.loc[best_egfb.index].copy()
metadata_gold['best_summary'] = best_summary

# add transcripts
metadata_gold['transcript'] = metadata_gold.apply(get_transcription, axis=1)

data = metadata_gold[['episode id', 'transcript', 'best_summary']]
