In [None]:
class FeatureExtractor:
    """Turn a document into one fixed-size vector by averaging sentence embeddings."""

    def __init__(self):
        # `clean` must be the boolean False: the string 'False' is truthy and
        # would silently switch the segmenter's text cleaning on.
        self.segmenter = pysbd.Segmenter(language='en', clean=False)
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def extract_features(self, text):
        """
        Extract features from text as the mean of its sentence embeddings.

        The text is split into sentences with the segmenter, every sentence
        is embedded with the sentence-transformer model, and the embeddings
        are averaged element-wise.

        Parameters:
            - text: string representing a document
        Returns:
            - 1-D numpy array with the averaged sentence embedding; an
              all-zero vector of the model's embedding size when the text
              contains no sentence
        """
        sentences = self.segmenter.segment(text)
        if not sentences:
            # Averaging an empty list would yield NaN (plus a RuntimeWarning)
            # and a scalar that cannot be stacked with the other feature rows.
            return np.zeros(
                self.model.get_sentence_embedding_dimension(), dtype=np.float32
            )

        # Encode all sentences in one batched call instead of one call each.
        embeddings = self.model.encode(sentences)

        return np.mean(embeddings, axis=0)

In [None]:
def isChunkUseful(chunk, summary, metric, threshold):
    """
    Decide whether a chunk is useful with respect to its summary.

    Parameters:
        - chunk: part of the transcript
        - summary: summary of a transcript
        - metric: function of arity 2, called as metric(chunk, summary), that scores the pair
        - threshold: minimum score for the chunk to be considered useful
    Returns:
        - False if the score is strictly below the threshold, True otherwise
    """
    # Written as "not below" rather than ">=" so that a score which compares
    # as neither below nor above the threshold (e.g. NaN) still yields True.
    return not (metric(chunk, summary) < threshold)

In [None]:
def bertscore_f1_score(reference, candidate):
    """
    BERTScore score, see https://github.com/huggingface/datasets/tree/master/metrics/bertscore for API

    The metric object is created on the first call and reused afterwards, so
    scoring many pairs does not reload it each time.

    Parameters:
        reference: reference text
        candidate: generated text scored against the reference
    Returns:
        BERTScore f1 score of the (candidate, reference) pair
    """
    # Cache the loaded metric on the function object: this function is called
    # once per dataset row, and reloading the metric on every call is wasteful.
    bertscore = getattr(bertscore_f1_score, "_metric", None)
    if bertscore is None:
        bertscore = load_metric("bertscore")
        bertscore_f1_score._metric = bertscore

    result = bertscore.compute(
        predictions=[candidate],
        references=[reference],
        lang="en",
        rescale_with_baseline=True
    )
    # A single pair was scored, so the per-pair f1 list has exactly one entry.
    return result['f1'][0]

Chunks that are not meaningful with respect to their summary are removed. This is done by discarding every chunk whose score against its summary, computed with the aforementioned metric, falls below a given threshold. At runtime the summary is not available, so we train a classifier to predict from the chunk alone whether it is useful.

In [None]:
# Assumes `df` is a DataFrame with one row per chunk and `chunk` / `summary` columns.

threshold = 0.5
metric = bertscore_f1_score

# Number of rows; `df.size` would be rows * columns and overrun the data.
num_chunks = len(df)

# Plain lists give positional access regardless of the DataFrame's index labels.
chunks = df['chunk'].tolist()
summaries = df['summary'].tolist()

# creation of the dataset for chunk classification:
# label 1 when the chunk scores at least `threshold` against its summary, else 0

targets = [
    1 if isChunkUseful(chunk, summary, metric, threshold) else 0
    for chunk, summary in zip(chunks, summaries)
]

y = np.array(targets)

# extraction of the features: one embedding vector per chunk
# (each chunk is passed on its own, not the whole column)

extractor = FeatureExtractor()
features = [extractor.extract_features(chunk) for chunk in chunks]

X = np.array(features)

# splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# training the model
model = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)
# Compare against the held-out labels only (y_test, not the full y);
# ravel() guards against predictions coming back as a column vector.
accuracy = float(np.mean(np.asarray(y_pred).ravel() == y_test))

print(f"Accuracy of chunk selection: {round(accuracy,2)}")
