In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import json
from tqdm.auto import tqdm
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_premises_text(gold, raw_data_location):
    """Attach the premise text of the referenced trial section(s) to each entry.

    For every entry in ``gold`` the file ``<raw_data_location>/<Primary_id>.json``
    is read and the strings stored under the entry's ``Section_id`` are joined
    with single spaces into ``Primary_premise``. Entries whose ``Type`` is
    ``'Comparison'`` additionally get ``Secondary_premise``, built the same way
    from the file named by ``Secondary_id``.

    Args:
        gold: dict mapping an entry id to a dict holding ``Section_id``,
            ``Primary_id`` and ``Type`` (plus ``Secondary_id`` for comparison
            entries). The entry dicts are modified in place.
        raw_data_location: directory that contains one ``<trial id>.json``
            file per trial.

    Returns:
        None. The premises are written into ``gold``.

    Raises:
        FileNotFoundError: if a referenced trial file does not exist.
        KeyError: if an entry lacks a required field or, assuming each trial
            file holds a JSON object, the requested section is absent.
    """
    # Several entries usually reference the same trial, so each file is read
    # and parsed at most once per call.
    trial_cache = {}

    def _load_trial(trial_id):
        """Return the parsed JSON content of one trial file (cached)."""
        if trial_id not in trial_cache:
            path = os.path.join(raw_data_location, f'{trial_id}.json')
            # Explicit encoding: the platform default (a legacy code page on
            # Windows) can fail on or mangle non-ASCII characters in the JSON.
            with open(path, encoding='utf-8') as f:
                trial_cache[trial_id] = json.load(f)
        return trial_cache[trial_id]

    for entry in gold.values():
        section = entry["Section_id"]

        primary = _load_trial(entry["Primary_id"])
        entry["Primary_premise"] = ' '.join(primary[section])

        if entry["Type"] == 'Comparison':
            secondary = _load_trial(entry["Secondary_id"])
            entry["Secondary_premise"] = ' '.join(secondary[section])

# Build the paths with os.path.join: a literal backslash is a path separator
# only on Windows, so 'data\\raw\\...' cannot be resolved on Linux or macOS.
raw_dir = os.path.join('data', 'raw')
ct_dir = os.path.join(raw_dir, 'CT')

# Pass the encoding explicitly instead of relying on the platform default.
with open(os.path.join(raw_dir, 'train.json'), encoding='utf-8') as f:
    train_data = json.load(f)
with open(os.path.join(raw_dir, 'dev.json'), encoding='utf-8') as f:
    dev_data = json.load(f)
with open(os.path.join(raw_dir, 'test.json'), encoding='utf-8') as f:
    test_data = json.load(f)

# Add 'Primary_premise' (and 'Secondary_premise' for comparison entries)
# to every entry of each split, in place.
get_premises_text(train_data, ct_dir)
get_premises_text(dev_data, ct_dir)
get_premises_text(test_data, ct_dir)

In [9]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Helper function to prepare data for vectorization
def prepare_data(data):
    """Return one combined premise string per entry of ``data``.

    Each string is the entry's ``'Primary_premise'``; when the entry also has a
    ``'Secondary_premise'`` key, that text is appended after a single space.
    The result follows the iteration order of ``data``.
    """
    def _combined_premise(entry):
        primary = entry['Primary_premise']
        if 'Secondary_premise' not in entry:
            return primary
        return primary + ' ' + entry['Secondary_premise']

    return [_combined_premise(entry) for entry in data.values()]

# Build the combined premise texts for the train, dev and test splits
train_texts, dev_texts, test_texts = (
    prepare_data(split) for split in (train_data, dev_data, test_data)
)

# Fit the TF-IDF vectorizer on the training texts only (fit() returns the
# fitted vectorizer itself)
vectorizer = TfidfVectorizer().fit(train_texts)

# Helper function to summarize a text
def summarize_text(text, vectorizer, max_words=256, ratio=0.3):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    ``text`` is split with ``nltk.sent_tokenize``. Every sentence is scored by
    the sum of the weights ``vectorizer.transform`` assigns to it, the top
    ``ratio`` share of sentences (at least one) is kept in original order, and
    the result is capped at ``max_words`` space-separated tokens.

    Args:
        text: the text to summarise.
        vectorizer: fitted object whose ``transform`` takes a list of strings
            and returns a 2-D matrix with one row per string.
        max_words: maximum number of space-separated tokens in the summary.
            The cap is applied to every result, including single-sentence
            texts.
        ratio: fraction of the sentences to keep; defaults to 0.3.

    Returns:
        The summary string; ``''`` when no sentence is found in ``text``.
    """
    sentences = nltk.sent_tokenize(text)

    if len(sentences) <= 1:
        # Nothing to rank; this also avoids transforming an empty list.
        summary = sentences[0] if sentences else ''
    else:
        tfidf_matrix = vectorizer.transform(sentences)
        # Sum each row directly on the matrix instead of densifying it with
        # .toarray(); np.asarray(...).ravel() flattens the (n, 1) result that
        # sparse matrices return.
        sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

        # Keep at least one sentence: a count of 0 would turn the slice below
        # into [-0:], which selects every sentence instead of none.
        num_sentences = max(1, int(len(sentences) * ratio))
        top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]

        # Re-sort the selected indices so the summary follows the source order.
        summary = ' '.join(sentences[i] for i in sorted(top_sentence_indices))

    summary_words = summary.split(' ')
    if len(summary_words) > max_words:
        summary = ' '.join(summary_words[:max_words])

    return summary

# Function to process and summarize data
def process_and_summarize(data, vectorizer):
    """Store an extractive summary of the combined premise on every entry.

    For each entry of ``data`` the ``'Primary_premise'`` text (followed by a
    space and the ``'Secondary_premise'`` text when that key is present) is
    passed to ``summarize_text`` together with ``vectorizer``; the result is
    written to the entry's ``'Extractive_premise'`` key. ``data`` is modified
    in place and nothing is returned.
    """
    for record in data.values():
        has_secondary = 'Secondary_premise' in record
        full_premise = (
            record['Primary_premise'] + ' ' + record['Secondary_premise']
            if has_secondary
            else record['Primary_premise']
        )
        record['Extractive_premise'] = summarize_text(full_premise, vectorizer)

# Add an 'Extractive_premise' summary to every entry of the train, dev and
# test splits (each split is updated in place)
for split in (train_data, dev_data, test_data):
    process_and_summarize(split, vectorizer)

In [10]:
# Preview the first few entries only; echoing the whole dict writes the entire
# dataset into the notebook output.
dict(list(train_data.items())[:3])

{'5bc844fc-e852-4270-bfaf-36ea9eface3d': {'Type': 'Comparison',
  'Section_id': 'Intervention',
  'Primary_id': 'NCT01928186',
  'Secondary_id': 'NCT00684983',
  'Statement': 'All the primary trial participants do not receive any oral capecitabine, oral lapatinib ditosylate or cixutumumab IV, in conrast all the secondary trial subjects receive these.',
  'Label': 'Contradiction',
  'Primary_premise': 'INTERVENTION 1:    Diagnostic (FLT PET)   Patients with early stage, ER positive primary breast cancer undergo FLT PET scan at baseline and 1-6 weeks after the start of standard endocrine treatment. The surgery follows 1-7 days after the second FLT PET scan.   Tracer used in the FLT PET (positron emission tomography) scanning procedure: [F18] fluorothymidine.   Positron Emission Tomography: Undergo FLT PET   Laboratory Biomarker Analysis: Correlative studies - Ki67 staining of the tumor tissue in the biopsy and surgical specimen.',
  'Secondary_premise': 'INTERVENTION 1:    Arm A   Patien