**Installations**

In [1]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Importing Libraries**

In [2]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, losses, models, util
import nltk
import os

In [3]:
# Fetch NLTK's 'punkt' data package; presumably this is the tokenizer data that
# nltk.sent_tokenize (used for sentence splitting below) relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Dataset Loading**

In [4]:
# Colab-only: attach Google Drive at /content/drive so that the dataset paths
# configured below (which live under /content/drive) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Directory the preprocessed train.csv / test.csv are read from.
INPUT_PATH = "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Dataset/preprocessed_data/whole_dataset"
# Directory the DMD-filtered train.csv / test.csv are written to.
DATASET_PATH = "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Dataset/DMD"

In [6]:
# Load the preprocessed train and test splits from INPUT_PATH.
train_csv_path = os.path.join(INPUT_PATH, "train.csv")
test_csv_path = os.path.join(INPUT_PATH, "test.csv")
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [7]:
# Merge headline and body into a single 'Source' text column and drop the two
# originals. The guard makes re-running this cell a no-op once the columns are
# already merged (previously a KeyError), and the frame is rebuilt instead of
# being mutated with inplace=True.
# NOTE(review): the two strings are concatenated with no separator, so the last
# word of the heading runs straight into the first word of the article unless
# the data already ends headings with whitespace/punctuation — confirm.
if {'Heading','Article'}.issubset(train.columns):
    train = (
        train
        .assign(Source=train['Heading'] + train['Article'])
        .drop(columns=['Article','Heading'])
    )
train.head()

Unnamed: 0,Summary,id,Source
0,the name of all member countries except india ...,0,india opposes china's belt and road initiative...
1,"pakistan termed the indian action as ""unilater...",1,"un urges for maximum restraint, invokes simla ..."
2,"""the agreement will be finalised between khybe...",2,"china, pak to finalise deal to develop sez und..."
3,the top health research institute said that an...,3,"covaxin effectively neutralises both alpha, de..."
4,the decision to shelve detailed advice from th...,4,"top white house officials buried cdc report, r..."


In [8]:
# Merge headline and body into a single 'Source' text column and drop the two
# originals. The guard makes re-running this cell a no-op once the columns are
# already merged (previously a KeyError), and the frame is rebuilt instead of
# being mutated with inplace=True.
# NOTE(review): the two strings are concatenated with no separator, so the last
# word of the heading runs straight into the first word of the article unless
# the data already ends headings with whitespace/punctuation — confirm.
if {'Heading','Article'}.issubset(test.columns):
    test = (
        test
        .assign(Source=test['Heading'] + test['Article'])
        .drop(columns=['Article','Heading'])
    )
test.head()

Unnamed: 0,id,Source
0,0,explainer: how worrying is the variant first s...
1,1,pakistan parliament to elect new prime ministe...
2,2,indian-origin pathologist accused of botching ...
3,3,china begins world's biggest census drive to c...
4,4,"indonesia prison fire kills 41 drug inmates, i..."


**Sentence Embeddings**

In [9]:
# Pretrained sentence-embedding model used by encode_sentence for every sentence.
# NOTE(review): embed_dimension is set to 768 further down and has to equal this
# model's output width — confirm if the checkpoint is ever changed.
model = SentenceTransformer("all-mpnet-base-v2")

In [10]:
def encode_sentence(sent):
    """Return the embedding that the global ``model`` produces for ``sent``."""
    sentence_vector = model.encode(sent)
    return sentence_vector

In [11]:
def document_embedding(doc,embed_dimension):
    """Embed a document as one row per sentence.

    The text is split with ``nltk.sent_tokenize`` and all sentences are handed
    to ``encode_sentence`` in a single call, so the underlying encoder can
    batch them instead of being invoked once per sentence.
    NOTE(review): batched encoding is expected to match per-sentence encoding
    up to floating-point noise — confirm if bit-exact reproducibility matters.

    Args:
        doc: Document text to split into sentences and embed.
        embed_dimension: Expected width of a single sentence embedding.

    Returns:
        A float64 array of shape ``(nsent, embed_dimension)`` where ``nsent``
        is the number of sentences found; ``(0, embed_dimension)`` when the
        document yields no sentences.

    Raises:
        ValueError: If the encoder output does not have shape
            ``(nsent, embed_dimension)``.
    """
    sentences = nltk.sent_tokenize(doc)
    if not sentences:
        # Nothing to encode; still return a well-formed 2-D matrix.
        return np.zeros(shape=(0,embed_dimension))
    # float64 keeps the dtype of the np.zeros buffer this function used to fill.
    doc_embedding = np.asarray(encode_sentence(sentences),dtype=np.float64)
    expected_shape = (len(sentences),embed_dimension)
    if doc_embedding.shape != expected_shape:
        raise ValueError(
            f"expected sentence embeddings of shape {expected_shape}, "
            f"got {doc_embedding.shape}"
        )
    return doc_embedding

In [12]:
def list_of_document_embeddings(data,embed_dimension):
    """Build the sentence-embedding matrix of every document in ``data``.

    Documents are visited in row order by iterating the 'Source' column's
    values directly, so the result does not depend on the frame's index labels
    (label lookups with 0..n-1 fail or misalign on a filtered or shuffled
    frame).

    Args:
        data: DataFrame with a 'Source' text column.
        embed_dimension: Expected width of a single sentence embedding,
            forwarded to ``document_embedding``.

    Returns:
        A list with one ``document_embedding`` result per row of ``data``.
    """
    return [document_embedding(doc,embed_dimension) for doc in data['Source']]

In [13]:
# Width of one sentence embedding; passed to the embedding helpers and to DMD.
embed_dimension = 768

In [None]:
# Build the per-document sentence-embedding matrices for both splits.
# NOTE(review): every sentence of every document goes through the encoder here,
# so this is presumably the most expensive cell — consider caching the result.
train_embed = list_of_document_embeddings(train,embed_dimension)
test_embed = list_of_document_embeddings(test,embed_dimension)

**DMD Function**

In [None]:
def DMD(data, r):
    """Dynamic Mode Decomposition (DMD): eigenvalues of the reduced operator.

    Columns of ``data`` are treated as consecutive snapshots: ``X1`` holds all
    but the last column, ``X2`` all but the first, and the linear operator that
    maps ``X1`` onto ``X2`` is projected onto the leading ``r`` left singular
    vectors of ``X1``.

    Singular values that are numerically zero are not inverted (their inverse
    is taken as 0, as in a pseudo-inverse). Inverting them directly yields
    inf/NaN entries that make ``np.linalg.eig`` raise, or amplifies rounding
    noise, whenever ``X1`` is rank deficient.

    NOTE(review): callers in this notebook pass a (sentences, embedding_dim)
    matrix, so the "snapshots" are embedding coordinates rather than sentences
    — confirm this orientation is intended.

    Args:
        data: 2-D array (or nested sequence) of snapshots stored column-wise.
        r: Number of leading singular triplets to keep. Values larger than the
            number of available singular values keep all of them.

    Returns:
        1-D array holding the eigenvalues of the reduced operator, one per
        kept singular value, in the order produced by ``np.linalg.eig``.
    """
    data = np.asarray(data)
    ## Build data matrices (each column of X2 is the successor of the same column of X1)
    X1 = data[:, : -1]
    X2 = data[:, 1 :]
    ## Perform singular value decomposition on X1 (the third factor is V^H)
    u, s, vh = np.linalg.svd(X1, full_matrices = False)
    s_r = s[: r]
    ## Invert the kept singular values, mapping numerically-zero ones to 0
    ## (same style of cutoff as a pseudo-inverse / numerical-rank test)
    largest = s[0] if s.size else 0.0
    tol = largest * max(X1.shape) * np.finfo(s.dtype).eps
    inv_s = np.zeros_like(s_r)
    usable = s_r > tol
    inv_s[usable] = 1.0 / s_r[usable]
    ## Compute the reduced Koopman matrix: U_r^H X2 V_r diag(1/s_r)
    ## (the trailing elementwise product scales the columns, i.e. applies diag(1/s_r))
    A_tilde = u[:, : r].conj().T @ X2 @ vh[: r, :].conj().T * inv_s
    ## Perform eigenvalue decomposition on A_tilde.
    ## Only the eigenvalues are used by this notebook; the DMD modes would be
    ## X2 @ V_r @ diag(1/s_r) @ eigenvectors. np.linalg.eig is kept (rather than
    ## eigvals) so the ordering of the returned eigenvalues is unchanged.
    eigenvalues, _ = np.linalg.eig(A_tilde)

    return eigenvalues

**Removal of non-important sentences**

In [None]:
# One DMD eigenvalue vector per training document, in the same order as train_embed.
train_DMD = [DMD(doc_matrix,embed_dimension) for doc_matrix in train_embed]

In [None]:
# One DMD eigenvalue vector per test document, in the same order as test_embed.
test_DMD = [DMD(doc_matrix,embed_dimension) for doc_matrix in test_embed]

In [None]:
def remove_nonrelevant_sent(data,DMD,index,percent_kept):
    """Keep the highest-scoring sentences of one document, in original order.

    Sentence ``i`` of the document is scored by the real part of the ``i``-th
    entry of ``DMD[index]``. The ``ceil(percent_kept * n_sentences)`` largest
    scores are kept and the surviving sentences are re-joined with single
    spaces.

    Args:
        data: DataFrame with a 'Source' text column.
        DMD: Per-document score sequences aligned by position with the rows of
            ``data``. (Inside this body the name shadows the module-level
            ``DMD`` function.)
        index: Position of the document, used for both ``data`` and ``DMD``.
        percent_kept: Fraction of the document's sentences to keep.

    Returns:
        The retained sentences joined by a single space; an empty string when
        no sentence is to be kept.
    """
    # Positional lookup, consistent with the positional DMD[index] below.
    sentences = nltk.sent_tokenize(data['Source'].iloc[index])
    num_sent_to_keep = int(np.ceil(percent_kept * len(sentences)))
    if num_sent_to_keep <= 0:
        # Guard: ranked[-0:] would select *every* index and keep the whole
        # document instead of nothing.
        return ""
    scores = np.real(DMD[index])
    # Ascending, stable ranking of score positions; the tail holds the best ones.
    ranked = sorted(range(len(scores)), key = lambda pos: scores[pos])
    kept = set(ranked[-num_sent_to_keep:])
    # Membership test against a set instead of re-sorting the list per sentence.
    return " ".join(sent for pos, sent in enumerate(sentences) if pos in kept)

In [None]:
def create_dataset(data,DMD,percent_kept):
    """Return a copy of ``data`` whose 'Source' texts are sentence-filtered.

    Each row's 'Source' is replaced by the output of
    ``remove_nonrelevant_sent`` for that row. The new column is built as a
    list and assigned in one step; the previous element-wise chained assignment
    (``dataset['Source'][i] = ...``) triggers SettingWithCopyWarning and does
    not modify the frame at all when pandas Copy-on-Write is active.

    Args:
        data: DataFrame with a 'Source' text column; it is not modified.
        DMD: Per-document score sequences aligned by position with ``data``.
        percent_kept: Fraction of each document's sentences to keep.

    Returns:
        A deep copy of ``data`` with the filtered 'Source' column.
    """
    dataset = data.copy(deep=True)
    dataset['Source'] = [
        remove_nonrelevant_sent(data,DMD,i,percent_kept)
        for i in range(len(data))
    ]
    return dataset

**Final corpus for train and test**

In [None]:
# Fraction of each document's sentences kept by the DMD-based filtering.
percent_kept = 0.8

In [None]:
# Apply the sentence filtering to both splits, each with its own DMD scores.
final_train = create_dataset(train,train_DMD,percent_kept)
final_test = create_dataset(test,test_DMD,percent_kept)

**Saving the datasets**

In [None]:
# Create the output folder if needed; to_csv fails when the target directory
# does not exist (e.g. on a fresh Drive).
os.makedirs(DATASET_PATH,exist_ok=True)
# Persist the filtered splits without the DataFrame index.
final_train.to_csv(os.path.join(DATASET_PATH,"train.csv"),index=False)
final_test.to_csv(os.path.join(DATASET_PATH,"test.csv"),index=False)

**Training on pretrained model**

In [None]:
# Model training is done in a separate file; this notebook only prepares and saves the filtered datasets.