In [6]:
! pip install datasets transformers rouge-score nltk sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from transformers import AutoTokenizer, AutoModel
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
from datasets import load_metric
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
INPUT_PATH1 = "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Dataset/preprocessed_data/divided_dataset"
INPUT_PATH2 = "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Dataset/preprocessed_data/whole_dataset"
RESULT_PATH = "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Results"
MODEL_PATH = "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Models_pickled_file"
# INPUT_PATH1 = "/content/drive/MyDrive/NLP_Project/Dataset/preprocessed_data/divided_dataset"
# INPUT_PATH2 = "/content/drive/MyDrive/NLP_Project/Dataset/preprocessed_data/whole_dataset"
# RESULT_PATH = "/content/drive/MyDrive/NLP_Project/Results"
# MODEL_PATH = "/content/drive/MyDrive/NLP_Project/Models_pickled_file"

In [11]:
test = pd.read_csv(os.path.join(INPUT_PATH2,"Smiti_test.csv"))

In [12]:
test['Source'] = test['Heading'] + test['Article']
test.drop(columns=['Article','Heading'],inplace=True)
test.head()

Unnamed: 0,id,Source
0,1440,gotabaya rajapaksa sworn in as sri lankan pres...
1,1441,trump calls trudeau ‘two-faced’ as palace goss...
2,1442,donald trump appoints indian-american as us am...
3,1443,"vietnam: typhoon, landslides leave 35 dead, 59..."
4,1444,donald trump says his daughter ivanka would be...


In [13]:
dataset = ds.dataset(pa.Table.from_pandas(test).to_batches())

### convert to Huggingface dataset
test_dataset = Dataset(pa.Table.from_pandas(test))

In [14]:
test_dataset

Dataset({
    features: ['id', 'Source'],
    num_rows: 1073
})

In [15]:
model_checkpoint = "mrm8488/t5-base-finetuned-summarize-news"
max_input_length = 1520
max_target_length = 56

In [16]:
checkpoint = "t5-base-finetuned-summarize-news"

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [19]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["Source"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask,max_length=max_target_length)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

In [20]:
model.from_pretrained(os.path.join(MODEL_PATH,checkpoint))

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [21]:
#try:
 # os.remove(os.path.join(RESULT_PATH,model_checkpoint+".csv"))
#except:
#  pass

In [22]:
df = pd.DataFrame(columns=["Summary","id"])
#df.to_csv(os.path.join(RESULT_PATH,checkpoint+".csv"),index=False)
df.to_csv("/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Results/Smiti_result.csv",index=False)

In [24]:
for i in range(0,len(test_dataset),10):
   summaries_after_tuning = generate_summary(test_dataset[i:i+10], model)[1]
   df = pd.DataFrame(zip(summaries_after_tuning,test_dataset[i:i+10]['id']),columns=["Summary","id"])
   # append data frame to CSV file
   df.to_csv("/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Results/Smiti_result.csv", mode='a', index=False, header=False)

KeyboardInterrupt: ignored