In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/till-20000/till_20000.csv
/kaggle/input/till-39000/till_39000.csv
/kaggle/input/model-till-30000/config.json
/kaggle/input/model-till-30000/merges.txt
/kaggle/input/model-till-30000/vocab.json
/kaggle/input/model-till-30000/tokenizer_config.json
/kaggle/input/model-till-30000/model.safetensors
/kaggle/input/model-till-30000/special_tokens_map.json
/kaggle/input/model-till-30000/added_tokens.json
/kaggle/input/model-till-30000/generation_config.json
/kaggle/input/till-32000/till_32000.csv


### Importing Dataset

In [3]:
df = pd.read_csv("/kaggle/input/till-32000/till_32000.csv")

In [4]:
df1 = pd.read_csv("/kaggle/input/till-20000/till_20000.csv")

In [5]:
df.drop("Unnamed: 0", axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,title,article,summary
0,"Yes, Trump really is fighting with a lawyer fo...",The president of the United States is in a Twi...,The president of the United States is in a Twi...
1,"In Passing for Human, Liana Finck Illustrates ...",The New Yorker cartoonist calls her new graphi...,New Yorker cartoonist Liana Finck's new graphi...
2,Dick's Sporting Goods won't sell assault-style...,In the absence of any meaningful gun regulatio...,In the absence of any meaningful gun regulatio...
3,A Utopian Suburb Where Residents Underwrite an...,"Serenbe, a bedroom community of Atlanta with f...",Brandon Hinman is the first paid staff member ...
4,Watch a Massive Tarantula Drag an Opossum Arou...,Tarantulas are often cast as creepy crawlers ...,Tarantulas are often cast as creepy crawlers ...


In [7]:
df['summary'] = df['summary'].str.replace(r'\s{2,}', ' ', regex=True).str.strip()

### Loading the Model

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "/kaggle/input/model-till-30000"

# Load the model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# If GPU is available, move the model to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50258, bias=False)
)

### DataSet Preparation

In [9]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset, load_dataset

In [214]:
df["gpt2_summary"] = df1["summary"] + '\nTL;DR:' + df["title"]

In [215]:
df1["gpt2_summary"] = df["summary"] + '\nTL;DR:' + df1["title"]

In [216]:
new_df = df[['gpt2_summary']]
eval_df = df[['gpt2_summary']].iloc[1800:2000]

In [217]:
new_df = Dataset.from_pandas(new_df)
eval_df = Dataset.from_pandas(eval_df)

In [218]:
new_df

Dataset({
    features: ['gpt2_summary'],
    num_rows: 2000
})

In [219]:
eval_df

Dataset({
    features: ['gpt2_summary'],
    num_rows: 200
})

### Encoding the Inputs

In [220]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['gpt2_summary'] , max_length = 1024, padding = True, truncation = True )
    
    #with tokenizer.as_target_tokenizer():
        #target_encodings = tokenizer(example_batch['title'], max_length = 32, padding = True, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': input_encodings['input_ids']
    }
    
new_df_pt = new_df.map(convert_examples_to_features, batched = True)
eval_df_pt = eval_df.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [221]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [222]:
from transformers import TrainingArguments, Trainer
from accelerate import Accelerator, DataLoaderConfiguration

# Create DataLoaderConfiguration object
dataloader_config = DataLoaderConfiguration(
    dispatch_batches=None,  # Change None to your desired value
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True
)

# Create Accelerator object with DataLoaderConfiguration
accelerator = Accelerator(dataloader_config=dataloader_config)


# Define training arguments with gradient accumulation, dropout, and validation
trainer_args = TrainingArguments(
    output_dir='news_headline',
    num_train_epochs=10,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=15,
    gradient_accumulation_steps=16,
    learning_rate=3e-5,  # Specify the learning rate here
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    overwrite_output_dir=True,
    dataloader_num_workers=accelerator.num_processes
)


In [223]:
# Create Trainer object
trainer = Trainer(
    model=model,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=new_df_pt,
    eval_dataset=eval_df_pt
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train() 

### Computing Rouge Score

In [13]:
import pandas as pd
from tqdm import tqdm
from rouge import Rouge
rouge = Rouge()

def calculate_rouge_scores(df, model, tokenizer):
    rouge_scores = []

    for index, row in tqdm(df.iterrows(), total=len(df), desc="Calculating ROUGE scores"):
        # Define the reference summary
        reference_summary = row["title"]

        output = pipe(row["article"], **generation_args)
        # Define the input text for GPT-2
        input_text = output[0]['generated_text'] + '\nTL;DR:'

        # Tokenize input text
        input_ids = tokenizer(input_text, max_length = 900, truncation=True, return_tensors="pt").to("cuda")
        # Generate predictions
        output = model.generate(input_ids=input_ids["input_ids"],
                                attention_mask=input_ids["attention_mask"],
                                length_penalty=0.8,
                                min_new_tokens=7,
                                max_new_tokens=24,
                                num_beams=8,
                                no_repeat_ngram_size=2,
                                early_stopping=True)

        # Decode the generated sequence
        output_text = tokenizer.decode(output[0], skip_special_tokens=True)
        tldr_index = output_text.find("TL;DR:")
        if tldr_index != -1:
            # Extract the text after "TL;DR:"
            output_text = output_text[tldr_index + len("TL;DR:"):]

        # Remove commas from the output text
        output_text = output_text.replace(",", "")

        generated_text = output_text.strip()

        # Calculate ROUGE for the generated and reference summaries
        scores = rouge.get_scores(generated_text, reference_summary)
        rouge_scores.append(scores)

    return rouge_scores

# Example usage:
sampled_df = df1.sample(n=40, random_state=42)
rouge_scores = calculate_rouge_scores(sampled_df, model, tokenizer)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calculating ROUGE scores:   2%|▎         | 1/40 [01:03<40:57, 63.02s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calculating ROUGE scores:   5%|▌         | 2/40 [01:28<25:49, 40.78s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calculating ROUGE scores:   8%|▊         | 3/40 [02:26<30:10, 48.93s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calculating ROUGE scores:  10%|█         | 4/40 [03:25<31:40, 52.78s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calculating ROUGE scores:  12%|█▎        | 5/40 [04:27<32:44, 56.13s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calculating ROUGE scores:  15%|█▌        | 6/40 [05:26<32:16, 56.95s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calculating ROUGE scores:  18%|█▊        | 7/40 [05:27<21:23, 38

In [213]:
# Compute average scores
rouge1_avg = sum(score[0]['rouge-1']['f'] for score in new_scores) / len(new_scores)
rouge2_avg = sum(score[0]['rouge-2']['f'] for score in new_scores) / len(new_scores)
rougeL_avg = sum(score[0]['rouge-l']['f'] for score in new_scores) / len(new_scores)

# Create a dictionary of average scores
avg_scores = {
    'rouge1_avg': rouge1_avg,
    'rouge2_avg': rouge2_avg,
    'rougeL_avg': rougeL_avg
}

# Display average scores in a table
df_avg_scores = pd.DataFrame.from_dict(avg_scores, orient='index', columns=['Average Score'])
print(df_avg_scores)

            Average Score
rouge1_avg       0.333333
rouge2_avg       0.142857
rougeL_avg       0.200000


### Evaluating Rouge Score Current News Article

In [193]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from rouge import Rouge


# Define the generated summary and the reference summary
reference_summary = '''AI a 'fundamental change in the news ecosystem'''

# Initialize the ROUGE object
rouge = Rouge()

# Define the input text for GPT-2
input_text = '''Artificial intelligence is shaking up journalism and in the short term will cause "a fundamental change in the news ecosystem", media expert David Caswell told AFP.
A former employee at Yahoo! and BBC News Labs, the British broadcaster's innovation wing, Caswell spoke as industry leaders gathered in the Italian city of Perugia to discuss the biggest questions facing their trade.
"We don't know. But what we are trying to do is to understand all of the possibilities or as many of the possibilities as we can. But I think there are some things that are becoming clearer: one is the fact that more media will probably be created and originated and sourced by machines. So machines will do more gathering in a lot of journalism, will do more of the producing, the audio, the video and the text, and will create the kind of experiences of consumption that consumers have.
That is a very fundamental change in the information ecosystem in general, and the news ecosystem in particular. This is structurally different than the one that we're in now. We don't know how long it's going to take - it may be two, four, seven years. I think it's going to be faster because there is very little friction.
People don't need news devices, new hardware, they don't need a lot of money as producers, they don't need technical expertise. All those things that were barriers in the previous generation of AI are no longer barriers, thanks to generative AI".
"One class of development is in new tools that enables AI workflow, for example JP Politikens in Denmark focused on making their existing products and activities more efficient. But it is also a basis for transitioning their products, their workforce, the activities into this new AI world.
There is a tool that Google has built -- the code name is 'Genesis' -- that they are testing with publishers. Some publishers are building their own. There will be platform versions of these tools.
These are tools, you bring your news gathering on the left side: your PDF, transcripts, audios, videos.. roughly. It helps you do things like analysis, summaries, turn into scripts, audios. They're orchestrated by the tool.
What the journalist is doing is coordinating the tool, verifying the content all the way through to the end, and editing. The job becomes using the tool, like an editorial manager of this AI tool.
It technically works. But that's a different thing than putting it in a newsroom in a large operation and use it day in day out, months in, months out. That's a big question: is it going to be enthusiastically adopted, to be used in a way that isn't very productive in the long run or will that enhance the productivity of newsroom dramatically?"
"In the last decade it was very expensive. It was very difficult: You need the data, you had to build a data warehouse, have an enterprise deal with Amazon or Google cloud, you had to hire data scientists, to have a team of data engineers. it was a major investment. Only the BBC, the New York Times, this level of organisations could really afford it.
That's not true with generative AI. You can run news workflow through interfaces that you pay 20 dollars a month. You don't need to be a coder. All you need is motivation, enthusiasm and curiosity.
There's lots of people in news organisations that would not have been involved in AI in the past because they did not have the technical background and now they can just use it. It's a much more open form of AI: both smaller newsrooms can do a lot with, and more junior individuals in more established newsrooms can do a lot with. I think it's a good thing, but it's also a disruptive thing. Often the internal politics in newsrooms are disrupted by that".
"AI has been around since the 1950s. But AI for practical purposes appeared with ChatGPT. It's going to be quite a while -- years -- before we really understand how to use them for valuable things. There are so many things that you can do with them.''' + '\nTL;DR:'

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

# Generate predictions
output = model.generate(input_ids=input_ids,
                        length_penalty=0.9,
                        min_new_tokens=7,
                        max_new_tokens=24,
                        num_beams=8,
                        #no_repeat_ngram_size=2,
                        early_stopping=True,
                        do_sample=True)

# Decode the generated sequence
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
tldr_index = output_text.find("TL;DR:")
if tldr_index != -1:
    # Extract the text after "TL;DR:"
    output_text = output_text[tldr_index + len("TL;DR:"):]

#Remove commas from the output text
output_text = output_text.replace(",", "")

generated_text = output_text.strip()

# Calculate ROUGE for the generated and reference summaries
scores = rouge.get_scores(generated_text, reference_summary)
# Print the results
print(scores)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'rouge-1': {'r': 0.5, 'p': 0.2222222222222222, 'f': 0.3076923034319527}, 'rouge-2': {'r': 0.42857142857142855, 'p': 0.16666666666666666, 'f': 0.23999999596800003}, 'rouge-l': {'r': 0.5, 'p': 0.2222222222222222, 'f': 0.3076923034319527}}]


In [194]:
generated_text

'Artificial intelligence is shaking up journalism and in the short term will cause "a fundamental change in the news ecosystem"'

### Saving Model

In [42]:
from IPython.display import FileLink

file_path = "news_headline/checkpoint-1000/model.safetensors"  # Specify the full path to the file
link = FileLink(file_path)
link
