In [1]:
!pip install transformers datasets accelerate -q

In [2]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import pandas as pd
import torch
import os

In [3]:
from google.colab import files

uploaded = files.upload()
uploaded_filename = list(uploaded.keys())[0]
df = pd.read_csv(uploaded_filename)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())


Saving trainig_dataset.csv to trainig_dataset.csv
Dataset loaded successfully!
Dataset shape: (2225, 2)

First few rows:
                               title  \
0  Ad sales boost Time Warner profit   
1   Dollar gains on Greenspan speech   
2  Yukos unit buyer faces loan claim   
3  High fuel prices hit BA's profits   
4  Pernod takeover talk lifts Domecq   

                                           paragraph  
0  Quarterly profits at US media giant TimeWarner...  
1  The dollar has hit its highest level against t...  
2  The owners of embattled Russian oil giant Yuko...  
3  British Airways has blamed high fuel prices fo...  
4  Shares in UK drinks and food firm Allied Domec...  


In [4]:
df = df.dropna(subset=['title', 'paragraph'])
print(f"\nCleaned dataset: {len(df)} examples")

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Use a smaller subset for faster training (adjust as needed)
dataset = dataset.select(range(min(1000, len(dataset))))
print(f"Using subset: {len(dataset)} examples")


Cleaned dataset: 2225 examples
Using subset: 1000 examples


In [5]:
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"Model loaded on: {device}")

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Model loaded on: cpu


In [6]:
def preprocess(examples):
    """Preprocess the dataset for training"""
    # Tokenize paragraphs (inputs)
    inputs = tokenizer(
        examples["paragraph"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors=None  # Important: return lists, not tensors
    )

    # Tokenize titles (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["title"],
            max_length=64,
            truncation=True,
            padding="max_length",
            return_tensors=None
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

# Apply preprocessing
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names  # Remove original columns
)
print("Dataset tokenized successfully!")


Tokenizing dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Dataset tokenized successfully!


In [7]:
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted!")

Mounted at /content/drive
Google Drive mounted!


In [9]:
output_dir = "./results"
drive_model_path = '/content/drive/MyDrive/ai_paragraph_titler'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,  # Reduced from 3 for faster training
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    gradient_accumulation_steps=2,  # Effective batch size = 4 * 2 = 8
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="no",
    fp16=torch.cuda.is_available(),  # Enable mixed precision on GPU
    dataloader_num_workers=2,  # Speed up data loading
    report_to=[],
    remove_unused_columns=True,
)
print("Training arguments configured!")

Training arguments configured!


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print("Starting training...")
print("=" * 60)
try:
    train_results = trainer.train()
    print("\nTraining completed successfully!")

    # Save metrics
    trainer.save_metrics("train", train_results.metrics)
    print(f"Training metrics saved!")

except KeyboardInterrupt:
    print("\n\nTraining interrupted! Saving current model state...")
except Exception as e:
    print(f"\n\nError during training: {e}")
    print("Attempting to save model anyway...")


Starting training...




Step,Training Loss
50,10.6916
100,3.1385
150,0.4807
200,0.3341
250,0.2864





Training completed successfully!
Training metrics saved!


In [11]:
try:
    os.makedirs(drive_model_path, exist_ok=True)

    # Save the model and tokenizer
    model.save_pretrained(drive_model_path)
    tokenizer.save_pretrained(drive_model_path)

    print(f"\n{'='*60}")
    print(f"Model saved successfully to: {drive_model_path}")
    print(f"{'='*60}")
except Exception as e:
    print(f"Error saving model: {e}")


Model saved successfully to: /content/drive/MyDrive/ai_paragraph_titler


In [12]:
print("\nLoading model for inference...")
titler = pipeline(
    "summarization",
    model=drive_model_path,
    tokenizer=drive_model_path,
    device=0 if torch.cuda.is_available() else -1
)
print("Model loaded successfully!")


Loading model for inference...


Device set to use cpu


Model loaded successfully!


In [13]:
test_texts = [
    "Artificial intelligence is transforming healthcare by enabling early disease detection and personalized treatment plans. Machine learning algorithms analyze medical data to identify patterns that humans might miss.",
    "Climate change poses significant challenges to global ecosystems. Rising temperatures affect biodiversity, weather patterns, and sea levels, requiring immediate action from governments worldwide.",
    "Online education has expanded access to learning opportunities. Students can now access courses from top universities regardless of their geographic location."
]

print("\n" + "="*80)
print("TESTING THE TRAINED MODEL")
print("="*80 + "\n")

for i, text in enumerate(test_texts, 1):
    result = titler(
        text,
        max_length=32,
        min_length=5,
        do_sample=False
    )
    print(f"Test {i}:")
    print(f"Input: {text[:100]}...")
    print(f"Generated Title: {result[0]['summary_text']}")
    print("-" * 80 + "\n")



TESTING THE TRAINED MODEL



Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 32, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Test 1:
Input: Artificial intelligence is transforming healthcare by enabling early disease detection and personali...
Generated Title: Artificial intelligence revolutionises healthcare
--------------------------------------------------------------------------------



Your max_length is set to 32, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Test 2:
Input: Climate change poses significant challenges to global ecosystems. Rising temperatures affect biodive...
Generated Title: Climate change threatens ecosystems
--------------------------------------------------------------------------------

Test 3:
Input: Online education has expanded access to learning opportunities. Students can now access courses from...
Generated Title: Online education expands access
--------------------------------------------------------------------------------



In [14]:
def generate_title(paragraph):
    """Generate a title for the given paragraph"""
    if not paragraph.strip():
        return "Please provide valid text."

    result = titler(
        paragraph,
        max_length=32,
        min_length=5,
        do_sample=False
    )
    return result[0]['summary_text']

print("\n" + "="*80)
print("INTERACTIVE TITLE GENERATOR")
print("="*80)
print("Enter a paragraph and get an AI-generated title (type 'quit' to exit)\n")

while True:
    user_input = input("\nEnter your paragraph: ")

    if user_input.lower() in ['quit', 'exit', 'q']:
        print("Goodbye!")
        break

    if user_input.strip():
        title = generate_title(user_input)
        print(f"\n✓ Generated Title: {title}")
    else:
        print("⚠ Please enter some text.")



INTERACTIVE TITLE GENERATOR
Enter a paragraph and get an AI-generated title (type 'quit' to exit)


Enter your paragraph: Education has also been revolutionized by artificial intelligence. Adaptive learning platforms now tailor educational content to each student’s needs and pace. Teachers can use AI tools to identify struggling learners and provide personalized support, making education more inclusive and efficient.


Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



✓ Generated Title: Artificial intelligence revolutionizes education

Enter your paragraph: Education has also been revolutionized by artificial intelligence. Adaptive learning platforms now tailor educational content to each student’s needs and pace. Teachers can use AI tools to identify struggling learners and provide personalized support, making education more inclusive and efficient.


Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



✓ Generated Title: Artificial intelligence revolutionizes education

Enter your paragraph: The day began not with a fanfare, but with a slow, gentle unveiling. A pale, lavender light first softened the hard edges of the night, gradually seeping into the sky until it bled into shades of rose and gold. The world, once a monochrome silhouette, slowly revealed its colors: the deep green of a distant pine, the weathered grey of a fence post, the rich brown of the turned earth. A single bird tentatively broke the silence, its chirp soon joined by others, weaving a complex tapestry of sound that spoke not of chaos, but of a quiet, purposeful awakening. In that hushed hour, before the world demanded its due, there existed a perfect, fleeting peace, a reminder that the most profound beginnings are often the most silent.


Both `max_new_tokens` (=256) and `max_length`(=32) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



✓ Generated Title: Day begins with silence


KeyboardInterrupt: Interrupted by user

In [15]:
print("\n" + "="*80)
print("TRAINING SUMMARY")
print("="*80)
print(f"• Model: BART-base")
print(f"• Training examples: {len(tokenized_dataset)}")
print(f"• Epochs: {training_args.num_train_epochs}")
print(f"• Batch size: {training_args.per_device_train_batch_size}")
print(f"• Device: {device.upper()}")
print(f"• Model saved to: {drive_model_path}")
print(f"• Status: ✓ Completed")
print("="*80)


TRAINING SUMMARY
• Model: BART-base
• Training examples: 1000
• Epochs: 2
• Batch size: 4
• Device: CPU
• Model saved to: /content/drive/MyDrive/ai_paragraph_titler
• Status: ✓ Completed
