In [None]:
!pip install requests feedparser bs4 nltk transformers torch datasets --quiet


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


# Fetch articles

In [None]:
import requests
import feedparser

# Function to fetch articles from NewsAPI
def fetch_newsapi_articles(api_key, query, page_size=20, timeout=10):
    """Fetch articles matching ``query`` from the NewsAPI ``/v2/everything`` endpoint.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.
        query: Search expression, sent as the ``q`` query parameter.
        page_size: Number of articles requested, sent as ``pageSize``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The value stored under ``articles`` in the JSON response, or an empty
        list when that key is missing or null.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (for example an invalid API key), instead of silently returning
            an empty list.
    """
    url = 'https://newsapi.org/v2/everything'
    params = {
        'q': query,
        'pageSize': page_size,
        'apiKey': api_key
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Surface HTTP errors explicitly rather than treating the error payload
    # as "no articles".
    response.raise_for_status()
    data = response.json()
    # `or []` also covers a payload where "articles" is present but null.
    return data.get('articles') or []

# Function to fetch articles from an RSS feed
def fetch_rss_feed(feed_url):
    """Parse an RSS/Atom feed and return its entries as plain dictionaries.

    Args:
        feed_url: Location of the feed, passed to ``feedparser.parse``.

    Returns:
        A list with one dict per feed entry, each holding the keys
        ``title``, ``link``, ``published`` and ``summary``. A field the
        entry does not provide is returned as an empty string rather than
        raising ``AttributeError``.
    """
    feed = feedparser.parse(feed_url)
    fields = ('title', 'link', 'published', 'summary')
    # Feed entries are dict-like; .get() tolerates feeds that omit optional
    # elements such as the publication date or summary.
    return [
        {field: entry.get(field, '') for field in fields}
        for entry in feed.entries
    ]

# Example usage
if __name__ == "__main__":
    import os

    # Fetch articles using NewsAPI. The key is read from the NEWSAPI_KEY
    # environment variable so a real key never has to be saved in the
    # notebook; the placeholder remains as the fallback value.
    api_key = os.environ.get('NEWSAPI_KEY', 'YOUR_NEWSAPI_KEY')
    query = 'technology'
    newsapi_articles = fetch_newsapi_articles(api_key, query)

    # Fetch articles from an RSS feed
    rss_url = 'http://feeds.bbci.co.uk/news/rss.xml'
    rss_articles = fetch_rss_feed(rss_url)

    # Combine articles from both sources into one list
    all_articles = newsapi_articles + rss_articles


# Scrape the full article text

In [None]:
import requests
from bs4 import BeautifulSoup

# Function to scrape the content of a news article
def scrape_article_content(url, body_class='article-body', timeout=10):
    """Download a web page and return the text of its article paragraphs.

    Args:
        url: Address of the article page.
        body_class: CSS class of the ``<div>`` that wraps the article text.
            Adjust it to the structure of the site being scraped.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The text of every ``<p>`` inside the matching ``<div>``, joined with
        newlines, or an empty string when no such ``<div>`` exists.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is not mistaken for an article without content.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    article_body = soup.find('div', class_=body_class)
    if article_body is None:
        return ''
    return '\n'.join(para.get_text() for para in article_body.find_all('p'))

# Example usage
if __name__ == "__main__":
    # Placeholder address: point this at a real article page to try it out.
    article_url = 'https://www.example.com/news/article'
    # Pull the paragraph text out of the page and show it.
    content = scrape_article_content(url=article_url)
    print(content)





# Preprocessing the text

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources used below (only fetched when not already present).
# Recent NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; requesting both keeps word_tokenize working on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Normalise raw article text into a space-separated string of keywords.

    Steps, in order: strip HTML tags, strip URLs, drop every character that
    is not an ASCII letter, digit or whitespace, lowercase, tokenize with
    ``word_tokenize`` and remove English stopwords.

    Args:
        text: Raw text, possibly containing HTML markup and URLs.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Replace tags with a space (not an empty string) so that words separated
    # only by markup, e.g. "<p>Hello</p><p>World</p>", are not glued together.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Rejoin tokens
    return ' '.join(tokens)

# Example usage
if __name__ == "__main__":
    # Sample sentence containing punctuation, stopwords and a URL.
    raw_text = "This is an example article! Visit https://example.com for more info."
    # Run it through the cleaning pipeline and show what is left.
    cleaned_text = preprocess_text(text=raw_text)
    print(cleaned_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


example article visit info


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Fine-tune the model

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, GenerationConfig
from transformers import Trainer, TrainingArguments
import torch
from datasets import load_dataset

# Load the CNN/DailyMail dataset
dataset = load_dataset('cnn_dailymail', '3.0.0', split='train')

# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Start from the checkpoint's own generation config and override only the
# decoding parameters listed here. Building a GenerationConfig from scratch
# would leave every setting not listed (such as the special-token ids that
# generate() relies on) at its empty default once assigned to the model.
generation_config = GenerationConfig.from_pretrained(
    'facebook/bart-large-cnn',
    max_length=142,
    min_length=56,
    early_stopping=True,
    num_beams=4,
    length_penalty=2.0,
    no_repeat_ngram_size=3,
    forced_bos_token_id=0,
    forced_eos_token_id=2
)

# Set the generation config for the model
model.generation_config = generation_config

# Preprocess the dataset
def preprocess_data(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with an ``article`` list (model inputs) and a
            ``highlights`` list (target summaries).

    Returns:
        The tokenized articles (truncated/padded to 1024 tokens) with an extra
        ``labels`` entry holding the tokenized highlights (truncated/padded to
        128 tokens). Padding positions in ``labels`` are set to -100 so the
        loss ignores them.
    """
    inputs = list(examples['article'])
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize the targets; `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['highlights'],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # The labels are padded to a fixed length, so mask the padding with -100;
    # otherwise every pad position would count towards the training loss.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Tokenize every example of the loaded split, one batch of rows per call.
tokenized_dataset = dataset.map(function=preprocess_data, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # `eval_strategy` is the current name of this argument; the older
    # spelling `evaluation_strategy` is deprecated.
    eval_strategy='steps',
    eval_steps=500,
    save_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Mixed precision needs a CUDA device; turn it on only when one is present
    # so the cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available()
)

# Shuffle once and carve out two disjoint slices. Selecting range(100) for
# evaluation from the same seeded shuffle would reuse the first 100 training
# examples, so the validation loss would be measured on data the model was
# trained on.
shuffled_dataset = tokenized_dataset.shuffle(seed=42)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_dataset.select(range(1000)),  # Use a subset for example
    eval_dataset=shuffled_dataset.select(range(1000, 1100)),  # 100 held-out examples
)

# Start training
trainer.train()

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
500,0.8106,0.567065


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=500, training_loss=1.4713970031738282, metrics={'train_runtime': 282.4052, 'train_samples_per_second': 3.541, 'train_steps_per_second': 1.771, 'total_flos': 2167104602112000.0, 'train_loss': 1.4713970031738282, 'epoch': 1.0})