### Collecting News Articles on Climate Language

In [3]:
import os
import time
import logging
import re
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from newspaper import Article
from bs4 import BeautifulSoup
import feedparser

# Setup logging.
# The prefix uses %(levelname)s rather than a literal "INFO" so that
# logging.warning(...) calls are not mislabelled as informational messages.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Directory to save articles (created at import time if it does not exist)
SAVE_DIR = "articles"
os.makedirs(SAVE_DIR, exist_ok=True)

# RSS feed of climate-related news (Google News search for "climate change Nepal")
RSS_FEED = "https://news.google.com/rss/search?q=climate+change+Nepal&hl=en-NE&gl=NP&ceid=NP:en"

# Setup headless Chrome WebDriver
def get_driver():
    """Return a headless Chrome WebDriver.

    The chromedriver binary path is obtained from
    ``ChromeDriverManager().install()``.
    """
    opts = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=opts)

# Clean extracted HTML content using BeautifulSoup
def clean_article_content(raw_html: str) -> str:
    """Reduce an HTML fragment to plain text made of its headings and paragraphs.

    Script/style/navigation elements, social-share links and
    download/attachment/related ``<div>`` sections are removed first. The
    text of every remaining ``h1``-``h4``, ``p``, ``li`` and ``blockquote``
    element is then collected and joined with blank lines.

    Args:
        raw_html: HTML markup of the article (a fragment or a whole page).

    Returns:
        The extracted text, or an empty string when no text block is found.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Remove unnecessary sections
    for tag in soup.find_all(['script', 'style', 'nav', 'footer', 'aside']):
        tag.decompose()

    # Social-share links. The word boundaries matter: with IGNORECASE a bare
    # "X" alternative would otherwise match any link text containing the
    # letter "x" (e.g. "experts", "tax") and delete legitimate links.
    share_link = re.compile(r'\b(?:Facebook|LinkedIn|X|Twitter|Share)\b', re.IGNORECASE)
    for a in soup.find_all('a', string=share_link):
        a.decompose()

    for div in soup.find_all('div', class_=re.compile(r'download|attachments|related', re.IGNORECASE)):
        div.decompose()

    # Collect meaningful blocks
    blocks = soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'li', 'blockquote'])

    # Normalise whitespace per block instead of get_text(strip=True):
    # strip=True strips each inline fragment and concatenates them with no
    # separator, which glues words together around <a>/<b>/<em> tags.
    paragraphs = []
    for block in blocks:
        text = ' '.join(block.get_text().split())
        if text:
            paragraphs.append(text)

    # Join text with paragraph spacing
    content = '\n\n'.join(paragraphs)
    return re.sub(r'\s{3,}', '\n\n', content).strip()

# Extract article content using Selenium and fallback to newspaper3k
def extract_article_content(driver, url):
    """Load *url* in the browser and return the article text.

    Strategy, in order:
      1. Try a list of XPath selectors for common article containers and
         take the ``innerHTML`` of the first one that is found.
      2. If that yields nothing, use the full page source.
      3. If cleaning the HTML yields no text, fall back to newspaper3k.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the article.

    Returns:
        The extracted text, or ``None`` when nothing could be extracted or
        any step raised an exception (the exception is logged as a warning).
    """
    # Imported locally so this function does not depend on a new
    # module-level import.
    from selenium.common.exceptions import WebDriverException

    try:
        driver.get(url)
        time.sleep(3)  # fixed pause after navigation before querying the DOM

        selectors = [
            '//article',
            '//div[contains(@class, "article-content")]',
            '//div[@id="content"]',
            '//section[contains(@class, "content")]',
            '//div[@class="post-content"]'
        ]

        raw_html = ""
        for selector in selectors:
            try:
                elem = driver.find_element(By.XPATH, selector)
                raw_html = elem.get_attribute('innerHTML')
            except WebDriverException:
                # Selector absent (or element went stale): try the next one.
                # A bare `except:` here would also swallow KeyboardInterrupt.
                continue
            logging.info(f"✅ Extracted using XPath: {selector}")
            break

        # If no specific selector worked, fallback to full page source
        if not raw_html:
            logging.info("⚠️  XPath selectors failed. Using full page source.")
            raw_html = driver.page_source

        # Clean content
        text = clean_article_content(raw_html)
        if text:
            return text

        # Final fallback: newspaper3k
        logging.info("⚠️  Falling back to newspaper3k for: " + url)
        article = Article(url)
        article.download()
        article.parse()
        return article.text.strip() if article.text else None

    except Exception as e:
        # Best-effort scraper: one bad page must not abort the whole run.
        logging.warning(f"❌ Exception extracting from {url}: {e}")
        return None

# Sanitize filename
def clean_filename(title):
    """Turn an article title into a ``.txt`` file name.

    Every character that is not alphanumeric (spaces included) becomes an
    underscore; the result is cut to 80 characters before ".txt" is appended.
    """
    safe_chars = []
    for ch in title:
        safe_chars.append(ch if ch.isalnum() else '_')
    stem = ''.join(safe_chars)
    return stem[:80] + ".txt"

def main():
    """Fetch the RSS feed, scrape each article and save it as a text file.

    At most the first 50 feed entries are processed. Each successfully
    extracted article is written to ``SAVE_DIR`` with a small header
    (title, publication date, source) followed by the article text. Links
    whose content could not be extracted are listed at the end.
    """
    logging.info("Fetching article metadata...")
    feed = feedparser.parse(RSS_FEED)
    entries = feed.entries[:50]  # Limit to top 50
    logging.info(f"Found {len(entries)} articles. Starting content extraction...")

    failed_articles = []
    driver = get_driver()

    # try/finally guarantees the browser process is shut down even if an
    # entry raises (e.g. a file-system error while saving).
    try:
        for entry in tqdm(entries):
            title = entry.title
            link = entry.link
            # Not every feed item is guaranteed to carry a publication date.
            published = entry.get('published', 'Unknown')
            source = entry.source.title if 'source' in entry else 'Unknown'

            logging.info(f"📄 Processing: {title}")

            content = extract_article_content(driver, link)
            if not content:
                logging.info(f"🚫 Could not extract content: {title}")
                failed_articles.append(link)
                continue

            filename = clean_filename(title)
            filepath = os.path.join(SAVE_DIR, filename)

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"Title: {title}\nPublished: {published}\nSource: {source}\n\n")
                f.write(content)

            logging.info(f"✅ Saved: {filename}")
    finally:
        driver.quit()

    if failed_articles:
        logging.info("\n🧾 Failed Articles:")
        for bad_url in failed_articles:
            logging.info(" - " + bad_url)
    else:
        logging.info("✅ All articles processed successfully.")

if __name__ == "__main__":
    main()

INFO: Fetching article metadata...
INFO: Found 50 articles. Starting content extraction...
INFO: Get LATEST chromedriver version for google-chrome
INFO: Get LATEST chromedriver version for google-chrome
INFO: There is no [win64] chromedriver "135.0.7049.114" for browser google-chrome "135.0.7049" in cache
INFO: Get LATEST chromedriver version for google-chrome
INFO: WebDriver version 135.0.7049.114 selected
INFO: Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.114/win32/chromedriver-win32.zip
INFO: About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.114/win32/chromedriver-win32.zip
INFO: Driver downloading response is 200
INFO: Get LATEST chromedriver version for google-chrome
INFO: Driver has been saved in cache [C:\Users\SHYAM PANDIT\.wdm\drivers\chromedriver\win64\135.0.7049.114]
  0%|          | 0/50 [00:00<?, ?it/s]INFO: 📄 Processing: Climate Change Amplified the Effects of Extreme Rainfall

### NLP Analysis

In [2]:
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 4.2 MB/s eta 0:00:03
     ---- ----------------------------------- 1.3/12.8 MB 4.2 MB/s eta 0:00:03
     ------- -------------------------------- 2.4/12.8 MB 4.5 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 4.6 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 4.5 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 4.7 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 4.6 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 4.5 MB/s eta 0:00:02
     ----------------------- -------------


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import pandas as pd
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import spacy
from gensim import corpora, models

# Download required resources: the tokenizer data, the stop-word list and
# the WordNet data used by word_tokenize, stopwords and WordNetLemmatizer below.
# NOTE(review): a later cell also downloads 'punkt_tab'; it may be needed
# here too depending on the installed NLTK version — confirm.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy model (imported as a package; `nlp` is used by get_named_entities)
import en_core_web_sm
nlp = en_core_web_sm.load()

# Cleaning function
def clean_text(text):
    """Normalise raw article text for topic modelling.

    Steps: drop HTML-like tags, replace every non-letter with a space,
    lower-case and tokenize, remove English stop words, lemmatize.

    Args:
        text: Raw article text.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = nltk.word_tokenize(text.lower())

    # Build the stop-word set once per call. Evaluating
    # stopwords.words('english') inside the filter repeated the lookup for
    # every token and tested membership against a list.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(t) for t in tokens if t not in stop_words)

# Load articles
def load_articles(folder_path='articles'):
    """Read every ``.txt`` file in *folder_path* into a DataFrame.

    Returns a DataFrame with the columns ``filename``, ``article`` (raw file
    content) and ``cleaned_article`` (the result of ``clean_text``).
    """
    records = []
    for name in os.listdir(folder_path):
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as fh:
            records.append((name, fh.read()))

    frame = pd.DataFrame({
        'filename': [name for name, _ in records],
        'article': [body for _, body in records],
    })
    frame['cleaned_article'] = frame['article'].apply(clean_text)
    return frame

# Sentiment analysis
def get_sentiment(text):
    """Return ``(label, polarity, subjectivity)`` for *text* using TextBlob.

    The label is 'Positive' for polarity above zero, 'Negative' for polarity
    below zero and 'Neutral' otherwise.
    """
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    if polarity > 0:
        label = 'Positive'
    elif polarity < 0:
        label = 'Negative'
    else:
        label = 'Neutral'
    return label, polarity, subjectivity

# Named Entity Recognition
def get_named_entities(text):
    """Return the entities spaCy finds in *text* as one "text (LABEL), ..." string."""
    formatted = []
    for entity in nlp(text).ents:
        formatted.append(f"{entity.text} ({entity.label_})")
    return ', '.join(formatted)

# Topic modeling (on all articles)
def get_topics(texts, num_topics=3):
    """Fit an LDA model on whitespace-tokenized *texts* and return its topics.

    The return value is whatever ``LdaModel.print_topics()`` produces for the
    fitted model.
    """
    token_lists = [doc.split() for doc in texts]
    vocabulary = corpora.Dictionary(token_lists)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in token_lists]
    lda = models.LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=vocabulary,
        passes=15,
        random_state=42,
    )
    return lda.print_topics()

# Process data: load the saved articles, then add sentiment and entity columns.
# Sentiment and NER run on the raw 'article' text; only topic modelling uses
# the cleaned text.
df = load_articles('../articles')
# get_sentiment returns a 3-tuple; wrapping it in a Series lets apply() spread
# it over the three target columns.
df[['sentiment', 'polarity', 'subjectivity']] = df['article'].apply(
    lambda x: pd.Series(get_sentiment(x))
)
df['named_entities'] = df['article'].apply(get_named_entities)

# Run topic modeling on all articles (one model fitted on the whole corpus)
topics = get_topics(df['cleaned_article'].tolist())
# Each item of `topics` is an (index, description) pair; indices are shown 1-based.
topic_summary = '; '.join([f"Topic {idx +1}: {topic}" for idx, topic in topics])

# Save topic summary as same for all rows (you can improve to assign specific topic per article later)
df['topics'] = topic_summary

# Save results to CSV (written into the same folder the articles were read from)
df.to_csv('../articles/article_analysis.csv', index=False)

print("✅ NLP analysis complete! Results saved to article_analysis.csv")



[nltk_data] Downloading package punkt to C:\Users\SHYAM
[nltk_data]     PANDIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SHYAM
[nltk_data]     PANDIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SHYAM
[nltk_data]     PANDIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ NLP analysis complete! Results saved to article_analysis.csv


In [15]:
import nltk
import shutil
import os

# Get nltk data base path.
# NOTE(review): this is the per-user Windows location (~\AppData\Roaming\nltk_data);
# on other platforms the folder will not exist and nothing is deleted.
nltk_data_base = os.path.join(os.path.expanduser('~'), 'AppData', 'Roaming', 'nltk_data')

# Delete punkt and punkt_tab folders if they exist, so the downloads below
# start from a clean state (rmtree removes the folder and everything in it).
for subfolder in ['tokenizers/punkt', 'tokenizers/punkt_tab']:
    folder_path = os.path.join(nltk_data_base, subfolder)
    if os.path.exists(folder_path):
        print(f"Found existing {subfolder} folder at {folder_path}, deleting it...")
        shutil.rmtree(folder_path)
        print(f"Deleted old {subfolder} folder.")

# Download punkt and punkt_tab cleanly
print("Downloading fresh punkt...")
nltk.download('punkt')
print("Downloading fresh punkt_tab...")
nltk.download('punkt_tab')

# Test word_tokenize
from nltk.tokenize import word_tokenize

# Smoke test: report success or the error instead of raising, so the cell
# always finishes with a readable verdict.
try:
    tokens = word_tokenize("This is a test sentence.")
    print("word_tokenize works! ✅ Output:", tokens)
except Exception as e:
    print("❌ word_tokenize failed with error:", e)


Downloading fresh punkt...


[nltk_data] Downloading package punkt to C:\Users\SHYAM
[nltk_data]     PANDIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Downloading fresh punkt_tab...


[nltk_data] Downloading package punkt_tab to C:\Users\SHYAM
[nltk_data]     PANDIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


word_tokenize works! ✅ Output: ['This', 'is', 'a', 'test', 'sentence', '.']


In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

def initialize_sentiment_analyzer():
    """Create and return a new VADER ``SentimentIntensityAnalyzer``."""
    return SentimentIntensityAnalyzer()

def analyze_sentiment(text, analyzer):
    """Return ``analyzer.polarity_scores(text)``, or ``None`` when *text* is not a string."""
    return analyzer.polarity_scores(text) if isinstance(text, str) else None

def assign_sentiment_label(score):
    """Map a polarity-score dict to a label via its 'compound' value.

    ``None`` gives "Unknown"; compound >= 0.05 gives "Positive";
    compound <= -0.05 gives "Negative"; anything else gives "Neutral".
    """
    if score is None:
        return "Unknown"

    compound_value = score['compound']
    if compound_value >= 0.05:
        return "Positive"
    if compound_value <= -0.05:
        return "Negative"
    return "Neutral"

def apply_sentiment_analysis(input_csv="../news/climate_news_articles.csv", output_csv="../news/climate_news_with_sentiment.csv"):
    """Score the 'description' column of *input_csv* with VADER and write *output_csv*.

    Adds two columns: 'sentiment_score' (the polarity-score result per row)
    and 'sentiment_label' (the label derived from that score).
    """
    frame = pd.read_csv(input_csv)
    vader = initialize_sentiment_analyzer()

    def score_row(description):
        return analyze_sentiment(description, vader)

    # Score first, then label from the stored scores
    frame['sentiment_score'] = frame['description'].apply(score_row)
    frame['sentiment_label'] = frame['sentiment_score'].apply(assign_sentiment_label)

    # Persist the enriched table
    frame.to_csv(output_csv, index=False)
    print(f"Sentiment analysis completed and saved to {output_csv}")

# 🛠 Example Usage:
if __name__ == "__main__":
    apply_sentiment_analysis()


Sentiment analysis completed and saved to ../news/climate_news_with_sentiment.csv


### NER: Extracting Locations and Names

In [5]:

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 1.7 MB/s eta 0:00:07
     ----- ---------------------------------- 1.8/12.8 MB 2.5 MB/s eta 0:00:05
     --------- ------------------------------ 3.1/12.8 MB 3.3 MB/s eta 0:00:03
     ------------- -------------------------- 4.2/12.8 MB 3.8 MB/s eta 0:00:03
     ----------------- ---------------------- 5.5/12.8 MB 4.0 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 4.2 MB/s eta 0:00:02
     ----------------------- ---------------- 7.6/12.8 MB 4.3 

In [11]:
import spacy
import pandas as pd

def initialize_spacy_model(model_name='en_core_web_sm'):
    """Load the spaCy pipeline called *model_name* and return it."""
    return spacy.load(model_name)

def extract_named_entities(text, nlp_model):
    """Return ``(entity_text, entity_label)`` pairs found in *text*.

    Non-string input (for example a missing CSV value) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return [(span.text, span.label_) for span in nlp_model(text).ents]

def filter_entities_by_type(entities, types=("GPE", "LOC", "ORG", "EVENT")):
    """Keep only the entities whose label is listed in *types*.

    Args:
        entities: Iterable of ``(text, label)`` pairs.
        types: Labels to keep. The default is an immutable tuple rather than
            a list so the shared default object cannot be mutated between
            calls; any container supporting ``in`` is still accepted.

    Returns:
        A list with the matching entities, in their original order.
    """
    return [ent for ent in entities if ent[1] in types]

def apply_ner(input_csv="../news/climate_news_with_sentiment.csv", output_csv="../news/climate_news_with_ner.csv"):
    """Add a 'named_entities' column to *input_csv* and write the result to *output_csv*.

    Entities are extracted from the 'description' column and filtered with
    the default types of ``filter_entities_by_type``.
    """
    frame = pd.read_csv(input_csv)
    nlp_model = initialize_spacy_model()

    def entities_for(description):
        found = extract_named_entities(description, nlp_model)
        return filter_entities_by_type(found)

    frame['named_entities'] = frame['description'].apply(entities_for)

    frame.to_csv(output_csv, index=False)
    print(f"Named Entity Recognition completed and saved to {output_csv}")

# 🛠 Example Usage:
if __name__ == "__main__":
    apply_ner()
    


Named Entity Recognition completed and saved to ../news/climate_news_with_ner.csv


In [6]:
import nltk
# Make sure the tokenizer data and the stop-word list are installed locally.
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to C:\Users\SHYAM
[nltk_data]     PANDIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SHYAM
[nltk_data]     PANDIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Topic Modeling System

In [13]:

import nltk
import pandas as pd
import gensim
from gensim import corpora

# Add the local nltk_data folder to NLTK's search path.
# NOTE(review): this absolute path is specific to one Windows user account;
# on any other machine it will not point at an existing folder.
nltk.data.path.append(r"C:\Users\SHYAM PANDIT\AppData\Roaming\nltk_data")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re


def preprocess_text(text):
    """Return the lower-cased alphabetic, non-stop-word tokens of *text*.

    Tokenisation uses a regular expression rather than ``word_tokenize``, so
    the punkt data is not needed. Non-string input is converted with ``str()``.
    """
    import re
    import nltk
    from nltk.corpus import stopwords

    english_stops = set(stopwords.words('english'))

    raw = text if isinstance(text, str) else str(text)

    kept = []
    for candidate in re.findall(r'\b\w+\b', raw.lower()):
        if candidate.isalpha() and candidate not in english_stops:
            kept.append(candidate)
    return kept




def create_dictionary_and_corpus(texts):
    """Build the gensim dictionary for *texts* and the matching bag-of-words corpus.

    Returns ``(dictionary, corpus)`` where ``corpus`` holds one
    ``doc2bow`` result per document.
    """
    vocabulary = corpora.Dictionary(texts)
    bow_corpus = []
    for document in texts:
        bow_corpus.append(vocabulary.doc2bow(document))
    return vocabulary, bow_corpus

def build_lda_model(corpus, dictionary, num_topics=5):
    """Fit and return a gensim ``LdaModel`` (15 passes, random_state 42)."""
    lda_settings = {
        'corpus': corpus,
        'id2word': dictionary,
        'num_topics': num_topics,
        'random_state': 42,
        'passes': 15,
    }
    return gensim.models.LdaModel(**lda_settings)

def print_topics(lda_model, num_words=5):
    """Print one "Topic <id>: <terms>" line per topic reported by *lda_model*."""
    for topic_id, terms in lda_model.print_topics(num_words=num_words):
        line = f"Topic {topic_id}: {terms}"
        print(line)

def apply_topic_modeling(input_csv="../news/climate_news_with_ner.csv", text_column="description", num_topics=5):
    """Fit an LDA topic model on one text column of *input_csv* and print its topics.

    Rows whose *text_column* is missing are dropped (with a warning) before
    preprocessing. Returns ``(lda_model, dictionary, corpus)``.
    """
    frame = pd.read_csv(input_csv)

    # Missing text cannot be tokenized meaningfully, so drop those rows first
    has_missing = frame[text_column].isnull().any()
    if has_missing:
        print(f"Warning: Some rows in '{text_column}' column have NaN values.")
        frame = frame.dropna(subset=[text_column])

    token_lists = frame[text_column].apply(preprocess_text)

    dictionary, corpus = create_dictionary_and_corpus(token_lists)
    lda_model = build_lda_model(corpus, dictionary, num_topics=num_topics)

    print("Generated Topics:")
    print_topics(lda_model)

    return lda_model, dictionary, corpus


if __name__ == "__main__":
    # Run topic modeling on the NER-enriched CSV
    lda_model, dictionary, corpus = apply_topic_modeling()



Generated Topics:
Topic 0: 0.055*"climate" + 0.042*"change" + 0.015*"new" + 0.015*"pope" + 0.015*"francis"
Topic 1: 0.064*"climate" + 0.051*"change" + 0.014*"un" + 0.014*"action" + 0.014*"leaders"
Topic 2: 0.083*"climate" + 0.062*"change" + 0.032*"news" + 0.011*"us" + 0.011*"making"
Topic 3: 0.066*"climate" + 0.052*"change" + 0.037*"arizona" + 0.030*"republic" + 0.016*"environment"
Topic 4: 0.060*"climate" + 0.047*"change" + 0.014*"global" + 0.014*"new" + 0.014*"francis"


### Text Summarization

In [None]:
import pandas as pd
from transformers import pipeline

def initialize_summarizer(model_name="facebook/bart-large-cnn"):
    """Return a Hugging Face "summarization" pipeline backed by *model_name*."""
    return pipeline("summarization", model=model_name)

def summarize_text(text, summarizer, min_length=30, max_length=120):
    """Summarize *text* with *summarizer*.

    Args:
        text: Text to summarize. Non-string values and texts shorter than
            30 whitespace-separated words are returned unchanged.
        summarizer: Callable that takes the text plus generation keyword
            arguments and returns a list whose first item has a
            'summary_text' key (e.g. a Hugging Face summarization pipeline).
        min_length: Minimum summary length passed to the summarizer.
        max_length: Maximum summary length passed to the summarizer.

    Returns:
        The summary string, or *text* itself when it was skipped.
    """
    if not isinstance(text, str) or len(text.split()) < 30:
        return text  # Skip very short texts

    # truncation=True asks the pipeline to cut inputs that exceed the model's
    # maximum input length instead of failing on them.
    summary = summarizer(
        text,
        min_length=min_length,
        max_length=max_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

def apply_text_summarization(input_csv="../news/climate_news_with_ner.csv", output_csv="../news/climate_news_with_summary.csv", text_column="description"):
    """Add a 'summary' column for *text_column* of *input_csv* and write *output_csv*."""
    frame = pd.read_csv(input_csv)
    summarizer = initialize_summarizer()

    def summarize_row(value):
        return summarize_text(value, summarizer)

    frame['summary'] = frame[text_column].apply(summarize_row)

    frame.to_csv(output_csv, index=False)
    print(f"Summarization completed and saved to {output_csv}")

# 🛠 Example Usage:
if __name__ == "__main__":
    apply_text_summarization()


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


Summarization completed and saved to ../news/climate_news_with_summary.csv


### Multilingual Nepali Language

In [20]:
!pip install gnews

Collecting gnews
  Using cached gnews-0.4.1-py3-none-any.whl.metadata (19 kB)
Collecting feedparser~=6.0.2 (from gnews)
  Using cached feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting beautifulsoup4<5,>=4.9.3 (from gnews)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting dnspython (from gnews)
  Using cached dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4<5,>=4.9.3->gnews)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Collecting sgmllib3k (from feedparser~=6.0.2->gnews)
  Using cached sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with

In [21]:
from gnews import GNews
import pandas as pd

def initialize_news_client_nepali(language='ne', country='NP', max_results=50):
    """Return a ``GNews`` client configured with the given language, country and result limit."""
    client_options = {
        'language': language,
        'country': country,
        'max_results': max_results,
    }
    return GNews(**client_options)

def search_nepali_climate_news(news_client, keyword="जलवायु परिवर्तन"):
    """Return whatever ``news_client.get_news`` yields for *keyword*."""
    return news_client.get_news(keyword)

def extract_article_info_nepali(articles):
    """Flatten article dicts into rows with a fixed set of keys.

    Each row has 'title', 'description', 'published_date' (read from the
    'published date' key), 'url' and 'publisher' (the publisher's 'title').
    Missing fields default to an empty string.
    """
    return [
        {
            'title': item.get('title', ''),
            'description': item.get('description', ''),
            'published_date': item.get('published date', ''),
            'url': item.get('url', ''),
            'publisher': item.get('publisher', {}).get('title', ''),
        }
        for item in articles
    ]

def save_nepali_articles_to_csv(extracted_data, filename="../news/nepali_climate_news_articles.csv"):
    """Write *extracted_data* to *filename* as CSV and print how many rows were saved."""
    frame = pd.DataFrame(extracted_data)
    # 'utf-8-sig' is kept so the Nepali text is saved properly
    frame.to_csv(filename, index=False, encoding='utf-8-sig')
    row_count = len(frame)
    print(f"Saved {row_count} Nepali articles to {filename}")

# 🛠 Example Usage: fetch Nepali-language climate news and save it as CSV.
if __name__ == "__main__":
    # Client restricted to Nepali-language results for Nepal, at most 50 items
    news_client = initialize_news_client_nepali(language='ne', country='NP', max_results=50)
    articles = search_nepali_climate_news(news_client, keyword="जलवायु परिवर्तन")  # "Climate Change" in Nepali
    # Keep only the fields of interest, then write them to the default CSV path
    extracted_data = extract_article_info_nepali(articles)
    save_nepali_articles_to_csv(extracted_data)


Saved 50 Nepali articles to nepali_climate_news_articles.csv


In [25]:
from transformers import MarianMTModel, MarianTokenizer, pipeline

def load_translation_model(src_lang="inc", tgt_lang="en"):
    """Load a Marian translation model for Nepali to English.

    The checkpoint name is ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``.

    The previous defaults (``src_lang="en"``, ``tgt_lang="np"``) pointed in
    the opposite direction to what this docstring and every caller expect,
    and "np" is Nepal's country code, not a language code.

    NOTE(review): the default now selects the Indic-languages -> English
    group model ("inc"), on the assumption that no dedicated Nepali ->
    English OPUS-MT checkpoint is published and that the group model covers
    Nepali — confirm both against the model card before relying on it.

    Args:
        src_lang: Source language (or language-group) code of the checkpoint.
        tgt_lang: Target language code of the checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

def translate_nepali_to_english(text, tokenizer, model):
    """Translate Nepali text to English.

    Args:
        text: Text to translate. Non-string values (e.g. a missing CSV cell)
            and blank strings yield an empty string.
        tokenizer: Tokenizer matching *model*; called on the text and used to
            decode the generated ids.
        model: Translation model exposing ``generate``.

    Returns:
        The decoded translation of the first generated sequence.
    """
    if not isinstance(text, str) or len(text.strip()) == 0:
        return ""
    # truncation=True asks the tokenizer to cut inputs longer than the
    # model's maximum length rather than passing them through oversized.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

def initialize_multilingual_summarizer(model_name="facebook/bart-large-cnn"):
    """Return a "summarization" pipeline for *model_name* (applied to English text)."""
    return pipeline("summarization", model=model_name)

def summarize_nepali_text(text, nepali_tokenizer, nepali_model, summarizer, min_length=30, max_length=120):
    """Translate Nepali *text* to English, then summarize the translation.

    Translations shorter than 30 words are returned as they are.
    """
    english_text = translate_nepali_to_english(text, nepali_tokenizer, nepali_model)

    word_count = len(english_text.split())
    if word_count >= 30:
        result = summarizer(english_text, min_length=min_length, max_length=max_length, do_sample=False)
        return result[0]['summary_text']
    return english_text

def apply_multilingual_processing(input_csv="../news/nepali_climate_news_articles.csv", output_csv="../news/nepali_climate_news_summary.csv", text_column="description"):
    """Full pipeline: Translate Nepali text → Summarize → Save.

    Reads *input_csv*, adds an 'english_translation' column (translation of
    *text_column*) and a 'summary' column, then writes *output_csv*.

    Args:
        input_csv: CSV file containing the Nepali articles.
        output_csv: Destination CSV file.
        text_column: Name of the column holding the Nepali text.
    """
    df = pd.read_csv(input_csv)

    nepali_tokenizer, nepali_model = load_translation_model()
    summarizer = initialize_multilingual_summarizer()

    df['english_translation'] = df[text_column].apply(
        lambda x: translate_nepali_to_english(x, nepali_tokenizer, nepali_model)
    )

    def _summarize_translation(english_text):
        # Same rule as summarize_nepali_text / summarize_text: texts with
        # fewer than 30 words pass through unchanged. The previous inline
        # test (`> 30`) disagreed with that rule for exactly 30 words.
        if len(english_text.split()) < 30:
            return english_text
        result = summarizer(english_text, min_length=30, max_length=120, do_sample=False)
        return result[0]['summary_text']

    df['summary'] = df['english_translation'].apply(_summarize_translation)

    df.to_csv(output_csv, index=False)
    print(f"Multilingual processing completed and saved to {output_csv}")

# 🛠 Example Usage
if __name__ == "__main__":
    apply_multilingual_processing()


ImportError: 
MarianTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
