In [None]:
# Install spaCy into the environment of the running kernel.
%pip install spacy




In [None]:
# Colab only: mount Google Drive at /content/drive so files stored under
# /content/drive/MyDrive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download spaCy's large English model (loaded later with spacy.load("en_core_web_lg")).
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Install Hugging Face transformers. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel.
%pip install transformers



In [None]:
import pandas as pd
import spacy
from transformers import pipeline
import re
from html import unescape

# Load spaCy's large English model; fetch it first if it is not installed yet.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Downloading language model...")
    # Programmatic equivalent of `python -m spacy download ...`. Being plain
    # Python, it does not rely on IPython's `!` shell escape.
    import spacy.cli
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

# Initialize the DistilBART (CNN/DailyMail) model for text summarization
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Custom entity patterns (token-level rules for spaCy's EntityRuler)
patterns = [
    # A token starting with a number, followed by a unit / magnitude token,
    # e.g. "500 bbl", "3.5 million", "120 mmboe".
    {"label": "MEASUREMENT", "pattern": [
        {"LOWER": {"REGEX": r"^\d+\.?\d*"}},
        # REGEX is applied as a search, so the unit stem must be pinned to the
        # END of the token; otherwise "Boeing" (boe) or "billionaire" would be
        # accepted as units. Prefixes stay free so "mmboe"/"mmbbl" still match,
        # and the plural / per-day suffixes "s", "d", "pd" are allowed.
        {"LOWER": {"REGEX": r"(?:bbl|boe|mmcf|mtpa)(?:s|d|pd)?$|^b/d$|^(?:billion|million)s?$"}}
    ]},
    # A single token made of "$" followed by digits/dots.
    # NOTE(review): spaCy's English tokenizer normally splits "$" from the
    # number, so this single-token rule may never fire - confirm on real text.
    {"label": "CURRENCY", "pattern": [
        {"TEXT": {"REGEX": r"^\$[\d\.]+"}}
    ]},
    # The word "field" followed by any alphabetic token.
    # NOTE(review): field names usually come BEFORE the word ("Ghawar field");
    # confirm the intended token order. process_event does not read FIELD.
    {"label": "FIELD", "pattern": [
        {"LOWER": "field"},
        {"IS_ALPHA": True}
    ]}
]

# Insert the rule-based matcher ahead of the statistical NER component so the
# rules above are applied first.
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

# Text cleaning function
def clean_text(text):
    """Sanitize raw article text.

    Steps, in order: decode HTML entities, replace runs of non-ASCII
    characters with a space, collapse all whitespace to single spaces, then
    cut everything from the ISSN footer onwards.

    Args:
        text: Raw text. ``None``/NaN give ``""``; any other non-string value
            is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string with no leading/trailing whitespace, or ``""`` when
        the input is missing or blank.
    """
    if not isinstance(text, str):
        # pd.isna is only meaningful for scalars; on a list/array it returns
        # an array whose truth value is ambiguous.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    if not text.strip():
        return ""

    # Decode HTML/XML entities
    clean = unescape(text)

    # Remove non-ASCII characters. This must happen BEFORE whitespace is
    # collapsed: each removed run leaves a space behind, and doing it last
    # turned "a – b" into "a   b".
    clean = re.sub(r'[^\x00-\x7F]+', ' ', clean)

    # Normalize whitespace characters
    clean = re.sub(r'\s+', ' ', clean)

    # Remove ISSN footer and other technical information. Done after
    # normalization so a footer broken by newlines/double spaces still matches.
    clean = re.split(r'ISSN: \d{4}-\d{4} \(Online\)', clean, maxsplit=1)[0]

    return clean.strip()

# File handling function
def read_events_from_file(filename):
    """Read the event spreadsheet and return its usable rows.

    Header names are matched case-insensitively and ignoring surrounding
    whitespace, so "Titles " and "titles" are treated the same.

    Args:
        filename: Path to an .xlsx workbook containing the columns
            'titles', 'links' and 'information'.

    Returns:
        DataFrame with lower-cased, stripped column names, restricted to the
        three required columns, with rows missing 'titles' or 'information'
        removed.

    Raises:
        ValueError: If any of the required columns is absent from the sheet.
    """
    required = ('titles', 'links', 'information')

    # A list passed to `usecols` must match the headers exactly, which would
    # defeat the normalization below; a callable lets the match be tolerant.
    df = pd.read_excel(
        filename,
        usecols=lambda name: str(name).strip().lower() in required,
        engine='openpyxl'
    )

    df.columns = df.columns.astype(str).str.strip().str.lower()

    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(
            f"Missing required column(s) in {filename}: {', '.join(missing)}"
        )

    df = df.dropna(subset=['titles', 'information'])

    return df

# Enhanced processing function
def _unique_entity_texts(doc, labels):
    """Return the distinct texts of entities in `doc` whose label is in `labels`.

    Texts are returned in order of first appearance, so repeated runs produce
    identical output. Entities containing no letter or digit at all (stray
    punctuation) are skipped.
    """
    seen = {}
    for ent in doc.ents:
        text = ent.text.strip()
        if ent.label_ in labels and any(ch.isalnum() for ch in text):
            seen.setdefault(text, None)
    return list(seen)


def process_event(title, info_text, link):
    """Turn one raw event record into a summary plus extracted entities.

    Args:
        title: Event title; copied into the result and the summary text.
        info_text: Raw body text. It is converted with ``str()`` and cleaned
            with ``clean_text``.
        link: Source link; copied into the result unchanged.

    Returns:
        dict with keys 'title', 'link', 'summary' (str) and 'entities' (dict
        mapping "ORG", "DATE", "GPE", "MONEY", "MEASUREMENT" to lists of
        strings). When the cleaned text is shorter than 50 characters, or
        processing fails, 'summary' is "" and every entity list is empty.
        "GPE" entities are returned but are not written into the summary.

    Errors are reported with ``print`` and never propagate to the caller.
    """
    result = {
        'title': title,
        'link': link,
        'summary': "",
        'entities': {
            "ORG": [],
            "DATE": [],
            "GPE": [],
            "MONEY": [],
            "MEASUREMENT": []
        }
    }

    try:
        cleaned_text = clean_text(str(info_text))
        if len(cleaned_text) < 50:
            # Too little text to analyse; hand back the empty skeleton.
            return result

        # Cap the input at 1000 whitespace-separated words.
        max_words = 1000
        truncated_text = " ".join(cleaned_text.split()[:max_words])

        doc = nlp(truncated_text)

        # Collect entities per category. The previous `ent.text.isalnum()`
        # filter rejected anything containing a space or punctuation, i.e.
        # nearly every real entity ("Saudi Aramco", "$5 billion").
        entities = {
            "ORG": _unique_entity_texts(doc, ("ORG",)),
            "DATE": _unique_entity_texts(doc, ("DATE",)),
            "GPE": _unique_entity_texts(doc, ("GPE",)),
            # Rule-based CURRENCY hits are reported together with model MONEY.
            "MONEY": _unique_entity_texts(doc, ("MONEY", "CURRENCY")),
            "MEASUREMENT": _unique_entity_texts(doc, ("MEASUREMENT",))
        }

        # Generate summary components
        summary_parts = [f"Event: {title}"]

        if entities['ORG']:
            summary_parts.append(f"Involved parties: {', '.join(entities['ORG'])}")
        if entities['DATE']:
            summary_parts.append(f"Key dates: {', '.join(entities['DATE'])}")
        if entities['MONEY']:
            summary_parts.append(f"Financial figures: {', '.join(entities['MONEY'])}")
        if entities['MEASUREMENT']:
            summary_parts.append(f"Production metrics: {', '.join(entities['MEASUREMENT'])}")

        if len(truncated_text) > 100:
            # The abstractive summary is best-effort: if the model fails, the
            # entity-based parts above are still returned.
            try:
                model_summary = summarizer(
                    truncated_text,
                    max_length=50,
                    min_length=20,
                    do_sample=False,
                    truncation=True
                )[0]['summary_text']
                summary_parts.append(f"Summary: {model_summary}")
            except Exception as e:
                print(f"Summarization error for '{title}': {str(e)}")

        result['summary'] = ' '.join(summary_parts)
        result['entities'] = entities

    except Exception as e:
        print(f"Error processing '{title}': {str(e)}")

    return result

# Main execution block
if __name__ == "__main__":
    input_file = r"/content/drive/MyDrive/jpt_articles_4.xlsx"
    output_file = "processed_events.csv"

    try:
        # Read the workbook; rows lacking a title or body text are already
        # dropped by read_events_from_file.
        df = read_events_from_file(input_file)
        print(f"Loaded {len(df)} valid entries from file")

        # Process every row in order, keeping all results in memory.
        processed_results = []
        for index, row in df.iterrows():
            processed_results.append(
                process_event(row['titles'], row['information'], row['links'])
            )

            # Per-row progress log: title, source link, then a divider line.
            print(
                f"\nProcessed: {row['titles']}",
                f"Source: {row['links']}",
                "-" * 50,
                sep="\n",
            )

        # Write all results to CSV in a single pass once the loop is done.
        pd.DataFrame(processed_results).to_csv(output_file, index=False)
        print(f"\nSuccessfully saved {len(processed_results)} events to {output_file}")

    except Exception as err:
        # Any failure above (loading, processing, saving) is reported here.
        print(f"Critical error: {err!s}")