In [3]:
%pip install spacy


Collecting spacy
  Downloading spacy-3.8.4-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp312-cp312-win_amd64

In [6]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.5/400.7 MB 1.3 MB/s eta 0:05:10
     ---------------------------------------- 0.8/400.7 MB 1.3 MB/s eta 0:04:58
     ---------------------------------------- 1.0/400.7 MB 1.3 MB/s eta 0:05:10
     ---------------------------------------- 1.6/400.7 MB 1.4 MB/s eta 0:04:36
     ---------------------------------------- 1.8/400.7 MB 1.5 MB/s eta 0:04:26
     ---------------------------------------- 2.4/400.7 MB 1.5 MB/s eta 0:04:22
     ---------------------------------------- 2.9/4

In [7]:
import csv
import spacy
from transformers import pipeline
import re

# Load models
nlp = spacy.load("en_core_web_lg")  # For better entity recognition
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Custom patterns for oil/gas entities
patterns = [
    {"label": "MEASUREMENT", "pattern": [{"LOWER": {"REGEX": "^\d+\.?\d*"}}, {"LOWER": {"REGEX": "bbl|boe|mmcf|b/d|mtpa|billion|million"}}]},
    {"label": "CURRENCY", "pattern": [{"TEXT": {"REGEX": "^\$[\d\.]+"}}]},
    {"label": "FIELD", "pattern": [{"LOWER": "field"}, {"IS_ALPHA": True}]}
]

ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

def clean_text(text):
    """Remove boilerplate and special characters"""
    clean = re.split(r'ISSN: \d{4}-\d{4} \(Online\)', text)[0]
    clean = re.sub(r'\s+', ' ', clean)
    return clean.strip()

def read_events_from_csv(filename):
    """Load events from CSV with headers: titles, links, information"""
    events = []
    with open(filename, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            events.append({
                "title": row['titles'],
                "link": row['links'],
                "info": row['information']
            })
    return events

def process_event(title, info_text):
    """Process event into natural language summary"""
    cleaned_text = clean_text(info_text)
    doc = nlp(cleaned_text)
    
    # Entity extraction
    entities = {
        "ORG": list(set([ent.text for ent in doc.ents if ent.label_ == "ORG"])),
        "DATE": list(set([ent.text for ent in doc.ents if ent.label_ == "DATE"])),
        "GPE": list(set([ent.text for ent in doc.ents if ent.label_ == "GPE"])),
        "MONEY": list(set([ent.text for ent in doc.ents if ent.label_ in ("MONEY", "CURRENCY")])),
        "MEASUREMENT": list(set([ent.text for ent in doc.ents if ent.label_ == "MEASUREMENT"]))
    }
    
    # Generate natural language summary
    summary = f"Event: {title}. "
    summary += f"Involved parties: {', '.join(entities['ORG']) if entities['ORG'] else 'Various industry players'}. "
    summary += f"Key dates: {', '.join(entities['DATE']) if entities['DATE'] else 'Recent developments'}. "
    
    if entities['MONEY']:
        summary += f"Financial figures: {', '.join(entities['MONEY'])}. "
    if entities['MEASUREMENT']:
        summary += f"Production metrics: {', '.join(entities['MEASUREMENT'])}. "
    
    # Add BERT summary for context
    bert_summary = summarizer(
        cleaned_text,
        max_length=100,
        min_length=30,
        do_sample=False
    )[0]['summary_text']
    
    summary += f"Key details: {bert_summary}"
    return summary

# Load data from CSV
events = read_events_from_csv(r"C:\Users\HP\PycharmProjects\WebScrapingUsingSelenium\jpt_articles_4.csv")

# Process and print results
for event in events:
    result = process_event(event["title"], event["info"])
    print(f"\n{result}")
    print(f"Source: {event['link']}\n{'-'*50}")


  {"label": "MEASUREMENT", "pattern": [{"LOWER": {"REGEX": "^\d+\.?\d*"}}, {"LOWER": {"REGEX": "bbl|boe|mmcf|b/d|mtpa|billion|million"}}]},
  {"label": "CURRENCY", "pattern": [{"TEXT": {"REGEX": "^\$[\d\.]+"}}]},


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [9]:
import csv
import spacy
from transformers import pipeline
import re

# ----------------------------
# Model Initialization Section
# ----------------------------

# Load spaCy's large English model for enhanced entity recognition
nlp = spacy.load("en_core_web_lg")

# Initialize BART model for text summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# ---------------------------------
# Custom Entity Patterns Section
# ---------------------------------
# Define domain-specific patterns for oil/gas industry entities
patterns = [
    # Pattern for measurements like "3 billion BOE" or "500 bbl"
    {"label": "MEASUREMENT", "pattern": [
        {"LOWER": {"REGEX": "^\d+\.?\d*"}},  # Numeric value
        {"LOWER": {"REGEX": "bbl|boe|mmcf|b/d|mtpa|billion|million"}}  # Units
    ]},
    # Pattern for currency values like "$25 billion"
    {"label": "CURRENCY", "pattern": [
        {"TEXT": {"REGEX": "^\$[\d\.]+"}}  # Dollar amounts
    ]},
    # Pattern for oil fields like "Kirkuk field"
    {"label": "FIELD", "pattern": [
        {"LOWER": "field"},  # Field identifier
        {"IS_ALPHA": True}  # Field name
    ]}
]

# Add custom patterns to spaCy's processing pipeline
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

# --------------------------
# Text Cleaning Function
# --------------------------
def clean_text(text):
    """Sanitize text by removing boilerplate and formatting issues"""
    # Remove ISSN footer and other technical information
    clean = re.split(r'ISSN: \d{4}-\d{4} \(Online\)', text)[0]
    # Normalize whitespace characters
    clean = re.sub(r'\s+', ' ', clean)
    return clean.strip()

# ----------------------------
# CSV Handling Functions
# ----------------------------
def read_events_from_csv(filename):
    """Read input data from CSV file with tab delimiter"""
    events = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Create CSV reader with tab delimiter
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            events.append({
                "title": row['titles'],
                "link": row['links'],
                "info": row['information']
            })
    return events

def write_results_to_csv(results, output_file="processed_events.csv"):
    """Write processed results to output CSV file"""
    # Define CSV field names
    fieldnames = [
        'title', 
        'summary', 
        'organizations', 
        'dates', 
        'financials', 
        'metrics', 
        'source_link'
    ]
    
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        
        for result in results:
            writer.writerow({
                'title': result['title'],
                'summary': result['summary'],
                'organizations': '; '.join(result['entities']['ORG']),
                'dates': '; '.join(result['entities']['DATE']),
                'financials': '; '.join(result['entities']['MONEY']),
                'metrics': '; '.join(result['entities']['MEASUREMENT']),
                'source_link': result['link']
            })

# ----------------------------
# Event Processing Function
# ----------------------------
def process_event(title, info_text, link):
    """Process raw event text into structured data"""
    # Clean input text
    cleaned_text = clean_text(info_text)
    
    # Perform NLP analysis
    doc = nlp(cleaned_text)
    
    # Extract entities with deduplication
    entities = {
        "ORG": list(set([ent.text for ent in doc.ents if ent.label_ == "ORG"])),
        "DATE": list(set([ent.text for ent in doc.ents if ent.label_ == "DATE"])),
        "GPE": list(set([ent.text for ent in doc.ents if ent.label_ == "GPE"])),
        "MONEY": list(set([ent.text for ent in doc.ents if ent.label_ in ("MONEY", "CURRENCY")])),
        "MEASUREMENT": list(set([ent.text for ent in doc.ents if ent.label_ == "MEASUREMENT"]))
    }
    
    # Generate summary components
    summary_parts = []
    summary_parts.append(f"Event: {title}")
    
    if entities['ORG']:
        summary_parts.append(f"Involved parties: {', '.join(entities['ORG'])}")
    
    if entities['DATE']:
        summary_parts.append(f"Key dates: {', '.join(entities['DATE'])}")
    
    if entities['MONEY']:
        summary_parts.append(f"Financial figures: {', '.join(entities['MONEY'])}")
    
    if entities['MEASUREMENT']:
        summary_parts.append(f"Production metrics: {', '.join(entities['MEASUREMENT'])}")
    
    # Generate BERT summary
    try:
        bert_summary = summarizer(
            cleaned_text,
            max_length=100,
            min_length=30,
            do_sample=False
        )[0]['summary_text']
        summary_parts.append(f"Summary: {bert_summary}")
    except Exception as e:
        print(f"Summarization error: {str(e)}")
        bert_summary = "Summary unavailable"
    
    return {
        'title': title,
        'link': link,
        'summary': ' '.join(summary_parts),
        'entities': entities
    }
# ----------------------------
# Main Execution Block
# ----------------------------
if __name__ == "__main__":
    # Load raw data from CSV
    input_file = r"C:\Users\HP\PycharmProjects\WebScrapingUsingSelenium\jpt_articles_4.csv"
    events = read_events_from_csv(input_file)
    
    # Process all events
    processed_results = []
    for event in events:
        result = process_event(
            event["title"],
            event["info"],
            event["link"]
        )
        processed_results.append(result)
        
        # Print progress to console
        print(f"\nProcessed: {event['title']}")
        print(f"Source: {event['link']}")
        print("-" * 50)
    
    # Save results to CSV
    output_file = "processed_events.csv"
    write_results_to_csv(processed_results, output_file)
    print(f"\nSuccessfully saved {len(processed_results)} events to {output_file}")

  {"LOWER": {"REGEX": "^\d+\.?\d*"}},  # Numeric value
  {"TEXT": {"REGEX": "^\$[\d\.]+"}}  # Dollar amounts


model.safetensors:   3%|3         | 52.4M/1.63G [00:00<?, ?B/s]

  {"LOWER": {"REGEX": "^\d+\.?\d*"}},  # Numeric value
  {"TEXT": {"REGEX": "^\$[\d\.]+"}}  # Dollar amounts


KeyboardInterrupt: 

In [None]:
import spacy
from transformers import pipeline
import re

# Load models
nlp = spacy.load("en_core_web_lg")  # Use larger spaCy model for better NER
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Custom patterns for oil/gas specific entities
patterns = [
    {"label": "MEASUREMENT", "pattern": [{"LOWER": {"REGEX": "^\d+\.?\d*"}}, {"LOWER": {"REGEX": "bbl|boe|mmcf|b/d|mtpa"}}]},
    {"label": "CURRENCY", "pattern": [{"TEXT": {"REGEX": "^\$[\d\.]+"}}]},
    {"label": "FIELD", "pattern": [{"LOWER": "field"}, {"IS_ALPHA": True}]}
]

# Add patterns to entity ruler
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

def clean_text(text):
    """Remove boilerplate text and special characters"""
    clean = re.split(r'ISSN: \d{4}-\d{4} \(Online\)', text)[0]
    clean = re.sub(r'\s+', ' ', clean)  # Remove extra whitespace
    return clean.strip()

def extract_entities(text):
    """Extract entities using spaCy with custom patterns"""
    doc = nlp(text)
    
    entities = {
        "ORG": [],
        "DATE": [],
        "GPE": [],
        "MONEY": [],
        "MEASUREMENT": [],
        "CURRENCY": [],
        "FIELD": []
    }
    
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
    
    return entities

def generate_summary(text):
    """Generate summary using BERT-based model"""
    return summarizer(
        text,
        max_length=100,
        min_length=30,
        do_sample=False,
        truncation=True
    )[0]['summary_text']

def process_event(title, info_text):
    """Process each event and return structured data"""
    cleaned_text = clean_text(info_text)
    
    # Extract basic information
    result = {
        "event": title,
        "summary": generate_summary(cleaned_text),
        "parties_involved": [],
        "dates": [],
        "amounts": [],
        "locations": [],
        "measurements": []
    }
    
    # Extract entities
    entities = extract_entities(cleaned_text)
    
    # Organize results
    result["parties_involved"] = list(set(entities["ORG"]))
    result["dates"] = list(set(entities["DATE"]))
    result["locations"] = list(set(entities["GPE"]))
    result["amounts"] = list(set(entities["MONEY"] + entities["CURRENCY"]))
    result["measurements"] = list(set(entities["MEASUREMENT"]))
    
    return result

# Example usage with sample data
sample_data = [
    {
        "title": "Next-Gen Fracturing: How Repsol and ExxonMobil Are Using Data, Iteration To Optimize Well Performance",
        "info": "The oil and gas industry is writing a new chapter..."  # Your full text here
    },
    {
        "title": "BP Signs Deal To Rehab Iraq's Kirkuk Oil Assets, Boost Production",
        "info": "In a deal described as possibly..."  # Your full text here
    }
]

# Process all events
results = []
for event in sample_data:
    processed = process_event(event["title"], event["info"])
    results.append(processed)

# Print formatted results
for i, result in enumerate(results):
    print(f"\nEvent {i+1}:")
    print(f"Title: {result['event']}")
    print(f"Summary: {result['summary']}")
    print(f"Parties: {', '.join(result['parties_involved'])}")
    print(f"Dates: {', '.join(result['dates'])}")
    print(f"Amounts: {', '.join(result['amounts'])}")
    print(f"Locations: {', '.join(result['locations'])}")
    print(f"Measurements: {', '.join(result['measurements'])}")

In [None]:
# Process multiple texts at once for spaCy
texts = [clean_text(event["info"]) for event in sample_data]
docs = list(nlp.pipe(texts))  # More efficient than individual processing

In [None]:
# Use smaller models for faster inference
nlp = spacy.load("en_core_web_sm")
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

In [None]:
# Text: "BP invested $25B in Kirkuk field on 2023-02-26"
# Labels:
# BP → ORG
# $25B → AMOUNT
# Kirkuk → LOCATION
# 2023-02-26 → DATE

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("events.csv")
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Bidirectional, LSTM, Concatenate
from tensorflow.keras.models import Model
from transformers import TFBertModel, BertTokenizer
import pandas as pd
import numpy as np

# 1. Data Preparation
# -------------------
# Assume CSV with columns: titles, information, ner_tags, event_types, summaries
# ner_tags: BIO tags for entities (e.g., B-ORG, I-ORG, O)
# event_types: Numerical labels for event categories

MAX_LEN = 128
BATCH_SIZE = 16

# Load data
df = pd.read_csv("oil_gas_events.csv")
texts = df["information"].tolist()
ner_tags = np.array([eval(tags) for tags in df["ner_tags"]])  # Convert string lists to arrays
event_types = df["event_type"].values
summaries = df["summary"].tolist()

# 2. Tokenization
# ---------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_texts(texts):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="tf"
    )

encoded_inputs = encode_texts(texts)

# 3. Model Architecture
# ---------------------
# Shared BERT Base
bert = TFBertModel.from_pretrained("bert-base-uncased")

# Input Layers
input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

# BERT Outputs
sequence_output, pooled_output = bert(input_ids, attention_mask=attention_mask)

# Task-Specific Heads
# a) NER Head (BIO Tags)
ner_head = Bidirectional(LSTM(128, return_sequences=True))(sequence_output)
ner_output = Dense(num_ner_tags, activation="softmax", name="ner")(ner_head)

# b) Event Classification
event_head = Dense(64, activation="relu")(pooled_output)
event_output = Dense(num_event_types, activation="softmax", name="event")(event_head)

# c) Summary Generation
summary_head = Bidirectional(LSTM(256))(sequence_output)
summary_output = Dense(vocab_size, activation="softmax", name="summary")(summary_head)

# Combined Model
model = Model(
    inputs=[input_ids, attention_mask],
    outputs=[ner_output, event_output, summary_output]
)

# 4. Model Compilation
# --------------------
losses = {
    "ner": tf.keras.losses.SparseCategoricalCrossentropy(),
    "event": tf.keras.losses.SparseCategoricalCrossentropy(),
    "summary": tf.keras.losses.SparseCategoricalCrossentropy()
}

metrics = {
    "ner": ["accuracy"],
    "event": ["accuracy"],
    "summary": ["accuracy"]
}

model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-5),
    loss=losses,
    metrics=metrics
)

# 5. Training
# -----------
dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids": encoded_inputs["input_ids"], 
     "attention_mask": encoded_inputs["attention_mask"]},
    {"ner": ner_tags, 
     "event": event_types, 
     "summary": summary_targets}
)).batch(BATCH_SIZE)

history = model.fit(
    dataset,
    epochs=10,
    validation_split=0.2
)

# 6. Inference
# ------------
def predict_event(text):
    encoded = encode_texts([text])
    ner_pred, event_pred, summary_pred = model.predict({
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"]
    })
    
    return {
        "entities": decode_ner_tags(ner_pred[0]),
        "event_type": event_types[np.argmax(event_pred)],
        "summary": decode_summary(summary_pred[0])
    }

# Example Usage
sample_text = "BP invested $25B in Kirkuk field on 2023-02-26"
result = predict_event(sample_text)
print(result)

In [None]:
{
  "entities": [
    {"text": "BP", "type": "ORG"},
    {"text": "$25B", "type": "AMOUNT"},
    {"text": "Kirkuk", "type": "LOCATION"},
    {"text": "2023-02-26", "type": "DATE"}
  ],
  "event_type": "investment_deal",
  "summary": "BP announced a $25B investment in Kirkuk oil field..."
}

In [None]:
# Save Model
model.save("oil_gas_event_model", save_format="tf")

# Load in Production
loaded_model = tf.keras.models.load_model("oil_gas_event_model")

# Create Flask API
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/analyze", methods=["POST"])
def analyze():
    text = request.json["text"]
    result = predict_event(text)
    return jsonify(result)