# Deliverable 1: Final M&A Signal Dataset Creation

This notebook creates a large secondary dataset for predicting M&A activity. It processes a list of companies from `companies_list.csv` and performs the following steps:

1.  **Fetches Filing Metadata**: For each company, it finds the most recent filings for forms 10-K, 10-Q, and 8-K.
2.  **Analyzes Textual Signals**: For each individual filing, it downloads the text, counts the sentences that mention M&A-related keywords, and calculates the average sentiment of the context surrounding those sentences.
3.  **Fetches Financial Signals**: It pulls the latest company-wide financial metrics (like Current Ratio and Debt-to-Equity) from the SEC's XBRL data API.
4.  **Generates Heuristic Score**: It calculates a rule-based "M&A likelihood score" for each filing based on a weighted combination of the textual and financial signals.
5.  **Outputs CSV**: It saves the final, large dataset with one row per filing.

In [None]:
import sys, subprocess
def pip_install(pkg):
    """Quietly install `pkg` with pip, run through the current Python interpreter."""
    command = [sys.executable, '-m', 'pip', 'install', '-q', pkg]
    subprocess.check_call(command)

# pip distribution names whose importable module has a different name.
import_name_for = {'beautifulsoup4': 'bs4'}
required_packages = ['pandas', 'requests', 'beautifulsoup4', 'lxml', 'tqdm', 'nltk', 'vaderSentiment']

# Try importing each dependency and install only the ones that are missing.
for pkg in required_packages:
    module_name = import_name_for.get(pkg, pkg)
    try:
        __import__(module_name)
    except ImportError:
        pip_install(pkg)

import pandas as pd
import re, time, math, json, os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# Sentence-tokenizer data used by sent_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the legacy 'punkt' one, so fetch both to keep
# the notebook runnable on either version.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)
analyzer = SentimentIntensityAnalyzer()


# Contact address embedded in the User-Agent header of every SEC request.
# Set the SEC_EMAIL environment variable to override it without editing the notebook.
SEC_EMAIL = os.environ.get('SEC_EMAIL', 'iib2022004@iiita.ac.in')
USER_AGENT = f'University Project (Contact: {SEC_EMAIL})'

# Maximum number of filings collected per form type for each company.
FILINGS_PER_FORM = 10

# Pause, in seconds, that sec_api_get inserts between SEC requests.
SEC_SLEEP_INTERVAL = 0.2

# Destination of the final dataset.
OUTPUT_CSV_PATH = 'data/ma_secondary_dataset.csv'

print("Setup Complete. All libraries are imported and configured.")

Setup Complete. All libraries are imported and configured.


In [None]:
def sec_api_get(url, **kwargs):
    """Wrapper for requests.get that adds the required User-Agent header and throttles calls.

    Args:
        url: Address to fetch.
        **kwargs: Extra arguments forwarded to requests.get. `headers` is merged
            over the default headers (caller values win) and `timeout` overrides
            the 30 second default.

    Returns:
        The requests.Response of a successful call.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request itself fails.
    """
    extra_headers = kwargs.pop('headers', None) or {}
    # Pop the timeout so a caller-supplied value does not collide with the default.
    timeout = kwargs.pop('timeout', 30)
    headers = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip, deflate',
        **extra_headers
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout, **kwargs)
    finally:
        # Pause after every attempt, including failed ones, so a run of errors
        # cannot turn into an unthrottled burst of requests.
        time.sleep(SEC_SLEEP_INTERVAL)
    response.raise_for_status()
    return response

def get_ticker_to_cik_map():
    """Download the SEC ticker list and return a dict of upper-cased ticker -> integer CIK."""
    payload = sec_api_get('https://www.sec.gov/files/company_tickers.json').json()
    mapping = {}
    # The keys of the payload are not used; only each entry's ticker and CIK matter.
    for entry in payload.values():
        mapping[entry['ticker'].upper()] = int(entry['cik_str'])
    return mapping

# Start from an empty list so that a failed load can never leave CIKS undefined
# or pointing at values from a previous run of this cell.
CIKS = []

try:
    companies_df = pd.read_csv("companies_list.csv")
    tickers = companies_df['ticker'].tolist()

    # Normalise before the lookup: drop missing cells and surrounding whitespace.
    normalized_tickers = [str(t).strip().upper() for t in companies_df['ticker'].dropna()]
    normalized_tickers = [t for t in normalized_tickers if t]

    print("Fetching Ticker->CIK map from SEC...")
    ticker_to_cik = get_ticker_to_cik_map()

    mapped_tickers = [t for t in normalized_tickers if t in ticker_to_cik]
    unmapped_tickers = [t for t in normalized_tickers if t not in ticker_to_cik]
    if unmapped_tickers:
        print(f"[WARN] {len(unmapped_tickers)} tickers have no CIK and are skipped: {unmapped_tickers}")

    # Several tickers can share one CIK; keep the first occurrence of each so the
    # same company is not downloaded and written to the dataset twice.
    CIKS = list(dict.fromkeys(ticker_to_cik[t] for t in mapped_tickers))
    print(f"Successfully mapped {len(mapped_tickers)} tickers to CIKs.")
    if len(CIKS) != len(mapped_tickers):
        print(f"[INFO] {len(mapped_tickers) - len(CIKS)} duplicate CIKs removed; {len(CIKS)} unique companies remain.")
    print(f"First 5 CIKs: {CIKS[:5]}")

except FileNotFoundError:
    print("Error: 'companies_list.csv' not found. Please create it first.")
except Exception as e:
    print(f"An error occurred: {e}")

Fetching Ticker->CIK map from SEC...
Successfully mapped 103 tickers to CIKs.
First 5 CIKs: [320193, 789019, 1018724, 1045810, 1652044]


In [None]:

# URL of a company's submissions JSON; the format spec left-pads the CIK with zeros to 10 characters.
SUBMISSIONS_URL_TEMPLATE = 'https://data.sec.gov/submissions/CIK{cik:0>10}.json'
# Form types kept by get_recent_filings. Membership is an exact string match,
# so any form string not listed here (for example '8-K/A') is ignored.
FORMS_TO_SCAN = {'10-K', '10-Q', '8-K'}
# Regex alternatives for M&A vocabulary, each bounded by \b word boundaries.
# Note that the 'acquire' pattern covers acquire/acquired/acquires but not 'acquiring'.
MA_KEYWORDS = [
    r'\bM&A\b', r'\bmerger(s)?\b', r'\bacquisition(s)?\b',
    r'\bacquire(d|s)?\b', r'\bbusiness\s+combination(s)?\b', r'\bbuyout(s)?\b'
]
# Single case-insensitive pattern that matches any of the alternatives above.
MA_KEYWORD_REGEX = re.compile('|'.join(MA_KEYWORDS), flags=re.IGNORECASE)

def get_recent_filings(cik, max_per_form):
    """Gets the metadata for the most recent filings of the desired types for a given CIK.

    Args:
        cik: Company CIK, used both in the submissions URL and in the document URLs.
        max_per_form: Maximum number of filings to keep for each form in FORMS_TO_SCAN.

    Returns:
        A list of dicts with the keys 'form', 'date' and 'url', in the order the
        filings appear in the 'recent' section of the submissions JSON.
    """
    submissions_url = SUBMISSIONS_URL_TEMPLATE.format(cik=cik)
    submissions_data = sec_api_get(submissions_url).json()

    recent_filings = submissions_data.get('filings', {}).get('recent', {})
    filing_metadata = []
    counts = {form: 0 for form in FORMS_TO_SCAN}

    # The four lists are parallel: position i of each describes the same filing.
    for form, date, accession_num, doc in zip(
        recent_filings.get('form', []),
        recent_filings.get('filingDate', []),
        recent_filings.get('accessionNumber', []),
        recent_filings.get('primaryDocument', [])
    ):
        if form in FORMS_TO_SCAN and counts[form] < max_per_form:
            if not doc:
                # Without a primary document the URL would point at the filing
                # folder itself, and that page would be scored as if it were the filing.
                continue
            acc_no_clean = accession_num.replace('-', '')
            doc_url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{acc_no_clean}/{doc}'
            filing_metadata.append({'form': form, 'date': date, 'url': doc_url})
            counts[form] += 1
        # Stop scanning as soon as every form type has reached its quota.
        if all(c >= max_per_form for c in counts.values()):
            break

    return filing_metadata

def analyze_text_for_ma_signals(html_content):
    """Extract M&A text signals from a filing document.

    The markup is parsed with BeautifulSoup, script/style/table elements are
    removed, and the remaining text is split into sentences. Every sentence that
    matches MA_KEYWORD_REGEX counts as one mention and is scored with the
    sentiment analyzer over a window of up to two sentences on each side.

    Returns:
        (number of matching sentences, mean compound sentiment of their windows);
        the sentiment is 0.0 when nothing matches.
    """
    parsed = BeautifulSoup(html_content, 'lxml')
    for unwanted in parsed(['script', 'style', 'table']):
        unwanted.decompose()

    # Collapse all runs of whitespace so the tokenizer sees one continuous text.
    flat_text = re.sub(r'\s+', ' ', parsed.get_text()).strip()
    sentence_list = sent_tokenize(flat_text)
    total_sentences = len(sentence_list)

    hit_positions = [
        pos for pos, candidate in enumerate(sentence_list)
        if MA_KEYWORD_REGEX.search(candidate)
    ]

    window_scores = []
    for pos in hit_positions:
        window = sentence_list[max(0, pos - 2):min(total_sentences, pos + 3)]
        window_scores.append(analyzer.polarity_scores(' '.join(window))['compound'])

    if not window_scores:
        return len(hit_positions), 0.0
    return len(hit_positions), float(sum(window_scores) / len(window_scores))

print("Textual analysis functions are defined.")

Textual analysis functions are defined.


In [None]:
# URL of a company's XBRL facts JSON; the format spec left-pads the CIK with zeros to 10 characters.
FACTS_URL_TEMPLATE = 'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik:0>10}.json'

def get_latest_fact_value(facts_obj, taxonomy, tag):
    """Finds the most recent, valid numerical value for a given XBRL tag.

    Args:
        facts_obj: Parsed facts JSON, read as facts_obj['facts'][taxonomy][tag]['units'].
        taxonomy: Taxonomy key, e.g. 'us-gaap'.
        tag: Concept name inside the taxonomy.

    Returns:
        The value, as a float, of the data point with the latest 'end' date in the
        first unit that has numeric data points. When several points share that
        date, the one with the latest 'filed' date is used. Returns None when the
        tag is absent or carries no numeric value.
    """
    try:
        series = facts_obj['facts'][taxonomy][tag]['units']
    except (KeyError, IndexError, TypeError):
        # Tag or taxonomy not reported, or the payload is not the expected mapping.
        return None

    for data_points in series.values():
        valid_points = [p for p in data_points if isinstance(p.get('val'), (int, float))]
        if not valid_points:
            continue
        # `or ''` keeps the comparison well-defined when a date is missing or null.
        latest_point = max(
            valid_points,
            key=lambda p: (p.get('end') or '', p.get('filed') or ''),
        )
        return float(latest_point['val'])
    return None

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or None when either is missing or the denominator is 0."""
    if numerator is None or not denominator:
        return None
    return numerator / denominator

def fetch_financials_for_company(cik):
    """Fetches key financial metrics and calculates ratios for a single company.

    Args:
        cik: Company CIK used to build the facts URL.

    Returns:
        A dict with 'current_ratio' (AssetsCurrent / LiabilitiesCurrent) and
        'debt_to_equity' (Liabilities / StockholdersEquity). A ratio is None when
        one of its inputs is unavailable or its denominator is zero.
    """
    facts_url = FACTS_URL_TEMPLATE.format(cik=cik)
    facts_data = sec_api_get(facts_url).json()
    gaap = 'us-gaap'

    # NOTE(review): each tag is resolved independently, so the two sides of a
    # ratio may come from different reporting periods.
    assets_current = get_latest_fact_value(facts_data, gaap, 'AssetsCurrent')
    liabilities_current = get_latest_fact_value(facts_data, gaap, 'LiabilitiesCurrent')
    stockholders_equity = get_latest_fact_value(facts_data, gaap, 'StockholdersEquity')
    total_liabilities = get_latest_fact_value(facts_data, gaap, 'Liabilities')

    # A numerator of 0 is a real value and yields a ratio of 0.0, not a missing one.
    return {
        'current_ratio': _safe_ratio(assets_current, liabilities_current),
        'debt_to_equity': _safe_ratio(total_liabilities, stockholders_equity),
    }

print("Financial analysis functions are defined.")

Financial analysis functions are defined.


In [None]:
def calculate_heuristic_score(mentions, current_ratio, debt_to_equity):
    """Calculates a simple 0-1 score based on text and financial signals.

    Args:
        mentions: Number of M&A mentions found in the filing (None counts as 0).
        current_ratio: Company current ratio, or None/NaN when unknown.
        debt_to_equity: Company debt-to-equity ratio, or None/NaN when unknown.

    Returns:
        0.7 * text score + 0.3 * finance score, limited to the range [0, 1].
        Every component is itself limited to [0, 1] before weighting.
    """
    def _missing(value):
        # NaN is how a missing ratio looks once the dataset has been through pandas.
        return value is None or (isinstance(value, float) and math.isnan(value))

    # Saturating curve: 0 mentions -> 0, approaching 1 as mentions grow.
    text_score = 1 - math.exp(-(mentions or 0) / 5.0)

    if _missing(current_ratio):
        cr_score = 0.0
    else:
        # A ratio of 2 or more earns full marks; a negative ratio earns none.
        cr_score = max(0.0, min(current_ratio / 2.0, 1.0))

    if _missing(debt_to_equity) or debt_to_equity < 0:
        # A negative ratio means negative equity; without this guard it would push
        # the component far above 1 and saturate the final score at 1.0.
        de_score = 0.0
    else:
        de_score = 1.0 - min(debt_to_equity, 2.0) / 2.0
    finance_score = 0.5 * cr_score + 0.5 * de_score

    final_score = 0.7 * text_score + 0.3 * finance_score
    return max(0.0, min(1.0, final_score))

# One row per analysed filing; company-level financials are repeated on each row.
dataset_rows = []
for cik in tqdm(CIKS, desc='Processing Companies'):
    # Financials are optional: the scorer already handles missing ratios, so a
    # failed fetch must not discard the company's filings.
    try:
        financials = fetch_financials_for_company(cik)
    except Exception as e:
        print(f"\n[WARN] No financial data for CIK {cik}, continuing with text signals only: {e}")
        financials = {}

    try:
        filings_to_process = get_recent_filings(cik, max_per_form=FILINGS_PER_FORM)
    except Exception as e:
        print(f"\n[ERROR] Failed processing CIK {cik}: {e}")
        continue

    current_ratio = financials.get('current_ratio')
    debt_to_equity = financials.get('debt_to_equity')

    for filing in filings_to_process:
        # A single broken filing is reported and skipped; the rest are still processed.
        try:
            html = sec_api_get(filing['url']).content
            mentions, sentiment = analyze_text_for_ma_signals(html)

            score = calculate_heuristic_score(mentions, current_ratio, debt_to_equity)

            row = {
                'cik': cik,
                'filing_form': filing['form'],
                'filing_date': filing['date'],
                'ma_mentions_in_filing': mentions,
                'ma_sentiment_in_filing': sentiment,
                'company_current_ratio': current_ratio,
                'company_debt_to_equity': debt_to_equity,
                'heuristic_ma_score': score,
            }
            dataset_rows.append(row)
        except Exception as e:
            print(f"\n[WARN] Skipping filing {filing.get('url')} for CIK {cik} due to error: {e}")

final_df = pd.DataFrame(dataset_rows)

print("\nDataset creation complete.")
final_df.head()

Processing Companies:   0%|          | 0/103 [00:00<?, ?it/s]


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, 'lxml')
Processing Companies: 100%|██████████| 103/103 [1:43:21<00:00, 60.20s/it]


Dataset creation complete.





Unnamed: 0,cik,filing_form,filing_date,ma_mentions_in_filing,ma_sentiment_in_filing,company_current_ratio,company_debt_to_equity,heuristic_ma_score
0,320193,10-Q,2025-08-01,3,0.7646,0.867992,4.035622,0.380931
1,320193,8-K,2025-07-31,0,0.0,0.867992,4.035622,0.065099
2,320193,8-K,2025-07-25,0,0.0,0.867992,4.035622,0.065099
3,320193,8-K,2025-07-09,0,0.0,0.867992,4.035622,0.065099
4,320193,8-K,2025-05-12,0,0.0,0.867992,4.035622,0.065099


In [None]:
# Create the folder that OUTPUT_CSV_PATH actually points to, so changing the
# path in the setup cell keeps working. A bare file name has no folder part.
output_dir = os.path.dirname(OUTPUT_CSV_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

final_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Successfully saved secondary dataset to '{OUTPUT_CSV_PATH}'")


# Summary statistics of the numeric columns as a quick sanity check.
final_df.describe()

Successfully saved secondary dataset to 'data/ma_secondary_dataset.csv'


Unnamed: 0,cik,ma_mentions_in_filing,ma_sentiment_in_filing,company_current_ratio,company_debt_to_equity,heuristic_ma_score
count,2758.0,2758.0,2758.0,2476.0,1961.0,2758.0
mean,690956.5,28.525743,0.447887,1.371848,2.067614,0.54593
std,577141.9,40.037757,0.368895,0.929734,7.620244,0.335602
min,2488.0,0.0,-0.9682,0.387743,-48.077693,0.0
25%,80661.0,0.0,0.0,0.907941,0.739072,0.15
50%,796343.0,12.0,0.583943,1.115346,1.782747,0.715263
75%,1103982.0,42.0,0.759174,1.353446,4.065814,0.799348
max,2012383.0,255.0,1.0,5.812199,23.387464,1.0
