# In [None]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
import nltk

# Download NLTK resources
nltk.download('punkt', quiet=True)
# Newer NLTK releases load the Punkt sentence tokenizer from the 'punkt_tab'
# package instead of the pickled 'punkt' models; without it sent_tokenize /
# word_tokenize raise LookupError. Fetching both keeps old and new NLTK working.
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Initialize embedding model (shared by the keyword centroids and the documents
# so that both live in the same vector space)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Define folder path and keyword lists
# Folder that is scanned for the .txt filings to score.
submission_folder_path = "10k_submissions"

# Seed words/phrases for the "value creation" (VC) concept. Each entry is
# encoded on its own and the resulting vectors are averaged into a single VC
# reference embedding further down, so adding, removing or duplicating an entry
# shifts that reference vector.
value_creation_keywords_raw = [
    "ability", "accelerate", "acceleration", "activism", "aggrandizement", "agile", "agility", "ahead time", "aimed", "anew", "arise", "aspirations", "attack", "brand new",
    "break ground", "bring to being", "bring to existence", "bring forth", "bring forward", "bring to life", "bring to light", "bring out", "bring pass", "bring to public", "bring to the world", "build", "call to existence", "change",
    "commodities", "completion", "compose", "concept", "conceptual", "configuration", "constitute", "construed", "contrive", "create", "creation",
    "creative", "creativity", "curious", "deliver", "deliverability", "deliverable", "demand driven", "demand side", "demonstrate", "demonstration", "deploy", "detect",
    "detection", "develop", "development", "discover", "discovery", "draft", "draw up", "dream", "early adopted", "emerge", "emergence", "engender",
    "engenderment", "engine", "engineer", "enrichment", "establish", "establishment", "evolution", "evolutionary", "evolve", "exceptional", "experimental",
    "experimentation", "explore", "exploration", "fabricate", "fabrication", "facilitate", "facilitation", "factory", "feasible", "first move", "first step", "forge", "form",
    "founded", "founder", "from beginning", "from ground", "from scratch", "fructify", "funded", "funding", "furnish", "generate", "give life", "give rise", "grow better", "growth",
    "heighten", "high value", "idea", "incept", "inception", "increase", "initial", "initiate", "initiation", "initiative", "innovate", "innovation", "innovation driven",
    "invent", "invention", "investment", "laboratory", "lifting", "made", "magnify", "make changes", "manufacture", "market based",
    "modernize", "modernization", "newborn", "newer", "newly arrived", "newly come", "newly issued", "novation", "novel", "novelty", "opening", "preparation", "proclaim",
    "procreate", "procreation", "produce", "production", "product related", "productive", "productivity", "progress", "progression", "promised results",
    "prototype", "prototyping", "pursuit", "quest", "raise", "realizable", "realize", "realization", "rebuild", "recent make", "recent period", "re-create", "re-develop",
    "reinvent", "remake", "research", "revolution", "revolutionary", "revolutionize", "roadmap", "search", "shape", "sift through", "sifting", "solution", "spark", "speed up",
    "strengthen", "study", "transform", "transformation", "trendy", "ultramodern", "unexplored", "unfolding", "up to date", "useful", "value added", "vision",
    "work out"
]

# Seed words/phrases for the "value appropriation" (VA) concept. Like the VC
# list, every entry is encoded individually and the vectors are averaged into
# one VA reference embedding further down.
value_appropriation_keywords_raw = [
    "ablation", "adapt", "adaptation", "addition", "additives", "additory", "adjudication", "adjust", "adjustment", "adopt", "adoption", "advance",
    "advancement", "advantage", "advertise", "advertising", "advertisement", "advertorial", "affiliate", "affiliation", "allocate",
    "allocation", "allotment", "alter", "alteration", "ameliorate", "amelioration", "amend", "amendment", "augment", "augmentation", "aware",
    "awareness", "behoove", "beneficial", "benefit", "benefit expense", "benefit related", "big name", "brand", "brand building", "branded",
    "branding", "brand loyalty", "brand name", "broadcast", "broaden", "capacities", "capital out", "capitalize", "capture", "carry", "carry forward",
    "certificate", "certify", "challenge", "coexist", "collect", "commerce", "commercial", "commercialize",
    "commercialization", "competition", "compete", "competitive", "competitiveness", "consideration", "consolidate", "consolidation",
    "contest", "continual", "continuance", "continuation", "contribute", "contribution", "convert", "conversion", "conversion rate", "convertibility",
    "cost effective", "cost efficient", "cost saving", "customer centric", "customer driven", "customer facing", "customer focused", "customer loyalty", "customer oriented", "customer pleasing", "customer specific", "customize",
    "customization", "defend", "defense", "demarcation", "derive", "detail", "differentiate", "differentiation", "differ", "direct mail",
    "discriminate", "discrimination", "distinct", "distinction", "distinguish", "diversity", "earn", "earnings", "efficacy", "elaborate", "embedding",
    "emphasize", "emphasis", "endorse", "endorsement", "engage", "engagement", "enhance", "enhancement", "evoke", "expand", "expansion",
    "exploit", "exploitation", "extension", "extract", "extraction", "feature", "franchise", "gain", "gain strength", "glean", "glory", "harvesting",
    "hedge", "improve", "improvement", "income", "increment", "individualize", "influence", "isolate", "isolation", "joint venture", "justify",
    "label", "learn", "leverage", "license", "licensing", "lift", "loyalty", "make better", "make good", "make most", "higher margin", "margin", "marketed", "marketing",
    "marketing related", "market leading", "maximize", "maximum", "meliorate", "newly modify", "modify", "modification", "monetize", "monetization",
    "multichannel", "newspaper ad", "optimize", "outcome", "outperform", "outperformance", "outsourcing", "patent", "payoff", "payor",
    "perform", "performance", "permit", "popularity", "position", "prefer", "preferable", "premium", "prestige", "procedure", "proceed",
    "process", "profit from", "profit gains", "profitability", "prominence", "promote",
    "promotion", "promulgated", "protect", "protection", "publicity", "reallocate",
    "reallocation", "rebrand", "recognition", "recognizable", "reconsider", "reconsideration", "reconstruct", "redefine", "redevelopments",
    "refine", "refocused", "refreshed", "registered trademark", "reinvesting", "renovate", "reorganize", "representation", "reputation",
    "revenue", "revenue generating", "rewarding", "rights reserved", "segmentation", "service", "serviceableness", "standalone", "standardize",
    "status", "stronghold", "submarket", "substitutable", "substituting", "superior", "superiority", "support",
    "supportive", "take advantage of", "trade name", "trademark", "uplift", "usefulness", "utilize", "valuations"
]

# Build one reference vector per concept: encode every keyword separately,
# then collapse the per-keyword vectors into their element-wise mean.
print("Pre-computing embeddings for VC keywords...")
vc_keyword_embeddings = embedding_model.encode(
    value_creation_keywords_raw,
    show_progress_bar=True,
)
avg_vc_embedding = vc_keyword_embeddings.mean(axis=0)

print("Pre-computing embeddings for VA keywords...")
va_keyword_embeddings = embedding_model.encode(
    value_appropriation_keywords_raw,
    show_progress_bar=True,
)
avg_va_embedding = va_keyword_embeddings.mean(axis=0)

# Helper function for document embedding
def compute_document_embedding(text, model, chunk_size=256):
    """Embed a document as the unweighted mean of its chunk embeddings.

    The text is split into sentences with NLTK, consecutive sentences are
    packed greedily into chunks, every chunk is encoded with ``model`` and the
    chunk vectors are averaged element-wise.

    Args:
        text: Raw document text. ``None`` and whitespace-only text are treated
            as an empty document.
        model: Encoder exposing ``encode(list_of_str, batch_size=...,
            show_progress_bar=...)`` and ``get_sentence_embedding_dimension()``.
        chunk_size: Soft limit on the number of NLTK word tokens per chunk. A
            single sentence longer than the limit is kept whole as its own
            chunk rather than being split.

    Returns:
        A 1-D numpy array with one entry per embedding dimension; all zeros
        when the document contains no text.
    """
    # Decide "empty" before any tokenization: it avoids pointless NLTK work
    # and keeps a missing (None) document from crashing on .strip().
    if text is None or not text.strip():
        return np.zeros(model.get_sentence_embedding_dimension())

    # NOTE(review): chunk_size counts NLTK word tokens, not the encoder's own
    # subword tokens, so a chunk may still exceed the encoder's input limit and
    # be truncated there -- confirm against the model's max sequence length.
    chunks = []
    pending_sentences = []
    pending_word_count = 0
    for sentence in sent_tokenize(text):
        token_count = len(word_tokenize(sentence))
        # Flush the running chunk when this sentence would overflow it. The
        # non-empty check guarantees an oversized sentence still lands somewhere.
        if pending_sentences and pending_word_count + token_count > chunk_size:
            chunks.append(" ".join(pending_sentences))
            pending_sentences = []
            pending_word_count = 0
        pending_sentences.append(sentence)
        pending_word_count += token_count

    if pending_sentences:
        chunks.append(" ".join(pending_sentences))

    # Defensive: the tokenizer produced no sentences for non-blank text.
    if not chunks:
        return np.zeros(model.get_sentence_embedding_dimension())

    chunk_embeddings = model.encode(chunks, batch_size=32, show_progress_bar=False)
    return np.mean(chunk_embeddings, axis=0)

# Main processing function
def _split_cik_and_year(filename):
    """Return ``(cik, year)`` parsed from a ``<CIK>_<year>[_...].txt`` filename."""
    parts = os.path.splitext(filename)[0].split('_')
    cik = parts[0]
    # A name without an underscore has no year field; record it as empty
    # instead of aborting the whole run with an IndexError.
    year_suffix = parts[1] if len(parts) > 1 else ""
    # NOTE(review): every two-digit suffix is mapped to 20xx, so a pre-2000
    # filing named e.g. "..._98" would become "2098" -- confirm the naming scheme.
    year = f"20{year_suffix}" if len(year_suffix) == 2 else year_suffix
    return cik, year


def _similarity_percent(doc_embedding, reference_embedding):
    """Return cosine similarity times 100, or 0.0 for an all-zero document vector."""
    if not np.any(doc_embedding):
        return 0.0
    return cosine_similarity([doc_embedding], [reference_embedding])[0][0] * 100


def process_10k_files_for_vc_va_embeddings(folder_path):
    """Score every ``.txt`` file in ``folder_path`` against the VC and VA reference embeddings.

    Each file is read as UTF-8 (undecodable bytes are dropped), embedded with
    ``compute_document_embedding`` using the module-level ``embedding_model``,
    and compared by cosine similarity with the module-level
    ``avg_vc_embedding`` and ``avg_va_embedding`` vectors.

    Args:
        folder_path: Directory containing the filings. The CIK and year are
            taken from the first two underscore-separated fields of each
            file name.

    Returns:
        A pandas DataFrame with one row per file, ordered by file name, and
        the columns ``CIK``, ``Year``, ``Filename``,
        ``VC_Embedding_Score (%)`` and ``VA_Embedding_Score (%)``.

    Raises:
        FileNotFoundError: If ``folder_path`` is not a directory or contains
            no regular ``.txt`` files.
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    # os.listdir returns entries in arbitrary order; sorting makes the output
    # reproducible. The isfile check skips directories that merely end in .txt.
    files_to_process = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(".txt")
        and os.path.isfile(os.path.join(folder_path, name))
    )
    if not files_to_process:
        raise FileNotFoundError(f"No .txt files in: {folder_path}")

    print(f"Found {len(files_to_process)} .txt files to process.")

    results = []
    for filename in tqdm(files_to_process, desc="Processing files"):
        cik, year = _split_cik_and_year(filename)

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
            raw_text = f.read()

        # Blank documents come back as an all-zero vector, which the scoring
        # helper maps to 0.0 rather than dividing by a zero norm.
        doc_embedding = compute_document_embedding(raw_text, embedding_model)

        results.append({
            "CIK": cik,
            "Year": year,
            "Filename": filename,
            "VC_Embedding_Score (%)": _similarity_percent(doc_embedding, avg_vc_embedding),
            "VA_Embedding_Score (%)": _similarity_percent(doc_embedding, avg_va_embedding),
        })

    return pd.DataFrame(results)

# Run the scoring pipeline over the submissions folder, persist the table as
# CSV (without the index column) and preview the first rows.
df_vc_va = process_10k_files_for_vc_va_embeddings(folder_path=submission_folder_path)
df_vc_va.to_csv(path_or_buf="vc_va_embedding_scores.csv", index=False)
df_vc_va.head()
