In [None]:
!pip install adaptkeybert datasets

In [79]:
# Version 3.0 (Based on unsupervised list of LLaMA)

#To Revise: sorted by priorities
"""
- Product
- Technology
- Leadership
- Customer
- Research & Development (more updates)
"""

keywords_dict = {
    "Financial Performance": {
        "EPS": ["performance indicators","Earnings per Share expectations","price-to-earnings ratio"],
        "Cash Flow":["free cash flow","liquidity","working capital","cash reserves"],
        "Revenue": ["operational revenue", "business income", "operating profit","increased income","sales growth rate","earnings trajectory"],
        "Return On Equity": ["capital efficiency", "investment return", "capital ROI","ROE"],
        "Margins": ["EBITDA margins", "operating margin", "profit margin", "cost-to-revenue", "gross profit","profitability changes"],
        "Cost Management": ["expense control", "cost reduction", "efficiency savings","expense trimming", "cost containment","cost optimization","cost-effectiveness"],
        "Dividend Policy": ["dividend payout", "shareholder returns", "yield", "dividend sustainability", "payout ratio"],
        "Investments": ["strategic investment", "capital deployment", "fundraising", "fund allocation"],
        "Balance Sheet": ["assets", "liabilities", "equity", "debt management", "financial health","book value","debt-to-equity ratio","equity issuance","capital structure"],
    },
    "Company": {
        "Long-term Growth": ["sustainable growth", "long-term trajectory", "future growth","scalability", "large-scale expansion","unit expansion","unit volume"],
        "Mergers & Acquisition": ["business acquisition", "M&A activity", "buyout approach","merger"],
        "Refranchising": ["franchise model", "franchise transitions", "refranchising plans"],
        "Sustainability": ["green initiatives", "environmental impact", "sustainable practices","ESG","SRI","sustainability","green energy adoption","social responsibility"],
        "Employees": ["workforce optimization","talent management","upskilling","remote work","employee management","employee benefits","diversity and inclusion","salary levels"],
        "Research & Development":["R&D spending","patent activity","technological breakthrough","future of work","continuous development","research projects"],
        "Marketing":["brand awareness","CI","corporate identity","performance marketing","brand loyalty","word-of-mouth","brand value","consumer perception"],
        "Shares Repurchase": ["buyback", "repurchase programs", "shareholder value", "equity reduction", "stock repurchases","stock rebuy"],
        "Processes":["process improvements","streamlined processes","productivity improvements","operational efficiency"],
        "Leadership":["effective leadership", "executive strength", "executive resilience","management trust","crisis management","risk assessment","contingency planning"],
    },
    "Product": {
        "Innovation": ["new features", "innovative products", "product advancements","new product","product launch","disruptive technology","portfolio diversification"],
        "Product Characteristics":["USP","unique selling point","product quality", "product differentiation","product portfolio"],
        "Pricing Strategy":["price segmentation","price optimization", "dynamic pricing", "competitive pricing","pricing models", "price elasticity", "discount strategies"],
        "Production": ["production capacities","manufacturing delays", "supply issues","production stops","factory problems","logistics bottlenecks","material shortage"],
        "Technology Trends":["autonomous systems","IoT (Internet of Things)", "machine learning","deep learning","natural language processing","AI","robotics","digital transformation","cloud computing","blockchain"]
    },
    "Market": {
        "Market Share": ["market share", "industry share", "market proportion","market penetration"],
        "Market Expansion": ["new markets", "geographical reach", "market entry","worldwide expansion"],
        "Competitors":["market rivalry", "competitive threats", "industry competition", "competitive advantage"],
        "Global Presence": ["international footprint", "global operations", "worldwide coverage"],
        "Industry Outlook":["sector growth","market trends","market evolution","industry trends"],
		"Regulations":["tax regulations","regulatory risks","governmental influence","government incentives", "state funding", "subsidies","political influence","legal disputes"],
        "Partnerships and Collaborations":["strategic alliances","partner relationships","joint venture"],
        "Supply Chain":["logistics optimization", "supply logistics", "supply chain strategies","supply constraints","inventory challenges","distribution channels", "supplier relationships","procurement"],
        "Economic Conditions": ["economic environment", "market economy", "macroeconomic factors","recession","expansion","inflationary impact","interest rate environment","foreign exchange impact"],
        "Demand":["increasing demand","decreasing demand","demand forecasting","consumer visits","store traffic"],
        "Customer": ["user interaction", "customer retention","customer loyalty","frequent buyer","user satisfaction","customer lifetime value (CLV)","per-visit spending","churn rate"]
    }
}

In [80]:
from adaptkeybert import KeyBERT
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Training

# Testing

In [81]:
# Initialize AdaptKeyBERT and SentenceTransformer
kw_model = KeyBERT()


# Flattening the keywords into a single list
seed_words = [
    argument 
    for category, arguments in keywords_dict.items() 
    for argument, keywords in arguments.items() 
    for keyword in keywords
]

sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

def classify_sentence(sentence, kw_model, keywords_dict,seed_words, threshold=0.65):
    keywords = kw_model.extract_keywords(sentence, top_n=5,seed_keywords=seed_words,nr_candidates=50,)
    extracted_keywords = [kw[0] for kw in keywords]

    #print(f"sentence: {sentence}, Extracted_Keywords: {extracted_keywords}, keywords: {keywords}")

    results = {}
    for category, arguments in keywords_dict.items():
        argument_scores = {}
        for argument, seed_words in arguments.items():
            seed_embeddings = sentence_transformer.encode(seed_words)
            keyword_embeddings = sentence_transformer.encode(extracted_keywords)
            similarities = cosine_similarity(keyword_embeddings, seed_embeddings)
            max_similarity = similarities.max() if similarities.size > 0 else 0
            if max_similarity >= threshold:
                argument_scores[argument] = max_similarity
        
        if argument_scores:
            results[category] = argument_scores
    
    return results if results else {"Uncategorized": "No relevant category found"}

"""
 "Eating vegetables is good for your health.",
    "This is a generic sentence with no clear category.",
    "The company was able to cut the costs for the whole year",
    "They acquired Monsato for a fair price which added more services to their portfolio",
    "Excellent results. Revenues of 2.39B ahead of cons. 2.35B while FY outlook gets a 100M boost. Billings 16 yy ahead of cons. 7 on a tough compare due to increasing but well understood seasonality 12 month target in the business. Management visibility sounds very good, supported by deferred revenue strength with total deferred plus backlog now standing at 14B. Operating cash flow of 1.2B was a record and 52 week range higher than all of F16. Operating margin expansion continues at a Dividend Yield Market Cap m balanced rate 125150 bps this FY, driving nice cash flow growth 2021 yy in F18 reaffirmed.",
    "SALESFORCE.COM, INC. CRM, Buy, 100.00 PT Current Price 87.75 Analyst Joel.Fishbein",
    "3. Gaining market share through supplychain dominance.",
    "As competitors grapple with chip shortages, Apples share of global smartphone sales breached 5year highs, hitting 21 in 4Q21.",
    "In the highend market, Samsung was hit by its Texas semiconductor fabrication plants monthlong power shutdown and Vietnam factory suspensions due to COVID19.",
    "Samsungs new Galaxy Notes launch in may be delayed as a result.",
    "Apple managed to mitigate shortages in 3QFY21, with the revenue impact coming in lower than its US34bn guidance.",
    "Apple holds buying power over suppliers through scale purchases and upfront cash payments, leaving competitors handling price shocks and shortages created by Apple deals.",
    "The customers really got to know the brand for multiple years now",
    "The company is buying back 20% of their own stocks",
    "Flat production with few data points unlikely to drive the stock higher in near term: Multi-well, pad-based drilling activity in the Wattenberg and limited completion activity in the Utica will likely result in a very back- end weighted production growth profile for PDCE.",
    "While the company's activity appears to support our 36% annual rate of estimated increase in production to 27.7 Mboepd for 2014, we only forecast sequential growth of 2% and 4%, respectively, for Q2:14 and Q3:14.",
    "While this production growth forecast is consistent with guidance, it likely creates some reticence with investors.",
    "Also, the limited number of completion- related data points probably turns off some of the company's more momentum-driven potential investors.",
    "Indicative of this lack of data points was the absence of specificity regarding PDCE's Utica Garvin pad.",
    "While all three Garvin wells are now tied into sales and Utica production scaled up 505 boepd sequentially in Q1:14, the lack of detail likely did not sit well with some.",
    "With three Palmer pad wells in the Utica set to spud this quarter, we expect the Utica to continue to be a growth driver going forward for PDC."
"""


# Example sentences
sentences = [
    "The latest advancements in AI are groundbreaking.",
    "Apples Revenue grew year-to-year by 25%",
    "Eating vegetables is good for your health.",
    "This is a generic sentence with no clear category.",
    "They increased the traffic within the stores by 10%",
    "The customer are more happy than they were before"
]


# Test classification
for sentence in sentences:
    categories = classify_sentence(sentence, kw_model, keywords_dict,seed_words)
    print(f"Sentence: {sentence}\nCategories: {categories}\n")


Sentence: The latest advancements in AI are groundbreaking.
Categories: {'Product': {'Innovation': 0.78166187, 'Technology Trends': 1.0000001}}

Sentence: Apples Revenue grew year-to-year by 25%
Categories: {'Financial Performance': {'Revenue': 0.79779124, 'Margins': 0.78512156}}

Sentence: Eating vegetables is good for your health.
Categories: {'Uncategorized': 'No relevant category found'}

Sentence: This is a generic sentence with no clear category.
Categories: {'Uncategorized': 'No relevant category found'}

Sentence: They increased the traffic within the stores by 10%
Categories: {'Market': {'Demand': 0.68890375}}

Sentence: The customer are more happy than they were before
Categories: {'Market': {'Customer': 0.7161225}}

