In [None]:
import nltk
import re
from PyPDF2 import PdfReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data used by the tokenizer, lemmatizer and stop-word list below.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# make word_tokenize load the tabular Punkt data instead of the pickled model;
# keeping both lets the notebook run on old and new NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)


def preprocess_text(text):
    """Tokenize ``text`` and return the lemmatized tokens that pass filtering.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only when ``str.isalnum()`` is true for it and it is not in the
    module-level ``stop_words`` set; each kept token is passed through the
    module-level ``lemmatizer``.

    Args:
        text: The string to process.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation and other tokens that are not purely alphanumeric.
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def contains_keywords(text, keywords):
    """Report whether ``text`` contains any of ``keywords`` as a whole phrase.

    Matching is case-insensitive and anchored on word boundaries, so
    "dominance" does not match inside "predominance". The words of a
    multi-word keyword may be separated by any run of whitespace, which lets
    a phrase match even when the text wraps it across a line break.

    Args:
        text: The string to search. ``None`` or an empty string never matches.
        keywords: Iterable of keyword strings. Keywords that are empty or
            contain only whitespace are ignored.

    Returns:
        True if at least one keyword is found, otherwise False.
    """
    if not text:
        return False
    for keyword in keywords:
        words = keyword.split()
        if not words:
            # A blank keyword would reduce to a bare word-boundary pattern
            # that matches almost any text, so skip it.
            continue
        pattern = r'\b' + r'\s+'.join(re.escape(word) for word in words) + r'\b'
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False

def extract_competitive_advantage_section(pdf_path, start_page, end_page):
    """Collect the text of the pages in a range that mention a keyword.

    Pages are numbered from 1 and the range is inclusive on both ends. The
    range is clamped to the pages that exist in the document: a start below 1
    is treated as 1 (instead of becoming a negative index that would read
    pages from the end), and an end past the last page stops at the last page
    (instead of raising ``IndexError``).

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.
        start_page: First page to inspect (1-based).
        end_page: Last page to inspect (1-based, inclusive).

    Returns:
        The extracted text of every page in the range for which
        ``contains_keywords`` finds one of the module-level ``keywords``,
        each followed by a newline and concatenated in page order. An empty
        string if no page matches or the range is empty.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    first_index = max(start_page, 1) - 1
    stop_index = min(end_page, total_pages)

    matching_pages = []
    for page_num in range(first_index, stop_index):
        # Fall back to "" so a page that yields no text is simply skipped.
        page_text = reader.pages[page_num].extract_text() or ""
        if contains_keywords(page_text, keywords):
            matching_pages.append(page_text + "\n")

    return "".join(matching_pages)

# Phrases whose presence marks a page as part of the competitive-advantage
# discussion; read by extract_competitive_advantage_section.
keywords = [
    "competitive advantage", "industry position", "market leadership", 
    "dominance", "strategic position", "edge over competitors", 
    "superiority in market", "advantage over peers"
]

# Shared resources read by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

pdf_path = "rhp.pdf"
start_page = int(input("Enter the start page: "))
end_page = int(input("Enter the end page: "))

# Reject an impossible range up front: a non-positive start or a reversed
# range would otherwise produce empty or wrong text without any error.
if start_page < 1 or end_page < start_page:
    raise ValueError(
        f"Invalid page range {start_page}-{end_page}: pages are numbered from 1 "
        "and the end page must not precede the start page."
    )

competitive_advantage_text = extract_competitive_advantage_section(pdf_path, start_page, end_page)
filtered_tokens = preprocess_text(competitive_advantage_text)


# Show only a preview so the cell output stays readable.
print("Extracted Competitive Advantage Text (First 1000 characters):")
print(competitive_advantage_text[:1000])

print("\nPreprocessed Tokens (First 50 tokens):")
print(filtered_tokens[:50])

In [None]:
import spacy

# Load SpaCy model and split the extracted text into sentences.
nlp = spacy.load("en_core_web_sm")
doc = nlp(competitive_advantage_text)

# Words and phrases that signal a comparative or superiority claim.
advantage_keywords = [
    "better", "more", "less", "greater", "largest", "stronger", "leading",
    "dominant", "innovative", "exclusive", "differentiated", "superior",
    "advanced", "unique", "competitive edge", "edge over", "advantage in",
    "compared to", "relative to", "versus", "market leader", "industry leader"
]

# One case-insensitive pattern for all keywords. Word boundaries stop "more"
# from matching "moreover" or "less" from matching "unless", and the words of
# a multi-word keyword may be separated by any whitespace so that a phrase
# wrapped across a line break is still found.
advantage_pattern = re.compile(
    r"\b(?:"
    + "|".join(
        r"\s+".join(re.escape(word) for word in keyword.split())
        for keyword in advantage_keywords
    )
    + r")\b",
    re.IGNORECASE,
)

# Keep every sentence that contains at least one advantage keyword.
comparative_phrases = [
    sentence.text
    for sentence in doc.sents
    if advantage_pattern.search(sentence.text)
]


print("Extracted Comparative Phrases:")
for i, phrase in enumerate(comparative_phrases[:20], 1):  # Display the first 20 for readability
    print(f"{i}. {phrase}")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# FinBERT 
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

# sentiment analysis pipeline
finbert_classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Sign applied to the classifier confidence for each label. Labels are
# compared in lower case because the checkpoint reports capitalised names
# ("Positive"/"Negative"/"Neutral"); a case-sensitive comparison against
# "positive"/"negative" would score every phrase as 0.
# NOTE(review): the LABEL_n entries follow the mapping given on the model card
# (0 = neutral, 1 = positive, 2 = negative) for configs without label names -
# confirm against the installed checkpoint.
label_signs = {
    "positive": 1.0,
    "negative": -1.0,
    "neutral": 0.0,
    "label_0": 0.0,
    "label_1": 1.0,
    "label_2": -1.0,
}

# Map each phrase to a signed polarity: +score, -score, or 0 for neutral.
comparative_sentiments = {}
for phrase in comparative_phrases:
    # Truncate to the model's position limit so an unusually long sentence
    # does not make the forward pass fail.
    sentiment = finbert_classifier(
        phrase,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    top_prediction = sentiment[0]
    # Unknown labels fall back to 0, the same as the neutral case.
    sign = label_signs.get(top_prediction["label"].lower(), 0.0)
    comparative_sentiments[phrase] = sign * top_prediction["score"]

print("Sentiment Scores (Improved):", comparative_sentiments)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Embed every comparative phrase as a dense vector.
embedder = SentenceTransformer("all-mpnet-base-v2")  # Upgraded model
embeddings = embedder.encode(comparative_phrases)

n_phrases = len(comparative_phrases)
if n_phrases == 0:
    raise ValueError("No comparative phrases were extracted, so there is nothing to cluster.")

# Choose the number of clusters by silhouette score. silhouette_score needs
# between 2 and n_samples - 1 distinct labels, so the upper bound is capped by
# the number of phrases instead of always testing up to 10.
silhouette_scores = []
potential_clusters = range(2, min(10, n_phrases - 1) + 1)  # Testing for 2 to 10 clusters
for k in potential_clusters:
    # n_init is set explicitly so results do not depend on the
    # version-specific default of scikit-learn.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)
    if len(set(labels)) < 2:
        # Degenerate fit (e.g. duplicate phrases collapsed into one cluster):
        # the silhouette is undefined, so rank this k last.
        score = -1.0
    else:
        score = silhouette_score(embeddings, labels)
    silhouette_scores.append(score)

if silhouette_scores:
    optimal_clusters = potential_clusters[int(np.argmax(silhouette_scores))]
else:
    # Fewer than three phrases: no valid k to evaluate, keep a single cluster.
    optimal_clusters = 1

# Perform Clustering
clustering_model = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
cluster_labels = clustering_model.fit_predict(embeddings)

# Organize phrases by the cluster they were assigned to.
clustered_phrases = {i: [] for i in range(optimal_clusters)}
for i, label in enumerate(cluster_labels):
    clustered_phrases[int(label)].append(comparative_phrases[i])

print(f"Optimal Number of Clusters: {optimal_clusters}")
print("Clustered Phrases:", clustered_phrases)

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the phrase embeddings onto two principal components for plotting.
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# One scatter series per cluster, drawn on an explicit Axes object so the
# figure carries its own title and axis labels.
fig, ax = plt.subplots(figsize=(10, 6))
for cluster_id in range(optimal_clusters):
    cluster_points = reduced_embeddings[cluster_labels == cluster_id]
    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster_id}")
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
ax.set_title("Comparative Phrases Clusters")
ax.legend()
plt.show()

In [None]:
import numpy as np

# Per-cluster sentiment: the mean polarity of the cluster's phrases, weighted
# by the absolute polarity so that confident predictions count for more.
cluster_scores = {}
for cluster, phrases in clustered_phrases.items():
    polarities = [comparative_sentiments[phrase] for phrase in phrases]
    weights = [abs(polarity) for polarity in polarities]
    total_weight = sum(weights)
    if total_weight > 0:
        weighted_scores = [polarity * weight for polarity, weight in zip(polarities, weights)]
        cluster_scores[cluster] = sum(weighted_scores) / total_weight
    else:
        # Empty cluster, or every phrase is neutral (polarity 0): dividing by
        # the zero total weight would raise ZeroDivisionError, so score it 0.
        cluster_scores[cluster] = 0

# Min-max normalize the cluster scores to the 0-1 range.
# NOTE(review): this is relative to the clusters themselves - the lowest
# cluster always maps to 0 and, when all clusters share one score, every
# cluster maps to 0 regardless of how positive that score is. Confirm this is
# the intended scoring.
min_score = min(cluster_scores.values())
max_score = max(cluster_scores.values())
normalized_scores = {
    cluster: (score - min_score) / (max_score - min_score) if max_score != min_score else 0
    for cluster, score in cluster_scores.items()
}


# Average the normalized cluster scores and express the result on a 0-100 scale.
competitive_advantage_score = sum(normalized_scores.values())
scaled_score = competitive_advantage_score * 100 / len(clustered_phrases)

print("Cluster Scores (Weighted & Normalized):", normalized_scores)
print("Competitive Advantage Score (Scaled to 0-100):", scaled_score)