In [1]:
!pip install -U sentence-transformers
!pip install faiss-cpu
!pip install tiktoken

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Using cached sentence_transformers

In [1]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import faiss
import requests
from bs4 import BeautifulSoup

# === Load Company Data ===
# NOTE(review): the CSV path is absolute and user-specific; the notebook only
# reproduces elsewhere through the sample-data fallback below.
try:
    df = pd.read_csv("/home/chebolu_srikanth/.keras/company_description.csv")
except FileNotFoundError:
    # Make the fallback visible: results computed on three sample companies
    # are otherwise indistinguishable from a real run.
    print("company_description.csv not found; falling back to built-in sample companies.")
    df = pd.DataFrame({
        'Ticker': ['AAPL', 'MSFT', 'GOOG'],
        'Description': [
            'Apple Inc. designs smartphones, computers, and accessories.',
            'Microsoft Corp. creates software, services, and devices worldwide.',
            'Alphabet Inc. offers online ads and related services globally.'
        ]
    })

# Parallel lists: position k in both lists is row k of the FAISS index built below.
tickers = df['Ticker'].tolist()
descriptions = df['Description'].tolist()

# === Setup Device and Models ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

bge_model = SentenceTransformer("BAAI/bge-large-en-v1.5", device=device)
bge_model.encode("warmup")

# Embeddings are L2-normalised, so the inner-product index ranks by cosine similarity.
company_embeddings = bge_model.encode(descriptions, convert_to_tensor=True, normalize_embeddings=True, device=device)
faiss_index = faiss.IndexFlatIP(company_embeddings.shape[1])
faiss_index.add(company_embeddings.cpu().numpy())

# Both classifiers are optional: on a load failure the pipeline continues
# without that stage, but the reason is reported instead of being swallowed.
# NOTE(review): confirm "microsoft/deberta-v3-large" ships a fine-tuned
# sequence-classification head; if it does not, RelevanceScore comes from
# untrained weights.
try:
    relevance_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
    relevance_model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-large").to(device).eval()
except Exception as exc:
    print(f"Relevance model unavailable; relevance filtering is disabled: {exc!r}")
    relevance_model = None
    relevance_tokenizer = None

try:
    sentiment_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    sentiment_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device).eval()
    sentiment_labels = sentiment_model.config.id2label
except Exception as exc:
    print(f"Sentiment model unavailable; sentiment tagging is disabled: {exc!r}")
    sentiment_model = None
    sentiment_tokenizer = None
    sentiment_labels = None

# === Scrape News from Moneycontrol ===
def scrape_moneycontrol_articles(timeout=10):
    """Scrape the first page of Moneycontrol's business news listing.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of dicts, one per ``<li class="clearfix">`` item that has an
        ``<h2>`` headline, with keys ``headline``, ``summary`` (``None`` when
        the item has no ``<p>``), ``link`` (``None`` when the item has no
        ``<a href>``) and ``text`` (headline, plus the summary when present).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx response (via ``raise_for_status``).
    """
    url = "https://www.moneycontrol.com/news/business/"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    articles = soup.find_all("li", class_="clearfix")

    scraped = []
    for article in articles:
        headline_tag = article.find("h2")
        # Items without a headline are skipped.
        if headline_tag is None:
            continue
        headline = headline_tag.get_text(strip=True)
        if not headline:
            continue

        link_tag = article.find("a", href=True)
        link = link_tag['href'] if link_tag else None

        summary_tag = article.find("p")
        summary = summary_tag.get_text(strip=True) if summary_tag else None

        # Text fed to the embedding/classifier models downstream.
        combined_text = f"{headline}. {summary}" if summary else headline
        scraped.append({"headline": headline, "summary": summary, "link": link, "text": combined_text})

    return scraped

# === Match Companies ===
def find_relevant_companies_multiple(articles, top_k=5, relevance_threshold=0.6):
    """Match each scraped article to its most similar company descriptions.

    Articles are embedded with ``bge_model`` and looked up in ``faiss_index``.
    Each retrieved company is optionally filtered by the relevance classifier
    and tagged with the sentiment classifier, when those models were loaded.

    Args:
        articles: List of dicts with keys ``headline``, ``summary``, ``link``
            and ``text``.
        top_k: Number of nearest companies retrieved per article.
        relevance_threshold: Matches whose relevance score is below this value
            are dropped (only applied when the relevance model is available).

    Returns:
        A list with one dict per article: ``Headline``, ``Summary``, ``Link``
        and ``Matches``. Each match has ``Ticker`` and ``RetrievalScore``, plus
        ``RelevanceScore`` / ``Sentiment`` when the respective model is loaded.
        An empty ``articles`` input returns ``[]``.
    """
    # Nothing to embed: return early instead of sending an empty batch to the
    # encoder and the FAISS search.
    if not articles:
        return []

    texts = [a["text"] for a in articles]
    article_embeddings = bge_model.encode(texts, convert_to_tensor=True, normalize_embeddings=True, device=device)
    D, I = faiss_index.search(article_embeddings.cpu().numpy(), top_k)

    use_relevance = relevance_model is not None and relevance_tokenizer is not None
    use_sentiment = sentiment_model is not None and sentiment_tokenizer is not None

    results = []
    for i, article in enumerate(articles):
        article_result = {
            "Headline": article["headline"],
            "Summary": article["summary"],
            "Link": article["link"],
            "Matches": []
        }
        # Sentiment depends only on the article text, so it is computed at most
        # once per article (lazily, on the first match that survives filtering).
        sentiment = None

        for j, idx in enumerate(I[i]):
            # FAISS reports -1 for missing neighbours (e.g. fewer than top_k
            # vectors in the index).
            if idx == -1:
                continue

            result = {
                "Ticker": tickers[idx],
                "RetrievalScore": round(float(D[i][j]), 3)
            }

            if use_relevance:
                inputs = relevance_tokenizer(article["text"], descriptions[idx], return_tensors="pt", truncation=True, padding=True, max_length=512)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                with torch.no_grad():
                    logits = relevance_model(**inputs).logits
                prob = torch.softmax(logits, dim=1)[0]
                # Binary head: probability of label 1; otherwise the top class probability.
                if relevance_model.config.num_labels == 2:
                    relevance_score = prob[1].item()
                else:
                    relevance_score = prob.max().item()
                result["RelevanceScore"] = round(relevance_score, 3)
                if relevance_score < relevance_threshold:
                    continue

            if use_sentiment:
                if sentiment is None:
                    inputs = sentiment_tokenizer(article["text"], return_tensors="pt", truncation=True, padding=True, max_length=512)
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    with torch.no_grad():
                        logits = sentiment_model(**inputs).logits
                    sentiment = sentiment_labels[torch.argmax(logits).item()]
                result["Sentiment"] = sentiment

            article_result["Matches"].append(result)
        results.append(article_result)

    return results

# === Run and Save to CSV ===
if __name__ == "__main__":
    scraped_articles = scrape_moneycontrol_articles()
    matches = find_relevant_companies_multiple(scraped_articles, top_k=3)

    # Flatten to one CSV row per (article, matched company). An article with
    # no surviving match still yields a single row whose match fields are None.
    match_fields = ("Ticker", "RetrievalScore", "RelevanceScore", "Sentiment")
    output_data = []
    for article in matches:
        base = {key: article[key] for key in ("Headline", "Summary", "Link")}
        # An empty dict stands in for "no match": every .get() below returns None.
        for match in article["Matches"] or [{}]:
            output_data.append({**base, **{field: match.get(field) for field in match_fields}})

    df_output = pd.DataFrame(output_data)
    df_output.to_csv("moneycontrol_article_matches.csv", index=False)
    print("Results saved to moneycontrol_article_matches.csv")

Using device: cuda
Results saved to moneycontrol_article_matches.csv


In [1]:
import pandas as pd
import torch
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import faiss
import time

# === Step 1: Load Company Descriptions ===
# NOTE(review): the CSV path is absolute and user-specific; the notebook only
# reproduces elsewhere through the sample-data fallback below.
try:
    df = pd.read_csv("/home/chebolu_srikanth/.keras/csv_files/company_description.csv")
except FileNotFoundError:
    # Make the fallback visible: results computed on three sample companies
    # are otherwise indistinguishable from a real run.
    print("company_description.csv not found; falling back to built-in sample companies.")
    df = pd.DataFrame({
        'Ticker': ['AAPL', 'MSFT', 'GOOG'],
        'Description': [
            'Apple Inc. designs smartphones, computers, and accessories.',
            'Microsoft Corp. creates software, services, and devices worldwide.',
            'Alphabet Inc. offers online ads and related services globally.'
        ]
    })

# Parallel lists: position k in both lists is row k of the FAISS index built below.
tickers = df['Ticker'].tolist()
descriptions = df['Description'].tolist()

# === Step 2: Setup Device and Models ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

bge_model = SentenceTransformer("BAAI/bge-large-en-v1.5", device=device)
bge_model.encode("warmup")

# Embeddings are L2-normalised, so the inner-product index ranks by cosine similarity.
company_embeddings = bge_model.encode(descriptions, convert_to_tensor=True, normalize_embeddings=True, device=device)

faiss_index = faiss.IndexFlatIP(company_embeddings.shape[1])
faiss_index.add(company_embeddings.cpu().numpy())

# Load DeBERTa
# Optional stage: on failure the pipeline runs without relevance filtering,
# but the reason is reported instead of being swallowed.
# NOTE(review): confirm "microsoft/deberta-v3-large" ships a fine-tuned
# sequence-classification head; if it does not, RelevanceScore comes from
# untrained weights.
try:
    relevance_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
    relevance_model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-large").to(device).eval()
except Exception as exc:
    print(f"Relevance model unavailable; relevance filtering is disabled: {exc!r}")
    relevance_model = None
    relevance_tokenizer = None

# Load FinBERT
# Optional stage: on failure the pipeline runs without sentiment tagging.
try:
    sentiment_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    sentiment_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device).eval()
    sentiment_labels = sentiment_model.config.id2label
except Exception as exc:
    print(f"Sentiment model unavailable; sentiment tagging is disabled: {exc!r}")
    sentiment_model = None
    sentiment_tokenizer = None
    sentiment_labels = None

# === Step 3: Scrape News Articles ===
def scrape_all_moneycontrol_business_news(pages=20, delay=1):
    """Scrape Moneycontrol's paginated business news listing.

    Pages ``1..pages`` are fetched in order. Scraping stops early at the first
    page that cannot be downloaded or that contains no
    ``<li class="clearfix">`` items.

    Args:
        pages: Maximum number of listing pages to fetch.
        delay: Seconds to sleep after each parsed page.

    Returns:
        pandas.DataFrame with columns ``Headline``, ``Summary``, ``Link`` and
        ``CombinedText``, one row per item that has an ``<h2>`` headline. The
        columns exist even when nothing was scraped, so callers can index
        ``CombinedText`` safely.
    """
    columns = ["Headline", "Summary", "Link", "CombinedText"]
    base_url = "https://www.moneycontrol.com/news/business/page-{}"
    all_articles = []

    for page_num in range(1, pages + 1):
        url = base_url.format(page_num)
        print(f"Scraping page {page_num}...")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to retrieve page {page_num}: {e}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("li", class_="clearfix")

        if not articles:
            print(f"No articles found on page {page_num}. Stopping.")
            break

        for article in articles:
            headline_tag = article.find("h2")
            link_tag = article.find("a", href=True)
            summary_tag = article.find("p")

            headline = headline_tag.get_text(strip=True) if headline_tag else None
            link = link_tag['href'] if link_tag else None
            summary = summary_tag.get_text(strip=True) if summary_tag else ""

            if headline:
                # Avoid a dangling ". " separator when the item has no summary.
                text = f"{headline}. {summary}" if summary else headline
                all_articles.append({
                    "Headline": headline,
                    "Summary": summary,
                    "Link": link,
                    "CombinedText": text
                })

        # Pause between page requests.
        time.sleep(delay)

    return pd.DataFrame(all_articles, columns=columns)

# === Step 4: Match Articles to Companies ===
def find_relevant_companies_multiple(articles_df, top_k=5, relevance_threshold=0.6):
    """Match each scraped article to its most similar company descriptions.

    Articles are embedded with ``bge_model`` and looked up in ``faiss_index``.
    Each retrieved company is optionally filtered by the relevance classifier
    and tagged with the sentiment classifier, when those models were loaded.

    Args:
        articles_df: DataFrame with columns ``Headline``, ``Summary``, ``Link``
            and ``CombinedText``.
        top_k: Number of nearest companies retrieved per article.
        relevance_threshold: Matches whose relevance score is below this value
            are dropped (only applied when the relevance model is available).

    Returns:
        A list with one dict per article: ``Headline``, ``Summary``, ``Link``
        and ``Matches``. Each match has ``Ticker`` and ``RetrievalScore``, plus
        ``RelevanceScore`` / ``Sentiment`` when the respective model is loaded.
        An empty frame returns ``[]``.
    """
    # A scrape that found nothing can yield a frame with no rows (and possibly
    # no 'CombinedText' column); there is nothing to embed or search.
    if articles_df.empty:
        return []

    article_embeddings = bge_model.encode(
        articles_df['CombinedText'].tolist(),
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=device
    )
    D, I = faiss_index.search(article_embeddings.cpu().numpy(), top_k)

    use_relevance = relevance_model is not None and relevance_tokenizer is not None
    use_sentiment = sentiment_model is not None and sentiment_tokenizer is not None

    results = []
    # Rows of D/I follow the order of the encoded list, so index them by
    # position rather than by the DataFrame's index labels.
    for pos, (_, row) in enumerate(articles_df.iterrows()):
        text = row['CombinedText']
        matches = []
        # Sentiment depends only on the article text, so it is computed at most
        # once per article (lazily, on the first match that survives filtering).
        sentiment = None

        for j, idx in enumerate(I[pos]):
            # FAISS reports -1 for missing neighbours (e.g. fewer than top_k
            # vectors in the index).
            if idx == -1:
                continue

            result = {
                "Ticker": tickers[idx],
                "RetrievalScore": round(float(D[pos][j]), 3)
            }

            if use_relevance:
                inputs = relevance_tokenizer(text, descriptions[idx], return_tensors="pt", truncation=True, padding=True, max_length=512)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                with torch.no_grad():
                    logits = relevance_model(**inputs).logits
                prob = torch.softmax(logits, dim=1)[0]
                # Binary head: probability of label 1; otherwise the top class probability.
                if relevance_model.config.num_labels == 2:
                    relevance_score = prob[1].item()
                else:
                    relevance_score = prob.max().item()
                result["RelevanceScore"] = round(relevance_score, 3)
                if relevance_score < relevance_threshold:
                    continue

            if use_sentiment:
                if sentiment is None:
                    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    with torch.no_grad():
                        logits = sentiment_model(**inputs).logits
                    sentiment = sentiment_labels[torch.argmax(logits).item()]
                result["Sentiment"] = sentiment

            matches.append(result)

        results.append({
            "Headline": row["Headline"],
            "Summary": row["Summary"],
            "Link": row["Link"],
            "Matches": matches
        })

    return results

# === Step 5: Run Everything ===
if __name__ == "__main__":
    df_articles = scrape_all_moneycontrol_business_news(pages=20)
    results = find_relevant_companies_multiple(df_articles)

    # Expand every (article, match) pair into one flat record; articles whose
    # match list is empty contribute no rows.
    output_rows = [
        {
            "Headline": res["Headline"],
            "Summary": res["Summary"],
            "Link": res["Link"],
            **match,
        }
        for res in results
        for match in res["Matches"]
    ]

    output_df = pd.DataFrame(output_rows)
    output_df.to_csv("matched_moneycontrol_news.csv", index=False)
    print(f"Saved {len(output_df)} matched articles to 'matched_moneycontrol_news.csv'")

Using device: cuda
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Saved 2500 matched articles to 'matched_moneycontrol_news.csv'


In [2]:
import pandas as pd
import torch
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import faiss
import time

# === Step 1: Load Company Descriptions ===
# NOTE(review): the CSV path is absolute and user-specific; the notebook only
# reproduces elsewhere through the sample-data fallback below.
try:
    df = pd.read_csv("/home/chebolu_srikanth/.keras/company_description.csv")
except FileNotFoundError:
    # Make the fallback visible: results computed on three sample companies
    # are otherwise indistinguishable from a real run.
    print("company_description.csv not found; falling back to built-in sample companies.")
    df = pd.DataFrame({
        'Ticker': ['AAPL', 'MSFT', 'GOOG'],
        'Description': [
            'Apple Inc. designs smartphones, computers, and accessories.',
            'Microsoft Corp. creates software, services, and devices worldwide.',
            'Alphabet Inc. offers online ads and related services globally.'
        ]
    })

# Parallel lists: position k in both lists is row k of the FAISS index built below.
tickers = df['Ticker'].tolist()
descriptions = df['Description'].tolist()

# === Step 2: Setup Device and Models ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

bge_model = SentenceTransformer("BAAI/bge-large-en-v1.5", device=device)
bge_model.encode("warmup")

# Embeddings are L2-normalised, so the inner-product index ranks by cosine similarity.
company_embeddings = bge_model.encode(descriptions, convert_to_tensor=True, normalize_embeddings=True, device=device)

faiss_index = faiss.IndexFlatIP(company_embeddings.shape[1])
faiss_index.add(company_embeddings.cpu().numpy())

# Load DeBERTa
# Optional stage: on failure the pipeline runs without relevance filtering,
# but the reason is reported instead of being swallowed.
# NOTE(review): confirm "microsoft/deberta-v3-large" ships a fine-tuned
# sequence-classification head; if it does not, RelevanceScore comes from
# untrained weights.
try:
    relevance_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
    relevance_model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-large").to(device).eval()
except Exception as exc:
    print(f"Relevance model unavailable; relevance filtering is disabled: {exc!r}")
    relevance_model = None
    relevance_tokenizer = None

# Load FinBERT
# Optional stage: on failure the pipeline runs without sentiment tagging.
try:
    sentiment_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    sentiment_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device).eval()
    sentiment_labels = sentiment_model.config.id2label
except Exception as exc:
    print(f"Sentiment model unavailable; sentiment tagging is disabled: {exc!r}")
    sentiment_model = None
    sentiment_tokenizer = None
    sentiment_labels = None

# === Step 3: Scrape News Articles ===
def scrape_all_moneycontrol_business_news(pages=20, delay=1):
    """Scrape Moneycontrol's paginated business news listing, including dates.

    Pages ``1..pages`` are fetched in order. Scraping stops early at the first
    page that cannot be downloaded or that contains no
    ``<li class="clearfix">`` items.

    Args:
        pages: Maximum number of listing pages to fetch.
        delay: Seconds to sleep after each parsed page.

    Returns:
        pandas.DataFrame with columns ``Headline``, ``Summary``, ``Link``,
        ``Date`` and ``CombinedText``, one row per item that has an ``<h2>``
        headline. ``Date`` is the text of the item's ``<span class="dateline">``
        or ``""`` when absent. The columns exist even when nothing was scraped,
        so callers can index ``CombinedText`` safely.
    """
    columns = ["Headline", "Summary", "Link", "Date", "CombinedText"]
    base_url = "https://www.moneycontrol.com/news/business/page-{}"
    all_articles = []

    for page_num in range(1, pages + 1):
        url = base_url.format(page_num)
        print(f"Scraping page {page_num}...")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to retrieve page {page_num}: {e}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("li", class_="clearfix")

        if not articles:
            print(f"No articles found on page {page_num}. Stopping.")
            break

        for article in articles:
            headline_tag = article.find("h2")
            link_tag = article.find("a", href=True)
            summary_tag = article.find("p")
            date_tag = article.find("span", class_="dateline")

            headline = headline_tag.get_text(strip=True) if headline_tag else None
            link = link_tag['href'] if link_tag else None
            summary = summary_tag.get_text(strip=True) if summary_tag else ""
            date = date_tag.get_text(strip=True) if date_tag else ""

            if headline:
                # Avoid a dangling ". " separator when the item has no summary.
                text = f"{headline}. {summary}" if summary else headline
                all_articles.append({
                    "Headline": headline,
                    "Summary": summary,
                    "Link": link,
                    "Date": date,
                    "CombinedText": text
                })

        # Pause between page requests.
        time.sleep(delay)

    return pd.DataFrame(all_articles, columns=columns)

# === Step 4: Match Articles to Companies ===
def find_relevant_companies_multiple(articles_df, top_k=5, relevance_threshold=0.6):
    """Match each scraped article to its most similar company descriptions.

    Articles are embedded with ``bge_model`` and looked up in ``faiss_index``.
    Each retrieved company is optionally filtered by the relevance classifier
    and tagged with the sentiment classifier, when those models were loaded.

    Args:
        articles_df: DataFrame with columns ``Headline``, ``Summary``, ``Link``,
            ``Date`` and ``CombinedText``.
        top_k: Number of nearest companies retrieved per article.
        relevance_threshold: Matches whose relevance score is below this value
            are dropped (only applied when the relevance model is available).

    Returns:
        A list with one dict per article: ``Headline``, ``Summary``, ``Link``,
        ``Date`` and ``Matches``. Each match has ``Ticker`` and
        ``RetrievalScore``, plus ``RelevanceScore`` / ``Sentiment`` when the
        respective model is loaded. An empty frame returns ``[]``.
    """
    # A scrape that found nothing can yield a frame with no rows (and possibly
    # no 'CombinedText' column); there is nothing to embed or search.
    if articles_df.empty:
        return []

    article_embeddings = bge_model.encode(
        articles_df['CombinedText'].tolist(),
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=device
    )
    D, I = faiss_index.search(article_embeddings.cpu().numpy(), top_k)

    use_relevance = relevance_model is not None and relevance_tokenizer is not None
    use_sentiment = sentiment_model is not None and sentiment_tokenizer is not None

    results = []
    # Rows of D/I follow the order of the encoded list, so index them by
    # position rather than by the DataFrame's index labels.
    for pos, (_, row) in enumerate(articles_df.iterrows()):
        text = row['CombinedText']
        matches = []
        # Sentiment depends only on the article text, so it is computed at most
        # once per article (lazily, on the first match that survives filtering).
        sentiment = None

        for j, idx in enumerate(I[pos]):
            # FAISS reports -1 for missing neighbours (e.g. fewer than top_k
            # vectors in the index).
            if idx == -1:
                continue

            result = {
                "Ticker": tickers[idx],
                "RetrievalScore": round(float(D[pos][j]), 3)
            }

            if use_relevance:
                inputs = relevance_tokenizer(text, descriptions[idx], return_tensors="pt", truncation=True, padding=True, max_length=512)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                with torch.no_grad():
                    logits = relevance_model(**inputs).logits
                prob = torch.softmax(logits, dim=1)[0]
                # Binary head: probability of label 1; otherwise the top class probability.
                if relevance_model.config.num_labels == 2:
                    relevance_score = prob[1].item()
                else:
                    relevance_score = prob.max().item()
                result["RelevanceScore"] = round(relevance_score, 3)
                if relevance_score < relevance_threshold:
                    continue

            if use_sentiment:
                if sentiment is None:
                    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    with torch.no_grad():
                        logits = sentiment_model(**inputs).logits
                    sentiment = sentiment_labels[torch.argmax(logits).item()]
                result["Sentiment"] = sentiment

            matches.append(result)

        results.append({
            "Headline": row["Headline"],
            "Summary": row["Summary"],
            "Link": row["Link"],
            "Date": row["Date"],
            "Matches": matches
        })

    return results

# === Step 5: Run Everything ===
if __name__ == "__main__":
    df_articles = scrape_all_moneycontrol_business_news(pages=20)
    results = find_relevant_companies_multiple(df_articles)

    # Expand every (article, match) pair into one flat record, date first;
    # articles whose match list is empty contribute no rows.
    output_rows = [
        {
            "Date": res["Date"],
            "Headline": res["Headline"],
            "Summary": res["Summary"],
            "Link": res["Link"],
            **match,
        }
        for res in results
        for match in res["Matches"]
    ]

    output_df = pd.DataFrame(output_rows)
    output_df.to_csv("matched_moneycontrol_news.csv", index=False)
    print(f"Saved {len(output_df)} matched articles to 'matched_moneycontrol_news.csv'")

Using device: cuda
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...


KeyboardInterrupt: 