In [4]:
!pip install spacy
!python -m spacy download en_core_web_sm

import re

import pandas as pd
import spacy

# Read the combined news workbook into a DataFrame
data = pd.read_excel(io='combined_news_final_sorted.xlsx')

# Instantiate spaCy's pre-trained small English pipeline (used here for NER)
nlp = spacy.load(name="en_core_web_sm")

# Entity labels that extract_relevant_entities keeps; everything else is dropped.
RELEVANT_ENTITY_LABELS = frozenset({'ORG', 'PERSON', 'MONEY', 'GPE', 'PRODUCT'})


def extract_relevant_entities(text):
    """Return the texts of the relevant named entities found in ``text``.

    Runs the module-level spaCy pipeline ``nlp`` over ``text`` and keeps only
    entities whose label is in ``RELEVANT_ENTITY_LABELS``.

    Args:
        text: The string to analyse. Non-string values (e.g. ``None`` or the
            NaN pandas produces for empty spreadsheet cells) and blank strings
            are treated as containing no entities.

    Returns:
        list[str]: Entity texts in the order spaCy reports them; empty when
        ``text`` is missing, blank, or contains no relevant entities.
    """
    # spaCy rejects non-string input, so a single missing cell would otherwise
    # abort a whole `Series.apply` run. Blank strings yield no entities anyway.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in RELEVANT_ENTITY_LABELS]

# Run NER over both text columns, storing each row's entity list in a new column
entity_columns = {'headline': 'headline_entities', 'content': 'content_entities'}
for source_column, target_column in entity_columns.items():
    data[target_column] = data[source_column].apply(extract_relevant_entities)

# Keywords used to decide whether an article's extracted entities are relevant.
# They are grouped by theme here; `relevant_keywords` is the flat list, with the
# groups (and the keywords inside each group) kept in their listed order.
keyword_groups = {
    "Reliance-Specific": [
        "Reliance", "RIL", "Mukesh Ambani", "Nita Ambani",
        "Reliance Jio", "Jio Platforms", "JioMart",
        "Reliance Retail", "Reliance Digital",
        "Reliance Petroleum", "Reliance AGM", "Reliance O2C",
        "Reliance Green Energy", "Jio Financial Services",
    ],
    "Stock Market Terms": [
        "Nifty 50", "Sensex", "BSE", "NSE", "Nifty Bank",
        "Nifty IT", "Nifty Pharma", "Nifty FMCG", "Nifty Metal",
        "F&O", "SIP", "Mutual Funds", "IPO", "Dividend",
    ],
    "Economic Indicators": [
        "GDP India", "Inflation India", "RBI Policy",
        "GST India", "FDI India", "Trade Deficit", "Rupee",
        "Current Account Deficit", "Fiscal Policy",
    ],
    "Sector Keywords": [
        "Telecom Sector", "5G Rollout", "Green Hydrogen",
        "Oil Refining", "Polyester", "Petrochemicals",
        "Organized Retail", "E-commerce Growth",
    ],
    "Competitors/Comparisons": [
        "Tata Group", "Adani Group", "Airtel", "Vodafone Idea",
        "Amazon India", "Flipkart", "DMart", "Nykaa",
    ],
    "Hashtags/Abbreviations": [
        "#RIL", "#Nifty50", "#Sensex", "#BSE", "#NSE",
        "$RELIANCE.NS", "$NSEI", "$BSESN",
    ],
}

relevant_keywords = [
    keyword
    for group in keyword_groups.values()
    for keyword in group
]

# Keep articles whose extracted entities *contain* a relevant keyword.
# Testing `keyword in entity_list` only succeeds when an entity equals a keyword
# exactly, so an entity such as "Reliance Industries" would never match
# "Reliance". Instead, search each entity text for any keyword as a whole
# word/phrase. Matching stays case-sensitive, and every exact match still matches.
# Lookarounds are used rather than `\b` because some keywords begin with '#' or
# '$', where `\b` does not mark a boundary.
_keyword_pattern = re.compile(
    r'(?<!\w)(?:' + '|'.join(re.escape(k) for k in relevant_keywords) + r')(?!\w)'
)


def _mentions_keyword(entities):
    """Return True if any entity text in ``entities`` contains a relevant keyword."""
    return any(_keyword_pattern.search(entity) for entity in entities)


keyword_mask = (
    data['headline_entities'].apply(_mentions_keyword)
    | data['content_entities'].apply(_mentions_keyword)
)
filtered_data = data[keyword_mask]

# Persist the matching articles as CSV (row index omitted), then confirm on stdout
output_csv = 'ner_filtered_articles.csv'
filtered_data.to_csv(path_or_buf=output_csv, index=False)
print("NER-based filtered dataset saved as 'ner_filtered_articles.csv'.")

Collecting spacy
  Downloading spacy-3.8.5-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp311-cp311-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp311-cp311-win_amd64

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
anaconda-cloud-auth 0.1.4 requires pydantic<2.0, but you have pydantic 2.11.2 which is incompatible.
astropy 5.3.4 requires numpy<2,>=1.21, but you have numpy 2.2.4 which is incompatible.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.2.4 which is incompatible.
matplotlib 3.8.0 requires numpy<2,>=1.21, but you have numpy 2.2.4 which is incompatible.
numba 0.59.0 requires numpy<1.27,>=1.22, but you have numpy 2.2.4 which is incompatible.
pandas 2.1.4 requires numpy<2,>=1.23.2; python_version == "3.11", but you have numpy 2.2.4 which is incompatible.
pywavelets 1.5.0 requires numpy<2.0,>=1.22.4, but you have numpy 2.2.4 which is incompatible.
scipy 1.11.4 requires numpy<1.28.0,>=1.21.6, but you have numpy 2.2.4 which is incompatible.
stre


   ------------------------------------- -- 5.0/5.4 MB 40.9 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.8 kB/s eta 0:00:10
   ------------------------------------- -- 5.0/5.4 MB 40.9 kB/s eta 0:00:09
   ------------------------------------- -- 5.0/5.4 MB 40.9 kB/s eta 0:00:09
   ------------------------------------- -- 5.0/5.4 MB 40.9 kB/s eta 0:00:0

Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 148, in _get_module_details
  File "<frozen runpy>", line 112, in _get_module_details
  File "D:\anaconda\Lib\site-packages\spacy\__init__.py", line 6, in <module>
  File "D:\anaconda\Lib\site-packages\spacy\errors.py", line 3, in <module>
    from .compat import Literal
  File "D:\anaconda\Lib\site-packages\spacy\compat.py", line 4, in <module>
    from thinc.util import copy_array
  File "D:\anaconda\Lib\site-packages\thinc\__init__.py", line 5, in <module>
    from .config import registry
  File "D:\anaconda\Lib\site-packages\thinc\config.py", line 5, in <module>
    from .types import Decorator
  File "D:\anaconda\Lib\site-packages\thinc\types.py", line 27, in <module>
    from .compat import cupy, has_cupy
  File "D:\anaconda\Lib\site-packages\thinc\compat.py", line 99, in <module>
    import h5py
  File "D:\anaconda\Lib\site-packages\h5py\__init__.py", line 2

ImportError: cannot import name 'TypeIs' from 'typing_extensions' (D:\anaconda\Lib\site-packages\typing_extensions.py)