In [1]:
import re

# Combine unigrams, bigrams, and trigrams
keywords = [
    "rupee", "nse", "bse", "sebi", "rbi", "gst", "demonetisation", "fdi",
    "sensex", "nifty", "mutual fund", "equity", "debt", "ipo", "shares",
    "stock", "investment", "tax", "inflation", "recession", "gdp",
    "dividend", "portfolio", "stock market", "interest rate",
    "foreign investment", "economic growth", "fiscal policy",
    "monetary policy", "public sector", "private sector", "non-performing",
    "credit rating", "mutual funds", "fixed deposit", "real estate",
    "reserve bank of india", "goods and services tax", "non performing assets",
    "foreign direct investment", "gross domestic product",
    "initial public offering", "public provident fund", "employee provident fund"
]

# Precompile the pattern for efficiency
pattern = re.compile(r'\b(' + '|'.join(keywords) + r')\b', re.IGNORECASE)

def is_financial_article(article):
    """
    Classify if the input article is a financial article based on keywords.
    
    Parameters:
        article (str): The article text.
    
    Returns:
        bool: True if it's a financial article, False otherwise.
    """
    # Find all occurrences of the keywords in the article
    matches = re.findall(pattern, article)
    
    # Classify as financial if at least one keyword is found
    return len(matches) > 0

# Example usage:
article = "The Reserve Bank of India announced a change in the monetary policy..."
print(is_financial_article(article))  # Output: True

True


In [2]:
data_folder = '../../data/articles/'

In [3]:
import os
import json
from tqdm.notebook import tqdm
import pandas as pd

In [4]:
all_articles = os.listdir(data_folder)
is_fin = []
for article in tqdm(all_articles):
    with open(data_folder + article, 'r') as f:
        content = json.load(f)
    pred = is_financial_article(content['cleaned_text'])
    is_fin.append((article.split('.')[0],pred))

  0%|          | 0/8390 [00:00<?, ?it/s]

In [5]:
fin_df = pd.DataFrame(is_fin, columns = ['article_id','fin'])

In [6]:
def get_article(article_id):
    with open(data_folder + article_id + '.json', 'r') as f:
        content = json.load(f)
    return content['title']

In [7]:
fin_df['title'] = fin_df['article_id'].apply(lambda x: get_article(x))

In [8]:
fin_df.groupby('fin')['article_id'].count()

fin
False    7039
True     1351
Name: article_id, dtype: int64

In [16]:
1351/8300

0.1627710843373494

In [15]:
fin_df[fin_df.fin==False].sample(10)

Unnamed: 0,article_id,fin,title
1795,6513935d395f46458f8bcb50,False,Congress govt in Karnataka acted under DMK pre...
2191,6513a96a395f46458f8c007b,False,Salman Khan death threat: Cops arrest a man fr...
3748,6513a51c395f46458f8bf50c,False,There's been a mistake somewhere: Farooq Abdul...
1157,65138de9395f46458f8bc370,False,India gears up for Ganesh Chaturthi 2023: Beng...
2228,65139284395f46458f8bca20,False,'Uri' screened in Manipur; Vicky Kaushal-starr...
7926,65139a0c395f46458f8bd59f,False,Canada’s open work permit could become a perma...
8305,6513a1fe395f46458f8bec39,False,'The Little Mermaid' review: This under the se...
6780,65139d95395f46458f8bdf60,False,Asish Saha appointed as Tripura Pradesh Congre...
5041,651395bb395f46458f8bcf2f,False,We are fighting for our identity that is in da...
8338,6513a29d395f46458f8bee18,False,'Paatal Lok' star Abhishek Banerjee joins the ...
