<a href="https://colab.research.google.com/github/Prashant27203/Query-expansion-System-/blob/main/bm25_query_expansion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install rank_bm25
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [3]:
# Read the news articles from the CSV file into a DataFrame
news_df = pd.read_csv(filepath_or_buffer='BBCNews.csv')

In [5]:
# Keyword table: base news category -> terms that identify that category.
# The dictionary keys double as the category names returned by the lookup.
# NOTE(review): tags are run through the lemmatizer before they are looked up
# here, so plural entries such as 'sports' or 'elections' presumably never
# match a lemmatized tag ('sport', 'election') - TODO confirm against the data.
synonyms = dict(
    sports=[
        'sports', 'athletics', 'games', 'competitions', 'matches',
        'cricket', 'football', 'soccer', 'hockey',
    ],
    politics=[
        'politics', 'government', 'elections', 'democracy',
        'parliament', 'congress', 'senate', 'leadership',
    ],
    entertainment=[
        'entertainment', 'celebrities', 'movies', 'films',
        'television', 'music', 'arts', 'culture',
    ],
    technology=[
        'technology', 'science', 'innovation', 'computers',
        'gadgets', 'internet', 'artificial intelligence', 'cybersecurity',
    ],
    business=[
        'business', 'economy', 'finance', 'markets',
        'investment', 'commerce', 'trade', 'industry',
    ],
)

In [6]:
# Downloading necessary NLTK resources.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on recent
# NLTK releases; 'punkt' is kept so that older releases keep working too.
for resource in ('omw-1.4', 'punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
# Preprocessing the text columns of the news data
lemmatizer = WordNetLemmatizer()

# Filled on first use so the stop-word list is read from NLTK only once
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphabetic are dropped, the rest are lower-cased, English stop
    words are removed, and every remaining token is lemmatized.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    # Looked up once per call instead of once per token, and as a set so the
    # membership test below is a hash lookup rather than a list scan.
    stop_words = _english_stop_words()
    lowered = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    return ' '.join(lemmatizer.lemmatize(token) for token in lowered if token not in stop_words)

# Clean both text columns. Missing values are replaced with '' *before* the
# cast to str: astype(str) on its own turns NaN into the literal text 'nan',
# which would then be tokenized and indexed as if it were a real word.
for column in ('tags', 'descr'):
    news_df[column] = news_df[column].fillna('').astype(str).apply(preprocess)


In [8]:
# Define a function to map each tag string to its base category
def get_base_category(tag, synonym_map=None):
    """Return the base category whose synonym list matches ``tag``.

    The tag is split on single spaces and lower-cased token by token. Token
    positions are scanned left to right; at each position the categories are
    tried in dictionary order and the first synonym that matches wins. A
    synonym made of several words (e.g. 'artificial intelligence') matches
    when its words appear consecutively starting at that position.

    Parameters
    ----------
    tag : str
        Space-separated tag text.
    synonym_map : dict[str, list[str]] or None, optional
        Mapping of base category -> synonyms. Defaults to the module-level
        ``synonyms`` table when omitted.

    Returns
    -------
    str
        The matching base category, or 'other' when nothing matches.
    """
    if synonym_map is None:
        synonym_map = synonyms

    tokens = [token.lower() for token in tag.split(' ')]

    # Pre-split every synonym so multi-word entries can be compared against a
    # window of tokens; single-word entries become one-element windows, which
    # is exactly the per-token membership test.
    phrase_table = [
        (base_category, [syn.split(' ') for syn in syn_list])
        for base_category, syn_list in synonym_map.items()
    ]

    for start in range(len(tokens)):
        for base_category, phrases in phrase_table:
            for phrase in phrases:
                if tokens[start:start + len(phrase)] == phrase:
                    return base_category
    return 'other'

In [None]:
!pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from sklearn.preprocessing import LabelEncoder

def preprocess_news(news_df, k=10, exclude_self=False):
    """Evaluate BM25 nearest-neighbour category prediction and print accuracy.

    Every entry of ``news_df['descr']`` is used as a query against a BM25
    index built over all descriptions. The predicted category of a document
    is the most frequent base category among its ``k`` best-scoring
    documents. The share of documents whose prediction equals their own base
    category is printed as a percentage.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Frame with a ``tags`` column (mapped through ``get_base_category``)
        and a ``descr`` column of space-separated tokens. The caller's frame
        is not modified.
    k : int, default 10
        Number of top-scoring documents that vote on the category.
    exclude_self : bool, default False
        When True, the query document is removed from its own candidate list
        so it cannot vote for its own label. The default keeps the original
        behaviour, in which the query document is itself a candidate.

    Returns
    -------
    None
        The accuracy is printed, not returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Explicit copy: assigning a column on the bare dropna() result can raise
    # SettingWithCopyWarning, and the copy keeps the caller's frame untouched.
    news_clean = news_df.dropna().copy()

    # Map each tag to its base category (one pass is enough: the base
    # category names map onto themselves).
    news_clean['tags'] = news_clean['tags'].apply(get_base_category)

    # Build the BM25 index over the whitespace-tokenized descriptions
    tokenized_corpus = [doc.split(" ") for doc in news_clean['descr']]
    bm25 = BM25Okapi(tokenized_corpus)

    # Encode the category names as integers so the votes can be counted
    le = LabelEncoder()
    le.fit(news_clean['tags'].unique())
    labels = le.transform(news_clean['tags'].values)

    predicted_labels = []
    for i, query_tokens in enumerate(tokenized_corpus):
        # Rank all documents by BM25 score, best first
        doc_scores = bm25.get_scores(query_tokens)
        ranked = np.argsort(doc_scores)[::-1]
        if exclude_self:
            ranked = ranked[ranked != i]
        top_k_idx = ranked[:k]

        # Majority vote among the categories of the top k documents;
        # argmax resolves ties in favour of the lowest encoded label.
        top_k_categories = labels[top_k_idx]
        votes = np.bincount(top_k_categories, minlength=len(le.classes_))
        predicted_labels.append(votes.argmax())

    # Calculate the accuracy of the model
    accuracy = (np.asarray(predicted_labels) == labels).mean()
    print("Accuracy: ",accuracy*100,"%")


In [12]:
# Run the BM25 evaluation on the loaded dataset; the function prints the accuracy itself
preprocess_news(news_df)

Accuracy:  88.83817427385891 %
