Import Required Libraries

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


Downloading the Dataset

In [2]:
import os
import requests

In [3]:
def download_dataset(url: str):
    """Download the dataset CSV to ``Dataset/data.csv`` unless it is already there.

    Parameters
    ----------
    url : str
        HTTP(S) location of the CSV file.

    Returns
    -------
    str
        Relative path of the local CSV file.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        error status. Nothing is written to disk in that case.
    """
    folder = "Dataset"
    file_path = os.path.join(folder, "data.csv")

    # The existing file acts as a cache: no network access when it is present.
    if os.path.exists(file_path):
        print(f"Data already exists at : {file_path}")
        return file_path

    response = requests.get(url = url, timeout = 30)
    # Without this check an HTML error page would be saved as data.csv and
    # then be picked up as "already downloaded" on every later run.
    response.raise_for_status()
    print("Successfully downloaded the dataset from url.")

    if not os.path.isdir(folder):
        os.makedirs(folder, exist_ok = True)
        print("Created a new Data folder to store the dataset.")

    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"Successfully downloaded the data in the location : {file_path}")

    return file_path

In [4]:
# Source CSV of news articles. download_dataset() skips the request when
# Dataset/data.csv already exists, so re-running this cell is cheap.
url = "https://github.com/611noorsaeed/All-Recommendations/raw/refs/heads/main/Articles.csv"
file_path = download_dataset(url)

Successfully downloaded the dataset from url.
Created a new Data folder to stiore the dataset.
Successfully downloaded the data in the location : Dataset\data.csv


Loading the Dataset

In [5]:
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
dataset = pd.read_csv(file_path, encoding = "ISO-8859-1")

In [6]:
# Preview the first rows to check the columns (Article, Date, Heading, NewsType).
dataset.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


Preprocessing the Data

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Fetch the NLTK resources on first run; later runs report "already up-to-date".
nltk.download("stopwords")
# NOTE(review): no NLTK tokenizer call is visible in this notebook section
# (preprocess_text splits on whitespace), so "punkt" may be unnecessary - confirm.
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# The stopwords corpus stores one file per language with a lowercase name;
# "English" only resolves on case-insensitive filesystems (e.g. Windows).
stop_words = set(stopwords.words("english"))
# Shared stemmer instance used by preprocess_text().
stemmer = PorterStemmer()

In [10]:
def preprocess_text(text):
    """Normalize a piece of text for embedding.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is split on whitespace, stop words
    are dropped and the remaining words are stemmed.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, as pandas uses for
        empty cells) produce an empty string instead of raising; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The stemmed words joined by single spaces, e.g.
        ``"Machine is the ability of machine how to learn."`` becomes
        ``"machin abil machin learn"``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Convert to lowercase
    lowered = str(text).lower()

    # Remove punctuation
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)

    # Tokenize on whitespace, drop stop words, stem what is left
    return " ".join(
        stemmer.stem(word)
        for word in without_punctuation.split()
        if word not in stop_words
    )

In [11]:
# Sanity check: stop words are dropped and the remaining words are stemmed.
preprocess_text("Machine is the ability of machine how to learn.")

'machin abil machin learn'

In [12]:
# Adds a "cleaned_heading" column to `dataset` in place; get_embds() reads this column.
dataset["cleaned_heading"] = dataset["Heading"].apply(preprocess_text)

Initialize the Sentence Transformer Model

In [15]:
def get_model(model_name: str):
    """Instantiate a ``SentenceTransformer`` for ``model_name`` and return it.

    Any exception raised while constructing the model propagates to the
    caller unchanged; the confirmation message is printed only on success.
    """
    loaded_model = SentenceTransformer(model_name)
    print("Model downloaded successfully.")
    return loaded_model

In [16]:
# The same model instance encodes both the article headings and the search queries below.
model = get_model(
    model_name = "all-MiniLM-L6-v2"
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Model downloaded successfully.


Text Embeddings using a Sentence Transformer (all-MiniLM-L6-v2)

In [19]:
def get_embds(model, headings = None):
    """Encode headings into embeddings with ``model``.

    Parameters
    ----------
    model
        Object exposing ``encode(sentences, convert_to_tensor=...)``, such as
        the model returned by ``get_model``.
    headings : iterable of str, optional
        Texts to encode. When omitted, the ``"cleaned_heading"`` column of the
        notebook-level ``dataset`` is used, which keeps existing
        ``get_embds(model)`` calls working.

    Returns
    -------
    Whatever ``model.encode`` returns for ``convert_to_tensor=True``, with one
    entry per heading, in input order.
    """
    if headings is None:
        # Backward-compatible default: fall back to the global DataFrame.
        headings = dataset["cleaned_heading"].tolist()
    else:
        headings = list(headings)

    return model.encode(
        headings,
        convert_to_tensor = True
    )

In [20]:
# Encode every cleaned heading once; the result is reused for each search query.
embeddings = get_embds(
    model = model
)

Save and Load embeddings

In [21]:
import pickle

In [22]:
def save_embds(embds, folder = "Dataset"):
    """Pickle ``embds`` to ``<folder>/embeddings.pkl``.

    Parameters
    ----------
    embds
        Any picklable object holding the embeddings.
    folder : str, optional
        Directory for the pickle file; created if it does not exist.
        Defaults to ``"Dataset"``.

    Returns
    -------
    str
        Path of the written file.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be written.
    """
    file_path = os.path.join(folder, "embeddings.pkl")

    os.makedirs(folder, exist_ok = True)

    with open(file_path, "wb") as file:
        pickle.dump(embds, file)
    print(f"Embeddings saved successfully at {file_path}")

    return file_path

In [23]:
# Persist the embeddings; the returned path is used to load them back below.
embd_path = save_embds(
    embds = embeddings
)

Embeddings saved successfully at Dataset\embeddings.pkl


In [24]:
# Loading the Embeddings
# The context manager closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(embd_path, "rb") as embd_file:
    embeddings = pickle.load(embd_file)

In [30]:
def recommend_articles_from_search(query, dataset, embeddings, model, num_recommendations = 5):
    """Return the articles whose heading embeddings are most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text search query; it goes through ``preprocess_text`` before
        being encoded.
    dataset : pandas.DataFrame
        Must contain the columns ``Heading``, ``NewsType``, ``Article`` and
        ``Date``. Rows are selected by position, so row ``i`` of ``dataset``
        has to correspond to row ``i`` of ``embeddings``.
    embeddings
        2-D collection of heading embeddings accepted by
        ``cosine_similarity``.
    model
        Object exposing ``encode(sentences, convert_to_tensor=...)``.
    num_recommendations : int, optional
        Maximum number of rows to return (default 5). Zero or a negative
        value yields an empty result.

    Returns
    -------
    pandas.DataFrame
        Up to ``num_recommendations`` rows, most similar first, restricted to
        the four columns listed above.
    """
    columns = ["Heading", "NewsType", "Article", "Date"]

    # Guard first: a slice like [-0:] selects *everything*, so a request for
    # zero results used to return the whole dataset.
    if num_recommendations <= 0:
        return dataset.iloc[0:0][columns]

    # Preprocess and encode the query
    cleaned_query = preprocess_text(query)
    query_embds = model.encode([cleaned_query], convert_to_tensor = True)

    # cosine_similarity needs host-side data; tensors that live on an
    # accelerator are copied back (a no-op for CPU tensors).
    if hasattr(query_embds, "cpu"):
        query_embds = query_embds.cpu()
    if hasattr(embeddings, "cpu"):
        embeddings = embeddings.cpu()

    # Calculating Similarity
    similarities = cosine_similarity(query_embds.reshape(1, -1), embeddings).flatten()

    # Indices sorted by descending similarity, truncated to the requested size
    top_indices = similarities.argsort()[::-1][:num_recommendations]

    # Select the recommended articles
    return dataset.iloc[top_indices][columns]

In [31]:
# Example search: the query is preprocessed and encoded with the same model as the headings.
query = "Asian markets upswing"

recommended_articles = recommend_articles_from_search(query, dataset, embeddings, model)

# Rows are ordered from most to least similar.
recommended_articles

Unnamed: 0,Heading,NewsType,Article,Date
119,most asian markets up tokyo at 15 year hig,business,Hong Kong: Japanese shares hit a 15-year high ...,4/22/2015
226,asian markets mostly recover from hefty sell off,business,Hong Kong: Most Asia shares rose Wednesday as ...,7/29/2015
2610,Dollar down Trump takes over Asia markets u,business,strong>HONG KONG: The dollar retreated against...,1/23/2017
220,asia markets mostly down shanghai up a 7th str...,business,Hong Kong: Asian markets mostly fell Friday fo...,7/24/2015
546,Asia stocks edge up to four month high after W...,business,strong>TOKYO: Asian shares edged up to a four-...,3/31/2016
