Import Required Libraries

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


Downloading the Dataset

In [2]:
import os
import requests

In [3]:
def download_dataset(url: str, timeout: float = 30.0) -> str:
    """Download the dataset at ``url`` into ``Dataset/data.csv`` unless it is already there.

    The file on disk acts as a cache: when it exists, nothing is fetched and
    its path is returned straight away.

    Args:
        url: HTTP(S) address of the CSV file to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        Relative path of the local CSV file.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
        OSError: If the folder or the file cannot be written.
    """
    folder = "Dataset"
    file_path = os.path.join(folder, "data.csv")

    if os.path.exists(file_path):
        print(f"Data already exists at : {file_path}")
        return file_path

    response = requests.get(url=url, timeout=timeout)
    # Fail here on 4xx/5xx responses; otherwise an error page would be saved as
    # data.csv and then be treated as a valid cached dataset on every later run.
    response.raise_for_status()
    print("Successfully downloaded the dataset from url.")

    os.makedirs(folder, exist_ok=True)
    print("Created a new Data folder to stiore the dataset.")

    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"Successfully downloaded the data in the location : {file_path}")
    return file_path

In [4]:
# Location of the articles CSV on GitHub; the literal is split only for line length.
url = (
    "https://github.com/611noorsaeed/All-Recommendations"
    "/raw/refs/heads/main/Articles.csv"
)
# Fetch the file (or reuse the copy already on disk) and remember where it lives.
file_path = download_dataset(url=url)

Successfully downloaded the dataset from url.
Created a new Data folder to stiore the dataset.
Successfully downloaded the data in the location : Dataset\data.csv


Loading the Dataset

In [5]:
# Decode the CSV as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
dataset = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")

In [6]:
# Preview the first five rows to check the columns that were loaded.
dataset.head(n=5)

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


Preprocessing the Data

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Fetch the stopword lists used by the preprocessing step; NLTK reports the
# package as up-to-date when it is already present locally.
nltk.download("stopwords")
# NOTE(review): the preprocessing shown in this notebook tokenizes with
# str.split(), so "punkt" looks unused here — confirm whether a later cell
# needs it before removing the download.
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# NLTK stores each stopword list under a lowercase file id ("english"). The
# capitalised "English" only resolves on case-insensitive file systems, so use
# the canonical id to keep the notebook runnable on case-sensitive ones (Linux).
stop_words = set(stopwords.words("english"))
# Single stemmer instance shared by every preprocess_text call.
stemmer = PorterStemmer()

In [10]:
def preprocess_text(text):
    """Normalise text: lowercase, strip punctuation, drop stopwords, stem the rest.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents a missing
            cell) are treated as empty text; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The surviving stemmed tokens joined by single spaces, or ``""`` when
        nothing is left.
    """
    # A missing cell reaches Series.apply() as None/NaN, which has no .lower();
    # map it to an empty result instead of aborting the whole column.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r"[^\w\s]", "", text)

    # Tokenize on whitespace
    words = text.split()

    # Remove stopwords and apply stemming
    preprocessed_text = [stemmer.stem(word) for word in words if word not in stop_words]

    return " ".join(preprocessed_text)

In [11]:
preprocess_text("Machine is the ability of machine how to learn.")

'machin abil machin learn'

In [12]:
# Run every headline through preprocess_text and store the result beside the original column.
dataset["cleaned_heading"] = dataset["Heading"].map(preprocess_text)