In [2]:
# Importing all the necessary libraries
import numpy as np
import pandas as pd
import nltk
import string
from nltk.stem import PorterStemmer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pritp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Read the dataset
articles = pd.read_csv("../assets/data/articles.csv")

In [4]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7600 entries, 0 to 7599
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   article  7600 non-null   object
dtypes: object(1)
memory usage: 59.5+ KB


In [5]:
articles.head()

Unnamed: 0,article
0,Fears for T N pension after talks Unions repre...
1,The Race is On: Second Private Team Sets Launc...
2,Ky. Company Wins Grant to Study Peptides (AP) ...
3,Prediction Unit Helps Forecast Wildfires (AP) ...
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...


In [6]:
# Preprocessing all the articles
def preprocess(articles):
    ps = PorterStemmer()
    sentences = []

    for article in articles:
        # Converting the article to lowercase
        article = article.lower()

        # Removing punctuations
        article = article.translate(str.maketrans('', '', string.punctuation))

        # Tokenization
        words = article.split()
        
        # Stemming
        words = [ps.stem(word) for word in words]

        sentences.append(words)
    
    return sentences

In [7]:
sentences = preprocess(list(articles["article"]))

In [8]:
# Creating a vocabulory
vocab = set()

for sentence in sentences:
    for word in sentence:
        vocab.add(word)

vocab = list(vocab)
print(f'Vocabulory size: {len(vocab)}')

Vocabulory size: 19224


In [9]:
# Creating a dictionary to map key to indices
key_to_idx = {}
i = 0

for i in range(len(vocab)):
    key_to_idx[vocab[i]] = i

In [10]:
# Storing term frequency
tf = []

for sentence in sentences:
    vec = np.zeros(len(vocab))

    for word in sentence:
        idx = key_to_idx.get(word)
        vec[idx] += 1
    
    tf.append(vec)

# Converting to numpy array
tf = np.array(tf)

In [11]:
# Storing inverse document frequency
document_word_count = []

iter = 0
for curr_word in vocab:
    count = 0

    for sentence in sentences:
        for word in sentence:
            if (curr_word == word):
                count += 1
                continue
    
    document_word_count.append(count)

    iter += 1
    if (iter % 2000 == 0):
        print(f'{iter}/{len(vocab)} words complete')

document_word_count = np.array(document_word_count)
idf = np.log((1+len(sentences))/document_word_count)

# This cell might require 5-10 minutes to run


2000/19224 words complete
4000/19224 words complete
6000/19224 words complete
8000/19224 words complete
10000/19224 words complete
12000/19224 words complete
14000/19224 words complete
16000/19224 words complete
18000/19224 words complete


In [12]:
# Calculating tf-idf representation for each article
tf_idf = []
for entry in tf:
    tf_idf.append(entry*idf)

tf_idf = np.array(tf_idf)

In [13]:
# Return the K nearest neighbors based on dot product
def get_nearest_neighbor(test, tf_idf, k=5):
    recommendations = []

    distance = np.dot(test, tf_idf.T)
    top_indices = np.argsort(distance)[len(distance)-k:]

    for idx in top_indices:
        recommendations.append(articles.iloc[idx][0])
    
    recommendations = recommendations[::-1]

    return recommendations


In [14]:
recommendations = get_nearest_neighbor(tf_idf[8], tf_idf)

print(f'You might also like:')
for i in range(len(recommendations)):
    print(f'Recommendation {i+1}: {recommendations[i]}')

You might also like:
Recommendation 1: E-mail scam targets police chief Wiltshire Police warns about "phishing" after its fraud squad chief was targeted.
Recommendation 2: Tech Firms, FBI to Fight 'Phishing' Scams Together (Reuters) Reuters - Internet companies and\law-enforcement agencies said on Wednesday they will work\together to track down online scam artists who pose as banks\and other legitimate businesses, a practice known as\"phishing."
Recommendation 3: Sister of man who died in Vancouver police custody slams chief (Canadian Press) Canadian Press - VANCOUVER (CP) - The sister of a man who died after a violent confrontation with police has demanded the city's chief constable resign for defending the officer involved.
Recommendation 4: Ex-U.S. Cyber Security Chief Sees Curb on Phishing &lt;p&gt;\&lt;/p&gt;&lt;p&gt; By Lisa Baertlein&lt;/p&gt;&lt;p&gt; SAN FRANCISCO (Reuters) - A former White House Web security\chief predicted on Wednesday that technology companies and law\enfor

In [16]:
# Testing with our own data
sample_article = "The dollar slipped broadly on Friday as traders booked profits after recent gains but the U.S. currency remained well-placed for further advances, supported by strong U.S. economic data that has prompted markets to dial back expectations for interest rate cuts."

# We need to provide sample article inside a list because of the way preprocess function is defined
test_sentences = preprocess([sample_article])[0]
test_tf = np.zeros(len(vocab))

for word in test_sentences:
    if word in vocab:
        idx = key_to_idx.get(word)
        test_tf[idx] += 1

test_tf_idf = test_tf*idf

recommendations = get_nearest_neighbor(test_tf_idf, tf_idf, 2)

print(f'You might also like:')
for i in range(len(recommendations)):
    print(f'Recommendation {i+1}: {recommendations[i]}')

You might also like:
Recommendation 1: Dollar Rises on the Interest Rate Plays  NEW YORK (Reuters) - The dollar rose on Thursday as  traders, short of dollars after relentlessly selling them for  weeks, looked to the growing yield advantage of U.S. assets as  a reason to buy back the currency before the end of the year.
Recommendation 2: Dollar's Gains Cut as Fed Raises Rates  NEW YORK (Reuters) - The dollar's gains were clipped  on  Tuesday as the Federal Reserve raised interest rates for the  fifth time this year, as expected, but quashed hopes for more  aggressive rate tightening.
