# Finding most similar headlines


## Importing Packages

In [29]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Reading Datasets

In [30]:
# Load the headlines CSV; the publish_date column is parsed as dates.
headlines = pd.read_csv(
  'abcnews-date-text.csv',
  parse_dates=["publish_date"],
)

## Defining Methods for Processing

### Method for Randomly Selecting a headline

In [32]:
def select_headline(headlines):
  """Pick one headline at random, print it, and return its text.

  Args:
    headlines: DataFrame with a "headline_text" column.

  Returns:
    The "headline_text" value of the randomly chosen row.

  Raises:
    ValueError: raised by random.randrange when the frame has no rows.
  """
  row_count = headlines.shape[0]
  chosen_row = headlines.iloc[random.randrange(row_count)]
  random_headline = chosen_row["headline_text"]
  print('Randomly Selected headline:', random_headline)
  return random_headline

### Method for Randomly Selecting a Query Word from the headlines

In [52]:
def select_word_query(headlines, tfidf):
  """Pick a random vocabulary term of a fitted vectorizer and return its index.

  Args:
    headlines: Unused; kept so existing calls keep working.
    tfidf: Fitted vectorizer exposing get_feature_names_out().

  Returns:
    Integer position of the chosen term in tfidf.get_feature_names_out().

  Raises:
    ValueError: raised by random.randrange when the vocabulary is empty.
  """
  # Draw from the vocabulary size. headlines.shape[1] is the number of
  # DataFrame columns, which limited the query to the first few terms.
  feature_names = tfidf.get_feature_names_out()
  query = random.randrange(len(feature_names))
  print('Randomly Selected word index:', query)
  print('Randomly Selected word:', feature_names[query])
  return query

### Method for Displaying similar headlines

In [34]:
def display_similar_headlines(headlines, similarity, num):
  """Print the `num` headlines with the highest similarity scores.

  Args:
    headlines: DataFrame with "publish_date" and "headline_text" columns;
      rows are addressed by position.
    similarity: 2-D array-like of scores; only its first row is used.
    num: Maximum number of rows to print.
  """
  ranked_positions = np.argsort(similarity[0])[::-1]  # highest score first
  top_rows = headlines.iloc[ranked_positions[:num]]
  print(top_rows[["publish_date", "headline_text"]])

## Working with TF-IDF Vectorizer for Documents and Words

### TF-IDF Vectorizer for Documents, with preset settings

This section uses a TF-IDF Vectorizer for Documents, with preset settings

#### Initialize TF-IDF Vectorizer

In [42]:
tfidf = TfidfVectorizer()
# `!= np.nan` is True for every value (NaN never compares equal to anything),
# so the old mask filtered nothing. Missing headlines are replaced with empty
# strings rather than dropped, so that matrix row i still corresponds to
# headlines.iloc[i] when the similarity ranking looks rows up by position.
all_headlines_tf_idf = tfidf.fit_transform(headlines["headline_text"].fillna(""))

#### Selecting a made-up headline and evaluating its TF-IDF

In [43]:
# Vectorize one randomly drawn headline with the already-fitted vocabulary.
made_up_headline_tf_idf = tfidf.transform(
  [select_headline(headlines)]
)

Randomly Selected headline: broncos grounded despite knights demolition


#### Shapes of 'All Headlines' and 'Made-Up' Headline

In [44]:
# Report the shape of the corpus matrix and of the single query vector.
for label, matrix in (
  ("All    Headlines Shape: ", all_headlines_tf_idf),
  ("Made-Up Headline Shape: ", made_up_headline_tf_idf),
):
  print(label, matrix.shape)

All    Headlines Shape:  (1103663, 95878)
Made-Up Headline Shape:  (1, 95878)


#### Evaluating Similarity Between ***Chosen Headline and All Headlines***

In [45]:
# Cosine similarity of the query headline against every headline in the corpus.
sim = cosine_similarity(
  X=made_up_headline_tf_idf,
  Y=all_headlines_tf_idf,
)
sim

array([[0., 0., 0., ..., 0., 0., 0.]])

#### Displaying Similar Headlines

##### Defining Number of Headlines, to be displayed

In [46]:
# How many of the top-ranked results to show.
num_of_headlines: int = 10

##### Printing the top headlines, with the limits set above.

In [47]:
# Show the best-matching headlines for the query chosen above.
display_similar_headlines(
  headlines=headlines,
  similarity=sim,
  num=num_of_headlines,
)

       publish_date                                headline_text
261036   2006-09-20  broncos grounded despite knights demolition
260319   2006-09-16                       broncos humble knights
680430   2012-03-16                knights vs broncos highlights
551148   2010-07-18            dugan stars in knights demolition
822786   2013-09-27         knights grounded before prelim final
174719   2005-07-09         eagles grounded despite winning ways
5224     2003-03-16                knights broncos open accounts
809554   2013-08-08                              demolition deal
606534   2011-04-10        broncos make late changes for knights
260320   2006-09-16                   broncos obliterate knights


### TF-IDF Vectorizer for Words

This section uses a TF-IDF Vectorizer for Words, with *stopwords and minimum document frequency set*

#### Importing Required Packages

In [49]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

#### Initialize TF-IDF Vectorizer

In [53]:
# scikit-learn documents stop_words as 'english', a list, or None. The spaCy
# stop words arrive as a set, which newer scikit-learn releases reject during
# parameter validation, so convert it; sorting keeps the order stable.
# min_df=1000 keeps only terms that appear in at least 1000 headlines.
tfidf_word = TfidfVectorizer(stop_words=sorted(stopwords), min_df=1000)
all_headlines_tf_idf = tfidf_word.fit_transform(headlines["headline_text"])

  % sorted(inconsistent)


#### Selecting a random query word and evaluating its TF-IDF

In [55]:
# After transposing, rows are indexed by vocabulary term; take the row of the
# randomly selected query word.
made_up_headline_tf_idf = all_headlines_tf_idf.transpose()[
  select_word_query(headlines, tfidf_word)
]

Randomly Selected word index: 1
Randomly Selected word: 100


#### Evaluating Similarity Between ***Chosen Query and All Words***

In [59]:
# Compare the query word's row with every row of the transposed matrix.
sim = cosine_similarity(
  X=made_up_headline_tf_idf,
  Y=all_headlines_tf_idf.T,
)
sim

array([[2.72498919e-03, 1.00000000e+00, 7.57377657e-04, ...,
        7.67570958e-04, 1.77351649e-03, 4.94455149e-03]])

#### Displaying Similar Headlines

##### Defining Number of Headlines, to be displayed

In [57]:
# Cap on how many ranked results are printed below.
num_of_headlines: int = 10

##### Printing the top similar words, with the limits set above.

In [58]:
# Print the vocabulary terms ranked most similar to the query word by cosine similarity
print("\nFound following similar words: ")
print("Index\tWord")
# The feature-name array does not change inside the loop, so fetch it once
# instead of on every iteration.
feature_names = tfidf_word.get_feature_names_out()
for index in np.argsort(sim[0])[::-1][:num_of_headlines]:
  print(f"{index}\t{feature_names[index]}")


Found following similar words: 
Index	Word
1	100
1127	years
267	days
166	celebrates
639	million
691	old
543	jobs
1126	year
1049	turns
1022	toll


