In [5]:
!python -m pip install seaborn plotly

import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

#Text processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#Feature representation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Similarity matrices
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances


Collecting plotly
  Downloading plotly-4.9.0-py2.py3-none-any.whl (12.9 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11434 sha256=80d9fd59b99a2a482bf04e70d4340759a0f8dc6cdd65bc753e99ca25c9951811
  Stored in directory: c:\users\prasan\appdata\local\pip\cache\wheels\f9\8d\8d\f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.9.0 retrying-1.3.3
You should consider upgrading via the 'C:\Users\Prasan\Miniconda3\python.exe -m pip install --upgrade pip' command.


In [6]:
news_articles = pd.read_json("News_Category_Dataset_v2.json", lines = True)

In [7]:
news_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [8]:
news_articles.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [9]:
news_articles = news_articles[news_articles['date'] >= pd.Timestamp(2018,1,1)]

In [10]:
news_articles.shape

(8583, 6)

In [11]:
news_articles = news_articles[news_articles['headline'].apply(lambda x: len(x.split())>5)]

print("Total number of articles after removal of headlines with short title:", news_articles.shape[0])

Total number of articles after removal of headlines with short title: 8530


In [12]:
news_articles.sort_values('headline',inplace=True, ascending=False)

duplicated_articles_series = news_articles.duplicated('headline', keep = False)

news_articles = news_articles[~duplicated_articles_series]

print("Total number of articles after removing duplicates:", news_articles.shape[0])

Total number of articles after removing duplicates: 8485


In [13]:
news_articles.isna().sum()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

In [14]:
print("Total number of articles : ", news_articles.shape[0])

print("Total number of authors : ", news_articles["authors"].nunique())

print("Total number of unqiue categories : ", news_articles["category"].nunique())

Total number of articles :  8485
Total number of authors :  892
Total number of unqiue categories :  26


In [15]:
fig = go.Figure([go.Bar(x=news_articles["category"].value_counts().index, y=news_articles["category"].value_counts().values)])

fig['layout'].update(title={"text" : 'Distribution of articles category-wise','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'}, xaxis_title="Category name",yaxis_title="Number of articles")

fig.update_layout(width=800,height=700)

fig

In [16]:
news_articles_per_month = news_articles.resample('m',on = 'date')['headline'].count()

news_articles_per_month

date
2018-01-31    2065
2018-02-28    1694
2018-03-31    1778
2018-04-30    1580
2018-05-31    1368
Freq: M, Name: headline, dtype: int64

In [17]:
fig = go.Figure([go.Bar(x=news_articles_per_month.index.strftime("%b"), y=news_articles_per_month)])

fig['layout'].update(title={"text" : 'Distribution of articles month-wise','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'}, xaxis_title="Month",yaxis_title="Number of articles")

fig.update_layout(width=500,height=500)

fig

In [19]:
# Probability distribution function of headline lenght similar to Gaussian function

fig = ff.create_distplot([news_articles['headline'].str.len()], ["ht"],show_hist=False,show_rug=False)

fig['layout'].update(title={'text':'PDF','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'}, xaxis_title="Length of a headline",yaxis_title="probability")

fig.update_layout(showlegend = False,width=500,height=500)

fig

In [20]:
news_articles.index = range(news_articles.shape[0])

In [21]:
# Adding a new column containing both day of the week and month, it will be required later while recommending based on day of the week and month
news_articles["day and month"] = news_articles["date"].dt.strftime("%a") + "_" + news_articles["date"].dt.strftime("%b")


In [22]:
news_articles_temp = news_articles.copy()

In [23]:
#Stop word removal
stop_words = set(stopwords.words('english'))

In [24]:
for i in range(len(news_articles_temp["headline"])):
    string = ""
    
    for word in news_articles_temp["headline"][i].split():
        word = ("".join(e for e in word if e.isalnum()))
        word = word.lower()
        if not word in stop_words:
          string += word + " "  
    
    if(i % 1000 == 0):
      print(i)           # To track number of records processed
    
    news_articles_temp.at[i,"headline"] = string.strip()

0
1000
2000
3000
4000
5000
6000
7000
8000


In [25]:
#Lemmatization
lemmatizer = WordNetLemmatizer()

In [26]:
for i in range(len(news_articles_temp["headline"])):
    string = ""
    
    for w in word_tokenize(news_articles_temp["headline"][i]):
        string += lemmatizer.lemmatize(w,pos = "v") + " "
    
    news_articles_temp.at[i, "headline"] = string.strip()
    
    if(i % 1000 == 0):
        print(i)  

0
1000
2000
3000
4000
5000
6000
7000
8000


In [27]:
#Headlines based similarity on news articles
#Bag of words method
headline_vectorizer = CountVectorizer()

headline_features   = headline_vectorizer.fit_transform(news_articles_temp['headline'])

In [28]:
headline_features.get_shape()

(8485, 11122)

In [29]:
# To display a very long headline completely

pd.set_option('display.max_colwidth', -1)  

In [34]:
def bag_of_words_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(headline_features,headline_features[row_index])
    
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'headline':news_articles['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    
    print("="*30,"Queried article details","="*30)
    
    print('headline : ',news_articles['headline'][indices[0]])
    
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,]

# Change the row index for any other queried article
bag_of_words_based_model(133, 11) 

headline :  Woman Fired After Flipping Off Trump's Motorcade Sues Former Employer



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-04-02,The Trump Administration Is Suing California Again,2.828427
2,2018-05-01,Texas Sues Trump Administration To End DACA,3.162278
3,2018-03-07,Stormy Daniels Suing Trump Over Nondisclosure Agreement,3.162278
4,2018-04-28,Trump: Mueller Should Never Have Been Appointed,3.162278
5,2018-04-24,Spanish Woman Looks More Like Trump Than The Donald Himself,3.162278
6,2018-02-12,What You Should Know About Trump's Nihilist Budget,3.162278
7,2018-05-09,The Caliphate Of Trump And A Planet In Ruins,3.162278
8,2018-03-26,Trump Ally Sues Qatar For Hacking His Email,3.162278
9,2018-02-21,All They Will Call You Will Be Deportees,3.162278
10,2018-04-11,Pursuing Desegregation In The Trump Era,3.162278


In [35]:
#TF-IDF method
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 0)

tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_articles_temp['headline'])

In [36]:
def tfidf_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'headline':news_articles['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    
    print("="*30,"Queried article details","="*30)
    
    print('headline : ',news_articles['headline'][indices[0]])
    
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,]

tfidf_based_model(133, 11)

headline :  Woman Fired After Flipping Off Trump's Motorcade Sues Former Employer



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-05-21,The Supreme Court Just Made It A Lot Harder For You To Sue Your Employer,1.164067
2,2018-04-02,The Trump Administration Is Suing California Again,1.253867
3,2018-04-10,"Lou Dobbs Flips Out On Live TV, Urges Trump To 'Fire The SOB' Robert Mueller",1.25881
4,2018-04-26,Cardi B's Former Manager Sues Her For $10 Million,1.268704
5,2018-04-03,A Third Woman Is Suing To Break A Trump-Related Nondisclosure Agreement,1.274264
6,2018-02-24,Former RNC Chair Fires Back At Claim He Was Only Hired Because He Was Black,1.274847
7,2018-01-16,State Employer Side Payroll Taxes And Loser Liberalism,1.276696
8,2018-02-21,Democrats Flip Kentucky State House Seat Where Trump Won Overwhelmingly,1.282008
9,2018-01-09,Big Tax Game Hunting: Employer Side Payroll Taxes,1.285147
10,2018-02-28,Democrats Flip 2 More GOP-Held State House Seats,1.287403


In [47]:
#Word2Vec embedding
!python -m pip install gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

You should consider upgrading via the 'C:\Users\Prasan\Miniconda3\python.exe -m pip install --upgrade pip' command.


In [59]:
import gensim

loaded_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
print(loaded_model)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000001A4F49D0848>


In [61]:
vocabulary = KeyedVectors(loaded_model.vector_size)
w2v_headline = []

for i in news_articles_temp['headline']:
    w2Vec_word = np.zeros(300, dtype="float32")
    
    for word in i.split():
        if word in vocabulary:
            w2Vec_word = np.add(w2Vec_word, loaded_model[word])
    
    w2Vec_word = np.divide(w2Vec_word, len(i.split()))
    w2v_headline.append(w2Vec_word)

w2v_headline = np.array(w2v_headline)

In [62]:
def avg_w2v_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'headline':news_articles['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    
    print("="*30,"Queried article details","="*30)
    
    print('headline : ',news_articles['headline'][indices[0]])
    
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df.iloc[1:,1]
    
    return df.iloc[1:,]

avg_w2v_based_model(133, 11)

headline :  ‘Will & Grace’ Creator To Donate Gay Bunny Book To Every Grade School In Indiana



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-05-12,"George Soros, Progressive Groups To Spend Millions To Elect Reformist Prosecutors",0.0
2,2018-05-25,George Takei Accuser Walks Back Story Of Drugging And Sexual Assault,0.0
3,2018-03-28,George W. Bush Dances Up A Storm At His Nephew's Wedding,0.0
4,2018-03-06,George W. Bush Reportedly Sounds Off On Trump: 'Sorta Makes Me Look Pretty Good',0.0
5,2018-03-30,George W. Bush's Ethics Chief Warns Trump Insiders: People Will Go 'To The Slammer',0.0
6,2018-02-08,George W. Bush: It's Pretty Clear That Russia Meddled In 2016 Election,0.0
7,2018-05-08,George Zimmerman Accused Of Stalking Detective Working On Trayvon Martin Film,0.0
8,2018-05-15,Georgia GOP Gov. Candidate Wants To Round Up Immigrants In 'Deportation Bus',0.0
9,2018-05-22,Georgia Governor’s Primary Highlights Competing Visions For The Democratic Party,0.0
10,2018-05-06,"Georgia Highway Sniper 'Idolized' Parkland Shooter, Police Say",0.0


In [63]:
#Weighted similarity based on headline and category
from sklearn.preprocessing import OneHotEncoder 

In [64]:
category_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["category"]).reshape(-1,1))

In [65]:
def avg_w2v_with_category(row_index, num_similar_items, w1,w2): #headline_preference = True, category_preference = False):
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist)/float(w1 + w2)
    
    indices = np.argsort(weighted_couple_dist.flatten())[0:num_similar_items].tolist()
    
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
               'headline':news_articles['headline'][indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                 'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Categoty': news_articles['category'][indices].values})
    
    print("="*30,"Queried article details","="*30)
    
    print('headline : ',news_articles['headline'][indices[0]])
    
    print('Categoty : ', news_articles['category'][indices[0]])
    
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,[1,5]]
    return df.iloc[1:, ]

avg_w2v_with_category(528,10,0.1,0.8)

headline :  Chuck E. Cheese’s Now Offers 'Sensory Sensitive Sundays' For Special Needs Children
Categoty :  EDUCATION



Unnamed: 0,publish_date,headline,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Categoty
1,2018-04-11,Pennsylvania School District Arms 500 Teachers With Mini Baseball Bats,0.888889,0.0,1.0,EDUCATION
2,2018-01-30,Columbia University Refuses To Recognize Graduate Student Union,0.888889,0.0,1.0,EDUCATION
3,2018-04-09,"Hey, Laura Ingraham, David Hogg Actually Did Get Into UC Irvine",0.888889,0.0,1.0,EDUCATION
4,2018-04-16,"Beyoncé Announces $100,000 In Scholarships For HBCU Students",0.888889,0.0,1.0,EDUCATION
5,2018-01-03,Children Forced To 'Deal With It' And Bundle Up As Classrooms Lose Heat,0.888889,0.0,1.0,EDUCATION
6,2018-02-20,Company That Sells Bulletproof Gucci And Hermès Bags Sees Huge Sales In School Backpacks,0.888889,0.0,1.0,EDUCATION
7,2018-04-11,Parkland School District Votes Against Arming Teachers,0.888889,0.0,1.0,EDUCATION
8,2018-04-06,Puerto Rico To Shutter 283 More Schools This Summer As Education Crisis Deepens,0.888889,0.0,1.0,EDUCATION
9,2018-04-19,"Desperate For Teachers, Districts Beg Retirees To Come Back",0.888889,0.0,1.0,EDUCATION


In [66]:
#Weighted similarity based on headline, category and author
authors_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["authors"]).reshape(-1,1))

In [67]:
def avg_w2v_with_category_and_authors(row_index, num_similar_items, w1,w2,w3): #headline_preference = True, category_preference = False):
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist)/float(w1 + w2 + w3)
    
    indices = np.argsort(weighted_couple_dist.flatten())[0:num_similar_items].tolist()
    
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
                'headline':news_articles['headline'][indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Authors based Euclidean similarity': authors_dist[indices].ravel(),       
                'Categoty': news_articles['category'][indices].values,
                'Authors': news_articles['authors'][indices].values})
    
    print("="*30,"Queried article details","="*30)
    
    print('headline : ',news_articles['headline'][indices[0]])
    
    print('Categoty : ', news_articles['category'][indices[0]])
    
    print('Authors : ', news_articles['authors'][indices[0]])
    
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df.iloc[1:,[1,6,7]]
    
    return df.iloc[1:, ]


avg_w2v_with_category_and_authors(528,10,0.1,0.1,1)

headline :  Universities Tell Applicants That Protesting Gun Violence Won’t Affect Admissions
Categoty :  EDUCATION
Authors :  Carla Herreria



Unnamed: 0,publish_date,headline,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Categoty,Authors
1,2018-03-02,Hawaii Democrat Defends Stance On Guns After Actress Questions Her Silence On Bill,1.034518,0.0,2.414214,1.0,POLITICS,Carla Herreria
2,2018-03-18,Democrats Flood McCabe With Federal Job Offers So He Can Access His Pension,1.034518,0.0,2.414214,1.0,POLITICS,Carla Herreria
3,2018-02-28,"Kevin Smith Opens Up About Death, Fat-Shaming And Chris Pratt's Prayers",1.034518,0.0,2.414214,1.0,COMEDY,Carla Herreria
4,2018-01-17,Hackers Messaged Donald Trump With Former Fox News Hosts' Twitter Accounts,1.034518,0.0,2.414214,1.0,MEDIA,Carla Herreria
5,2018-01-06,White Supremacist Charged With Terrorism After Alleged Attempt To Derail Train,1.034518,0.0,2.414214,1.0,CRIME,Carla Herreria
6,2018-03-08,HBO's Martin Luther King Jr. Film Reveals His 'Dark And Dangerous' Final Years,1.034518,0.0,2.414214,1.0,BLACK VOICES,Carla Herreria
7,2018-05-25,Rachel Dolezal Faces Felony Charges For Welfare Fraud,1.034518,0.0,2.414214,1.0,CRIME,Carla Herreria
8,2018-01-26,Olympic Committee Gives USA Gymnastics An Ultimatum Over Sex Abuse Scandal,1.034518,0.0,2.414214,1.0,SPORTS,Carla Herreria
9,2018-01-04,Gunfire Outside Colorado State Capitol Forces Brief Lockdown,1.034518,0.0,2.414214,1.0,CRIME,Carla Herreria


In [68]:
 #Weighted similarity based on headline, category, author and publishing day
 publishingday_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_articles_temp["day and month"]).reshape(-1,1))

In [69]:
def avg_w2v_with_category_authors_and_publshing_day(row_index, num_similar_items, w1,w2,w3,w4): #headline_preference = True, category_preference = False):
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1))
    
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]) + 1
    
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]) + 1
    
    publishingday_dist = pairwise_distances(publishingday_onehot_encoded, publishingday_onehot_encoded[row_index]) + 1
    
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist + w4 * publishingday_dist)/float(w1 + w2 + w3 + w4)
    
    indices = np.argsort(weighted_couple_dist.flatten())[0:num_similar_items].tolist()
    
    df = pd.DataFrame({'publish_date': news_articles['date'][indices].values,
                'headline_text':news_articles['headline'][indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices].ravel(),
                'Word2Vec based Euclidean similarity': w2v_dist[indices].ravel(),
                'Category based Euclidean similarity': category_dist[indices].ravel(),
                'Authors based Euclidean similarity': authors_dist[indices].ravel(),   
                'Publishing day based Euclidean similarity': publishingday_dist[indices].ravel(), 
                'Categoty': news_articles['category'][indices].values,
                'Authors': news_articles['authors'][indices].values,
                'Day and month': news_articles['day and month'][indices].values})
    
    print("="*30,"Queried article details","="*30)
    
    print('headline : ',news_articles['headline'][indices[0]])
    
    print('Categoty : ', news_articles['category'][indices[0]])
    
    print('Authors : ', news_articles['authors'][indices[0]])
    
    print('Day and month : ', news_articles['day and month'][indices[0]])
    
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df.iloc[1:,[1,7,8,9]]
    
    return df.iloc[1:, ]


avg_w2v_with_category_authors_and_publshing_day(528,10,0.1,0.1,0.1,1)

headline :  Universities Tell Applicants That Protesting Gun Violence Won’t Affect Admissions
Categoty :  EDUCATION
Authors :  Carla Herreria
Day and month :  Sat_Feb



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2018-02-17,U.S. Figure Skater Nathan Chen Redeems Himself With Record-Setting Skate,1.031863,0.0,2.414214,1.0,1.0,SPORTS,Carla Herreria,Sat_Feb
2,2018-02-17,Florida Gubernatorial Candidate Calls On Governor To Halt AR-15 Sales,1.031863,0.0,2.414214,1.0,1.0,POLITICS,Carla Herreria,Sat_Feb
3,2018-02-24,Tribal Filipinos Were A Surprising Muse For ‘Black Panther’s’ Dora Milaje,1.031863,0.0,2.414214,1.0,1.0,BLACK VOICES,Carla Herreria,Sat_Feb
4,2018-02-24,Nigerian Bobsled Team Brought Something More Valuable Than Gold To The Olympics,1.140648,0.0,2.414214,2.414214,1.0,BLACK VOICES,Taryn Finley,Sat_Feb
5,2018-02-17,Maame Biney's Pioneering Run At The Winter Olympics Is Over,1.140648,0.0,2.414214,2.414214,1.0,SPORTS,Ron Dicker,Sat_Feb
6,2018-02-03,These Patriots Offensive Linemen Say They’d Support A Gay Teammate,1.140648,0.0,2.414214,2.414214,1.0,QUEER VOICES,"Jim Buzinski, Outsports",Sat_Feb
7,2018-02-03,The Largest Number Of Scientists In Modern U.S. History Are Running For Office In 2018,1.140648,0.0,2.414214,2.414214,1.0,POLITICS,Alexander C. Kaufman,Sat_Feb
8,2018-02-17,Russian Indictments Could Set Stage For More Mueller Charges,1.140648,0.0,2.414214,2.414214,1.0,POLITICS,"Jan Wolf, Reuters",Sat_Feb
9,2018-02-10,Watch This Jamaican Bobsledder's Tearful Plea For Diversity At Winter Olympics,1.140648,0.0,2.414214,2.414214,1.0,BLACK VOICES,Ron Dicker,Sat_Feb
