In [1]:
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [2]:
dataset = pd.read_json('newsdataset.json',lines = True)

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [4]:
dataset.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [5]:
dataset = dataset[dataset['headline'].apply(lambda x:len(x.split())>5)]
print(dataset.shape[0])

180543


In [6]:
dataset  = dataset[dataset['date']>=pd.Timestamp(2018,1,1)]

In [7]:
print(dataset.shape[0])

8530


In [8]:
dataset.sort_values('headline',inplace=True,ascending=False)
dups = dataset.duplicated('headline',keep = False)
dataset = dataset[~dups]
print(dataset.shape[0])

8485


In [9]:
dataset.isna().sum()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

In [10]:
print(dataset['category'].nunique())

26


In [11]:
categories = dataset.groupby('category')
print('total categories: {}'.format(categories.ngroups))
print(categories.size())

total categories: 26
category
ARTS & CULTURE      13
BLACK VOICES       406
BUSINESS            85
COLLEGE              1
COMEDY             443
CRIME              170
EDUCATION           31
ENTERTAINMENT     1699
GREEN               28
HEALTHY LIVING      15
IMPACT              73
LATINO VOICES       83
MEDIA              290
PARENTS             32
POLITICS          3042
QUEER VOICES       451
RELIGION            63
SCIENCE             40
SPORTS             364
STYLE               34
TASTE                9
TECH                53
TRAVEL              72
WEIRD NEWS         205
WOMEN              226
WORLD NEWS         557
dtype: int64


In [12]:
dataset.index = range(dataset.shape[0])

In [13]:
dataset["day and month"] = dataset["date"].dt.strftime("%a") + "_" + dataset["date"].dt.strftime("%b")

In [14]:
dataset_cp = dataset.copy()

In [15]:

stop = stopwords.words('english')

In [16]:
for i in range(len(dataset_cp["headline"])):
    string = ""
    for word in dataset_cp["headline"][i].split():
        ss = ("".join(e for e in word if e.isalnum()))
        ss = ss.lower()
        if not ss in stop:
          string += ss + " "  
    if(i%1000==0):
      print(i)           # To track number of records processed
    dataset_cp.at[i,"headline"] = string.strip()

0
1000
2000
3000
4000
5000
6000
7000
8000


In [17]:
dataset_cp.head()

Unnamed: 0,category,headline,authors,link,short_description,date,day and month
0,QUEER VOICES,grace creator donate gay bunny book every grad...,Elyse Wanshel,https://www.huffingtonpost.com/entry/will-grac...,It's about to be a lot easier for kids in Mike...,2018-04-02,Mon_Apr
1,QUEER VOICES,voice blind auditions make history first trans...,"Lyndsey Parker, Yahoo Entertainment",https://www.huffingtonpost.com/entry/the-voice...,"Austin Giorgio, 21: “How Sweet It Is (To Be Lo...",2018-03-06,Tue_Mar
2,QUEER VOICES,penumbra queer audio drama didnt know needed,"Sarah Emily Baum, ContributorFreelance Writer",https://www.huffingtonpost.com/entry/the-penum...,"Young, fun, fantastical and, most notably, inc...",2018-01-05,Fri_Jan
3,COMEDY,opposition gives trump hot lawyer,Ed Mazza,https://www.huffingtonpost.com/entry/trump-hot...,"He's here to make a ""strong case"" for the pres...",2018-05-11,Fri_May
4,ENTERTAINMENT,stranger things fans able visit upside irl,Elyse Wanshel,https://www.huffingtonpost.com/entry/stranger-...,"Hawkins is headed to Hollywood, Orlando and Si...",2018-04-03,Tue_Apr


In [18]:
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('punkt')
nltk.download('wordnet')
 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NIRANJAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NIRANJAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
for i in range(len(dataset_cp['headline'])):
    string = ""
    for w in word_tokenize(dataset_cp['headline'][i]):
        string += lemmatizer.lemmatize(w,pos='v') + " "
    dataset_cp.at[i,'headline'] = string.strip()
    

In [20]:
dataset_cp.head()

Unnamed: 0,category,headline,authors,link,short_description,date,day and month
0,QUEER VOICES,grace creator donate gay bunny book every grad...,Elyse Wanshel,https://www.huffingtonpost.com/entry/will-grac...,It's about to be a lot easier for kids in Mike...,2018-04-02,Mon_Apr
1,QUEER VOICES,voice blind audition make history first trans ...,"Lyndsey Parker, Yahoo Entertainment",https://www.huffingtonpost.com/entry/the-voice...,"Austin Giorgio, 21: “How Sweet It Is (To Be Lo...",2018-03-06,Tue_Mar
2,QUEER VOICES,penumbra queer audio drama didnt know need,"Sarah Emily Baum, ContributorFreelance Writer",https://www.huffingtonpost.com/entry/the-penum...,"Young, fun, fantastical and, most notably, inc...",2018-01-05,Fri_Jan
3,COMEDY,opposition give trump hot lawyer,Ed Mazza,https://www.huffingtonpost.com/entry/trump-hot...,"He's here to make a ""strong case"" for the pres...",2018-05-11,Fri_May
4,ENTERTAINMENT,stranger things fan able visit upside irl,Elyse Wanshel,https://www.huffingtonpost.com/entry/stranger-...,"Hawkins is headed to Hollywood, Orlando and Si...",2018-04-03,Tue_Apr


In [21]:
headline_vectorizer = CountVectorizer()
headline_features = headline_vectorizer.fit_transform(dataset_cp['headline'])

In [22]:
print(headline_features.shape)

(8485, 11122)


In [23]:
pd.set_option('display.max_colwidth', None) 

In [24]:
def bag_of_words(row_index,sim_items):
    couple_dist = pairwise_distances(headline_features,headline_features[sim_items])
    indices = np.argsort(couple_dist.ravel())[0:sim_items]
    df = pd.DataFrame({'publish_date':dataset['date'][indices].values,
                      'headline':dataset['headline'][indices].values,
                      'similarity':couple_dist[indices].ravel()})
    print("-"*30,"Queried article details ","-"*30)
    print("The headline is ",dataset['headline'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df.iloc[1:,]

In [25]:
bag_of_words(133,11)

------------------------------ Queried article details  ------------------------------
The headline is  ‘RuPaul’s Drag Race All Stars 3’ Episode 6 Recap: Which Queen Returned To The Competition?



Unnamed: 0,publish_date,headline,similarity
1,2018-03-16,'RuPaul's Drag Race All Stars 3' Episode 8 Recap: Which Queen Snatched The Crown?,2.0
2,2018-02-02,‘RuPaul’s Drag Race All Stars 3’ Episode 2 Recap: Sour Milk,2.236068
3,2018-02-12,‘RuPaul’s Drag Race All Stars 3’ Episode 3 Recap: Who Won The Bitchelor's Heart?,2.236068
4,2018-02-16,‘RuPaul’s Drag Race All Stars 3’ Episode 4 Recap: Who Won Snatch Game?,2.236068
5,2018-01-29,‘RuPaul’s Drag Race All Stars 3’ Episode 1 Recap: Which Queen Went Home First?,2.236068
6,2018-03-09,‘RuPaul’s Drag Race All Stars 3’ Episode 7 Recap: We Can Never Go Back To Before,2.44949
7,2018-02-23,‘RuPaul’s Drag Race All Stars 3’ Episode 5 Recap: The Warhol Ball Crowns One Pop Art Queen,2.828427
8,2018-03-08,"'RuPaul's Drag Race' Reveals Guest Judges, Stars For Season 10",3.162278
9,2018-03-16,Trixie Mattel Sounds Off On Shocking 'RuPaul's Drag Race All Stars' Win,3.162278
10,2018-01-18,'RuPaul's Drag Race' Stars Open Up About Mental Health And The Toll Of Superstardom,3.162278


In [26]:
tfidfvect = TfidfVectorizer(min_df=0)
tfidf_headline_features = tfidfvect.fit_transform(dataset_cp['headline'])

In [27]:
def tfidf_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'publish_date': dataset['date'][indices].values,
               'headline':dataset['headline'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',dataset['headline'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    
    return df.iloc[1:,]
tfidf_based_model(133, 11)

headline :  Woman Fired After Flipping Off Trump's Motorcade Sues Former Employer



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-05-21,The Supreme Court Just Made It A Lot Harder For You To Sue Your Employer,1.164067
2,2018-04-02,The Trump Administration Is Suing California Again,1.253867
3,2018-04-10,"Lou Dobbs Flips Out On Live TV, Urges Trump To 'Fire The SOB' Robert Mueller",1.25881
4,2018-04-26,Cardi B's Former Manager Sues Her For $10 Million,1.268704
5,2018-04-03,A Third Woman Is Suing To Break A Trump-Related Nondisclosure Agreement,1.274264
6,2018-02-24,Former RNC Chair Fires Back At Claim He Was Only Hired Because He Was Black,1.274847
7,2018-01-16,State Employer Side Payroll Taxes And Loser Liberalism,1.276696
8,2018-02-21,Democrats Flip Kentucky State House Seat Where Trump Won Overwhelmingly,1.282008
9,2018-01-09,Big Tax Game Hunting: Employer Side Payroll Taxes,1.285147
10,2018-02-28,Democrats Flip 2 More GOP-Held State House Seats,1.287403
