In [93]:
#Author: Samriddha KC 
#Project: Reccomender System project for a publishing company 
# Import all the required packages. 
import string
import umap.umap_ as umap
import pandas as pd 
import pickle 
import numpy as np
import re 
import random
import sklearn
from numpy import linalg
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise import cosine_similarity
import sqlite3
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Use sqlite3 library to establish connection with the .db file 
conn = sqlite3.connect("/Users/samriddhakc/Desktop/publishing_data.db")

In [3]:
# Create a data frame with all posts
allposts_df=pd.read_sql("select * from posts;", conn)

In [4]:
allposts_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,headline,snippets,lead_paragraphs,published_at,updated_at,doc_id,keyword_id,author_id
0,0,0,Ravens Outlast 49ers With a Strong Finishing Kick,Quarterback Lamar Jackson converted two critic...,BALTIMORE — The most difficult task in sports ...,2019-04-19,2019-11-08,0,3396,0
1,1,1,Irish Ex-Soldier Who Married ISIS Fighter Is A...,"Lisa Smith, 38, and her 2-year-old daughter we...",DUBLIN — A former Irish soldier who converted ...,2019-12-01,2020-07-10,0,"[1315, 1436, 2998]",1
2,2,2,Plunge in 3 Hong Kong Stocks Offers a Cautiona...,The sudden drops raise fresh questions about c...,HONG KONG — Even by the standards of Hong Kong...,2019-03-16,2019-12-01,0,"[1325, 827, 1436, 1639, 2486]",2
3,3,3,Did ‘The Irishman’ Take a Bite Out of the Than...,This year’s holiday weekend saw a 16 percent d...,LOS ANGELES — Theaters had a more meager Thank...,2019-12-01,2019-12-01,0,"[3201, 130]",3
4,4,4,Have You Ever Tried to Make Money Online?,Do you see the internet as a way to earn extra...,The internet seems to offer plenty of opportun...,2019-12-02,2019-12-02,0,[],4


In [5]:
# Create a data frame to check out the first few sections in the table to better understand the data. 
documenttypes_df=pd.read_sql("select * from doc_types limit 5;", conn)

In [6]:
# Print the first five rows of the dataset. 
allposts_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,headline,snippets,lead_paragraphs,published_at,updated_at,doc_id,keyword_id,author_id
0,0,0,Ravens Outlast 49ers With a Strong Finishing Kick,Quarterback Lamar Jackson converted two critic...,BALTIMORE — The most difficult task in sports ...,2019-04-19,2019-11-08,0,3396,0
1,1,1,Irish Ex-Soldier Who Married ISIS Fighter Is A...,"Lisa Smith, 38, and her 2-year-old daughter we...",DUBLIN — A former Irish soldier who converted ...,2019-12-01,2020-07-10,0,"[1315, 1436, 2998]",1
2,2,2,Plunge in 3 Hong Kong Stocks Offers a Cautiona...,The sudden drops raise fresh questions about c...,HONG KONG — Even by the standards of Hong Kong...,2019-03-16,2019-12-01,0,"[1325, 827, 1436, 1639, 2486]",2
3,3,3,Did ‘The Irishman’ Take a Bite Out of the Than...,This year’s holiday weekend saw a 16 percent d...,LOS ANGELES — Theaters had a more meager Thank...,2019-12-01,2019-12-01,0,"[3201, 130]",3
4,4,4,Have You Ever Tried to Make Money Online?,Do you see the internet as a way to earn extra...,The internet seems to offer plenty of opportun...,2019-12-02,2019-12-02,0,[],4


In [7]:
# Check what 0 and 1's mean on the doc_id 
documenttypes_df

Unnamed: 0,id,doc_type
0,0,article
1,1,multimedia


In [8]:
# Filter out just the news articles out of all the posts as some of them also contain multimedia. 
allposts_df.loc[allposts_df['doc_id']==0]

Unnamed: 0.1,Unnamed: 0,id,headline,snippets,lead_paragraphs,published_at,updated_at,doc_id,keyword_id,author_id
0,0,0,Ravens Outlast 49ers With a Strong Finishing Kick,Quarterback Lamar Jackson converted two critic...,BALTIMORE — The most difficult task in sports ...,2019-04-19,2019-11-08,0,3396,0
1,1,1,Irish Ex-Soldier Who Married ISIS Fighter Is A...,"Lisa Smith, 38, and her 2-year-old daughter we...",DUBLIN — A former Irish soldier who converted ...,2019-12-01,2020-07-10,0,"[1315, 1436, 2998]",1
2,2,2,Plunge in 3 Hong Kong Stocks Offers a Cautiona...,The sudden drops raise fresh questions about c...,HONG KONG — Even by the standards of Hong Kong...,2019-03-16,2019-12-01,0,"[1325, 827, 1436, 1639, 2486]",2
3,3,3,Did ‘The Irishman’ Take a Bite Out of the Than...,This year’s holiday weekend saw a 16 percent d...,LOS ANGELES — Theaters had a more meager Thank...,2019-12-01,2019-12-01,0,"[3201, 130]",3
4,4,4,Have You Ever Tried to Make Money Online?,Do you see the internet as a way to earn extra...,The internet seems to offer plenty of opportun...,2019-12-02,2019-12-02,0,[],4
5,5,5,‘I Wanted to Die:’ Northern Ireland Confronts ...,"Haunted by a violent past, the territory has o...","LURGAN, Northern Ireland — On a cold February ...",2019-12-01,2020-06-10,0,"[1424, 2947, 2941, 2520, 1315, 3237]",5
6,6,6,Snow and Sleet Cause Havoc for Travelers,"The morning commute will be cold and rainy, wi...",[Update: Get the latest information on weather...,2018-12-04,2019-12-01,0,"[3430, 2989]",6
7,7,7,"Holiday Travel, AIDS, Sea Snakes: Your Weekend...",Here’s what you need to know about the week’s ...,(Want to get this briefing by email? Here’s th...,2019-12-01,2019-12-01,0,[],"[7, 8]"
8,8,8,Melania Trump Reveals Christmas Décor but Stay...,"What you can see, and what you can’t see, in t...",You know the holiday season has officially beg...,2019-01-28,2019-12-02,0,"[2519, 3069, 787, 2815, 2261, 1857]",9
9,9,9,"Sidelined for Months, Judiciary Panel Will Rec...",The House Judiciary Committee had been leading...,"This spring, as President Trump defiantly reje...",2019-12-01,2020-05-15,0,"[3182, 852, 1523, 2955]",10


In [9]:
# Drop published_at, updated_at, author_id and Unamed:0 because we do not have sufficent 
# information to conclude that they affect the "werdiness" of the document in any particular way. 
# Also, you can also drop doc_id because it has been already usedto filter the articles and serve no further purpose. 
allposts_df=allposts_df[['id','headline','snippets','lead_paragraphs','keyword_id']]

In [10]:
allkeywords_df=pd.read_sql("select * from keywords;", conn)

In [11]:
def custom_count(row): 
    return len(row.keyword_id.split(','))

In [12]:
# Brute Force Method 1: Sort by number of keywords 
# Problem: Does not take into account the relevance of each key word and considers all of them to be the same
# Also, ignores the headline, snippets, and lead_paragraphs which might potentially have important information.
sort_by_keywordcountDF=allposts_df[['id','keyword_id']]
sort_by_keywordcountDF['keyword_count']=sort_by_keywordcountDF.apply(lambda row:custom_count(row),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [13]:
keywordcount_ID=list(sort_by_keywordcountDF.sort_values(by=['keyword_count'])['id'])

In [14]:
# Gives top n weird posts based on the count of key word. 
def weird_n_keywordcount(keywordcount_ID,n): 
    return keywordcount_ID[len(keywordcount_ID)-n:]

In [15]:
weird_n_keywordcount(keywordcount_ID,10)

[20368, 18603, 2564, 58590, 58492, 58530, 58504, 23271, 57336, 57317]

In [16]:
# Method 2. 
# Step 0: Combine all columns to make headline,snippets,lead_paragraphs to make a single text column with all the texts.
# Step 1: Preprocess the data to take out relevant words(stop words in english) 
# Step 3: Train a Count vectorizer with the vocabulary as the keywords list.
# Step 4: Transform the "text" column with the vectorizer. 
# Step 5: Use cosine similarity(inverse of consine similarity) to measure the difference between the vectors.
# Step 6: Pick n weird posts by taking out top n posts with highest sine disimilarity.
'''This is an improvement to Method 1 because instead of just counting from key words we are looking at count from  all the 
seen words in the documents and measuting their similarity.'''

In [17]:
# There is a biased suppostion that all three categories contribute equally to the weirdness. According to the comapny's 
# needs and data, this should be adjusted by doign. weighted average. 
allposts_df["text"]=allposts_df["headline"]+" "+allposts_df["snippets"]+" "+allposts_df["snippets"]
allposts_df["text"]=allposts_df["text"].fillna('')

In [18]:
all_text_list=list(allposts_df["text"])
all_vocabulary=list(allkeywords_df["keyword"])

In [19]:
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']
def preprocess_text(texts): 
    processedData=[]
    wordLemm=WordNetLemmatizer()
    space_pattern="[\s]+"   
    word_pattern=r"[0-9]+"
    for text in texts:
        text=re.sub(word_pattern,' ',text)
        text=re.sub(space_pattern,' ',text)
        text=text.lower()
        for c in string.punctuation: 
            text=text.replace(c,"")
        filtered_text=""
        '''for word in text.split(' '): 
            if len(word)>1 and word not in stopwordlist: 
                word=wordLemm.lemmatize(word)
                filtered_text+=word+' '''
        processedData.append(text)
    return processedData

In [20]:
texts=["My     name IS 5086 samriddha?!!!!!!","MY NAME IS RABINA"]

In [21]:
preprocess_text(texts)

['my name is samriddha', 'my name is rabina']

In [22]:
processed_text_list=preprocess_text(all_text_list)

In [75]:
count_vectorizer=CountVectorizer(ngram_range=(1,2),max_features=3500,stop_words="english")

In [76]:
count_vectorizer.fit(processed_text_list)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3500, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [78]:
index=np.arange(len(processed_text_list))
count_vectorized_texts=vectorizer.fit_transform(processed_text_list)

In [80]:
count_array_texts=count_vectorized_texts.toarray()

In [90]:
count_array_texts.shape

(65896, 3500)

In [81]:
count_cosine_sim=[]
for article_idx in range(len(np_array_texts)):
    current_element=count_array_texts[article_idx]
    current_element=np.vstack((current_element,np.zeros(len(current_element))))
    current_sample_idexes=random.sample(range(len(count_array_texts)), 100)
    current_sample=np_array_texts[current_sample_idexes]
    count_cosine_sim.append(np.sum(cosine_similarity(current_element,current_sample),axis=1)[0])

In [82]:
def prin_top_n(score_lis,n): 
    for key,value in score_lis[:n]: 
        print (allposts_df.iloc[key]['headline'])

In [84]:
countvec_score_lis=[[key,value] for key,value in enumerate(count_cosine_sim)]
countvec_score_lis.sort(key=lambda x:x[1])
prin_top_n(countvec_score_lis,10)

Steelers Cling to Playoff Hopes, the Browns to the Idea of a Rivalry
The Week in Business: The King of Luxury Just Got Even Richer
Amazon Removes Holiday Ornaments With Images of Auschwitz After Criticism
What Rappers Wore to the Genius Barbecue Concert in Brooklyn
Amazon Removes Holiday Ornaments With Images of Auschwitz After Criticism
Took to One’s Heels
How to Bake the Perfect Madeleine
Jennifer Lopez, Awkwafina and Adam Driver at the Gotham Awards
Your Mom Is Wrong: Your Vegetarian Diet Did Not Cause Cancer
Took to One’s Heels


In [23]:
# Step 3: Train a Tfidf vectorizer with the vocabulary as the keywords list.
'''This is an improvement to Method 2 because on top of taking consideration of counts, this also takes into consideration the 
rarity of words which makes seprating the weird articles more convinient'''
vectorizer=TfidfVectorizer(ngram_range=(1,2),max_features=3500,stop_words="english")

In [24]:
vectorizer.fit(processed_text_list)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3500, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [25]:
index=np.arange(len(processed_text_list))
vectorized_texts=vectorizer.fit_transform(processed_text_list)

In [26]:
np_array_texts=vectorized_texts.toarray()

In [156]:
cosine_sim=[]
for article_idx in range(len(np_array_texts)):
    current_element=np_array_texts[article_idx]
    current_element=np.vstack((current_element,np.zeros(len(current_element)))) 
    current_sample_idexes=random.sample(range(len(np_array_texts)),100)
    current_sample=np_array_texts[current_sample_idexes]
    cosine_sim.append(np.sum(cosine_similarity(current_element,current_sample),axis=1)[0])

In [40]:
cosine_sim_2=[]
for article_idx in range(len(np_array_texts)):
    current_element=np_array_texts[article_idx]
    current_element=np.vstack((current_element,np.zeros(len(current_element))))
    current_sample_idexes=random.sample(range(len(np_array_texts)),1000)
    current_sample=np_array_texts[current_sample_idexes]
    cosine_sim_2.append(np.sum(cosine_similarity(current_element,current_sample),axis=1)[0])

In [41]:
new_copy=cosine_sim_2

In [65]:
new_copy_2=cosine_sim

In [44]:
with open('cosine_sim_2.pkl', 'wb') as f:
    pickle.dump(cosine_sim_2, f)

In [157]:
score_lis_100=[[key,value] for key,value in enumerate(cosine_sim)]

In [158]:
score_lis_100.sort(key=lambda x:x[1])

In [159]:
prin_top_n(score_lis_100,10)

Are Liberals Against Marriage?
The Week in Business: The King of Luxury Just Got Even Richer
What Rappers Wore to the Genius Barbecue Concert in Brooklyn
The Beauty Myth for Boys
The Beauty Myth for Boys
Why We Can’t Stop Watching the George and Kellyanne Conway Show
‘The Crying Book’ Follows the Many Tracks of Our Tears
How to Pluck a Feather
How to Tip at the Holidays, According to Doormen, and Nannies, and Weed Guys, and…
Lawsuit Against Luke Walton Is Dropped


In [95]:
score_lis_1000=[[key,value] for key,value in enumerate(cosine_sim_2)]
score_lis_1000.sort(key=lambda x:x[1])

In [96]:
prin_top_n(score_lis_1000,10)

The Week in Business: The King of Luxury Just Got Even Richer
The Week in Business: Uber’s Sexual Assault Problem
The Week in Business: Striking Deals Left and Right
The Week in Business: Striking Deals Left and Right
The Week in Business: Hip, Hip Hooray for Your 401(k)
The Week in Business: Amazon Won Christmas
The Week in Business: Amazon Won Christmas
The Week in Business: Coronavirus Hits the World Economy
The Week in Business: Coronavirus Could Complicate Everything
The Week in Business: Coronavirus Could Complicate Everything


In [97]:
for key,value in new_lis_2[::-1][:20]:
     print (allposts_df.iloc[key]['headline'])

Coronavirus in Japan, New York Hospitals, Maduro Indictment: Your Friday Briefing
Veterans Day, E.P.A., Hong Kong: Your Monday Evening Briefing
Coronavirus in Japan, New York Hospitals, Maduro Indictment: Your Friday Briefing
Impeachment, Israel, Phoebe Waller-Bridge: Your Friday Briefing
Your Monday Briefing
Coronavirus, Japan’s Economy, Uighurs: Your Tuesday Briefing
Coronavirus, Markets, Expelled Journalists: Your Thursday Briefing
Coronavirus, Remembering Christchurch, Israel Elections: Your Monday Briefing
Just 700 Speak This Language (50 in the Same Brooklyn Building)
China, Joe Biden, Kennedy Family: Your Friday Briefing
Coronavirus, Masks, Jobs: Your Friday Briefing
Israel, Coronavirus, Huawei: Your Wednesday Briefing
260,000 Words, Full of Self-Praise, From Trump on the Virus
Impeachment, Germany, Huawei: Your Friday Briefing
Impeachment, California, Robert Evans: Your Tuesday Briefing
Your Friday Briefing
Impeachment, Deval Patrick, Venice Flooding: Your Thursday Briefing
Cor

In [116]:
# Model Evaluation.
def evaluate(arr,indexes): 
    return np.sum(cosine_similarity(arr[indexes]))

In [106]:
# Evaluate Count Vectorizer with a random sample size of 100 with the top 20 most similar(not weird) and 
# top 20 weird(different) articles. 
count_top_20=[idx for idx, value in countvec_score_lis[:20]]
count_bottom_20=[idx for idx, value in countvec_score_lis[len(countvec_score_lis)-20:]]

In [121]:
# This is expected result and we know this model worked as expected as the bottom 20 has more similarity than the top 20/.
evaluate(count_array_texts,count_top_20),evaluate(count_array_texts,count_bottom_20)

(25.759203574789662, 250.90556904365687)

In [160]:
# Evaluate Tf-Idf Vectorizer with a random sample size of 100 with the top 20 most similar(not weird) and 
# top 20 weird(different) articles. 
tfidf_top_20=[idx for idx, value in score_lis_100[:20]]
tfidf_bottom_20=[idx for idx,value in score_lis_100[len(score_lis_100)-20:]]

In [162]:
# This is expected result and we know this model worked as expected as the bottom 20 has more similarity than the top 20/.
# Even though, this does not clearly beat the CountVectorizer, note this is just for a random sample of 100 in our method. 
# With the increase in this size, Tf-idf should do the trick. 
evaluate(np_array_texts,tfidf_top_20),evaluate(np_array_texts,tfidf_bottom_20)

(26.19055694527579, 211.53515378507785)

In [149]:
tfidf_top_20_1000=[idx for idx, value in score_lis_1000[:20]]
tfidf_bottom_20_1000=[idx for idx,value in score_lis_1000[len(score_lis_1000)-20:]]

In [153]:
# This is expected result and we know this model worked as expected as the bottom 20 has more similarity than the top 20/.
# Further, note it brings better result compared to the previous cases as the articles from the top from this model are 
# not similar at all. with (1<25 & 1<26)
evaluate(np_array_texts,tfidf_top_20_1000),evaluate(np_array_texts,tfidf_bottom_20_1000)

(1.0, 208.20911672642143)

In [182]:
# Export Model 
original_df=pd.read_sql("select * from posts;", conn)
original_df['score']=new_copy
original_df.to_pickle("./original_df.pkl")

In [183]:
# Future Work. 
# * For the next step, I plan to explore the possbility of using Word2Vec and any RNN based framework on this. 
# * However, because those methods are more memory intensive, and for such a large corpus of document, I am still 
#  figuring out if word2vec is required at all here. 

In [184]:
original_df

Unnamed: 0.1,Unnamed: 0,id,headline,snippets,lead_paragraphs,published_at,updated_at,doc_id,keyword_id,author_id,score
0,0,0,Ravens Outlast 49ers With a Strong Finishing Kick,Quarterback Lamar Jackson converted two critic...,BALTIMORE — The most difficult task in sports ...,2019-04-19,2019-11-08,0,3396,0,3.982230
1,1,1,Irish Ex-Soldier Who Married ISIS Fighter Is A...,"Lisa Smith, 38, and her 2-year-old daughter we...",DUBLIN — A former Irish soldier who converted ...,2019-12-01,2020-07-10,0,"[1315, 1436, 2998]",1,2.506007
2,2,2,Plunge in 3 Hong Kong Stocks Offers a Cautiona...,The sudden drops raise fresh questions about c...,HONG KONG — Even by the standards of Hong Kong...,2019-03-16,2019-12-01,0,"[1325, 827, 1436, 1639, 2486]",2,3.895389
3,3,3,Did ‘The Irishman’ Take a Bite Out of the Than...,This year’s holiday weekend saw a 16 percent d...,LOS ANGELES — Theaters had a more meager Thank...,2019-12-01,2019-12-01,0,"[3201, 130]",3,6.205419
4,4,4,Have You Ever Tried to Make Money Online?,Do you see the internet as a way to earn extra...,The internet seems to offer plenty of opportun...,2019-12-02,2019-12-02,0,[],4,4.456285
5,5,5,‘I Wanted to Die:’ Northern Ireland Confronts ...,"Haunted by a violent past, the territory has o...","LURGAN, Northern Ireland — On a cold February ...",2019-12-01,2020-06-10,0,"[1424, 2947, 2941, 2520, 1315, 3237]",5,4.642089
6,6,6,Snow and Sleet Cause Havoc for Travelers,"The morning commute will be cold and rainy, wi...",[Update: Get the latest information on weather...,2018-12-04,2019-12-01,0,"[3430, 2989]",6,2.796791
7,7,7,"Holiday Travel, AIDS, Sea Snakes: Your Weekend...",Here’s what you need to know about the week’s ...,(Want to get this briefing by email? Here’s th...,2019-12-01,2019-12-01,0,[],"[7, 8]",15.590757
8,8,8,Melania Trump Reveals Christmas Décor but Stay...,"What you can see, and what you can’t see, in t...",You know the holiday season has officially beg...,2019-01-28,2019-12-02,0,"[2519, 3069, 787, 2815, 2261, 1857]",9,6.330045
9,9,9,"Sidelined for Months, Judiciary Panel Will Rec...",The House Judiciary Committee had been leading...,"This spring, as President Trump defiantly reje...",2019-12-01,2020-05-15,0,"[3182, 852, 1523, 2955]",10,14.235295
