In [44]:
import nltk
from nltk.stem import *
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

import scipy.stats as stats
import sklearn
import random
import os
from pathlib import Path
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report 
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances, manhattan_distances, euclidean_distances

#    os.remove("articles.csv")

# Extra stop words handed to CountVectorizer(stop_words=...) further down.
# Original author's note: stop words are cleaned in the separate 'dataclean'
# code going forward, so this list may be a leftover from before that step.
# Several entries are repeated (e.g. "inc", "us", "just", "have"); this is
# harmless for stop-word filtering but the list could be deduplicated.
# NOTE(review): CountVectorizer's default tokenizer is documented as keeping
# only word tokens of 2+ characters, so entries such as '$', "'s", "i" or
# "jan." probably never match anything -- confirm before relying on them.
custom_stopwords = [
        # dates/times: month names and abbreviations, weekdays, parts of the day
        "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december", "jan", "feb","mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec", "jan.", "feb.","mar.", "apr.", "jun.", "jul.", "aug.", "oct.", "nov.", "dec.", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "morning", "evening",
        # symbols that don't separate a sentence (currency sign, curly quotes, em dash)
        '$','“','”','’','—',
        # article boilerplate: sharing/caption words, company suffixes, news outlet names
        "read", "share", "file", "'s","i", "photo", "percent","s", "t", "inc.", "corp", "group", "inc", "corp.", "source", "bloomberg", "cnbc","cnbcs", "cnn", "reuters","bbc", "published", "broadcast","york","msnbc",
        # pronouns, determiners and contraction fragments left over after punctuation removal
        "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "co", "inc", "com", "theyve", "theyre", "theres", "heres", "didnt", "wouldn", "couldn", "didn","nbcuniversal","according", "just", "us", "ll", "times",
        # common English function words (auxiliaries, articles, prepositions, adverbs)
 "us","the", "a", "of", "have", "has", "had", "having", "hello", "welcome", "yeah", "wasn", "today", "etc", "ext","definitely", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "while", "of", "at", "by", "for", "about", "into", "through", "during", "before", "after", "to", "from", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "just", "don", "now", "will"
        ]

# Two stemmers and one lemmatizer, instantiated so they can be compared.
# NOTE(review): none of the three is referenced by the cells visible in this
# part of the notebook -- confirm they are still needed.
lrStem = LancasterStemmer()
sbStem = SnowballStemmer("english")
wnLemm = WordNetLemmatizer()


In [45]:
# Load the article table; the relative path is resolved against the kernel's
# current working directory.
ArticleDB = pd.read_excel("Temporary Article Pull.xlsx")
# Text column used below to build the vocabulary and every encoding.
artText = ArticleDB["content"]


In [46]:
from sklearn.feature_extraction.text import CountVectorizer
# The vectorizer is only fitted (never used to transform) in the cells shown
# here: its learned vocabulary is read in the next cell, while the encodings
# themselves are built by the hand-written functions further down.
countVect = CountVectorizer(stop_words = custom_stopwords, binary=True)
# presumably `vector` is the fitted vectorizer itself (scikit-learn's fit()
# convention is to return self) -- it is not used again in the visible cells.
vector = countVect.fit(artText)
#print(vector)

In [47]:
# Pull the learned vocabulary out of the fitted vectorizer and keep its terms
# as the feature list `fts` used by every encoder below.
# NOTE(review): vocabulary_ is documented as a term -> column-index mapping;
# list(keys()) follows the dict's own key order, which is not necessarily the
# vectorizer's column order -- fine here because the encoders build their own
# columns from `fts`.
features= countVect.vocabulary_
fts = list(features.keys())
print(len(fts))
# Prints the entire feature list (len(fts) entries) into the cell output.
print(fts)

25231




In [48]:
#BinaryEncoding
def RecBinaryEncoding(fts, artText):
    """Encode each article as a 0/1 row marking which feature words it contains.

    An article is lower-cased and split on whitespace; a feature counts as
    present only when it equals one of the resulting tokens exactly, so
    punctuation attached to a word (e.g. "report,") prevents a match.

    Parameters
    ----------
    fts : sequence of str
        Feature words. They become the column labels, in the given order.
    artText : iterable
        Article bodies. Entries that are not strings (e.g. missing values)
        produce an all-zero row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``artText`` and one column per entry of ``fts``.
    """
    print("bin Encoding")
    df_rows = []

    for art in tqdm(artText):
        # isinstance (rather than ``type(art) == str``) so that str subclasses
        # such as numpy.str_ are encoded instead of silently becoming zero rows.
        if isinstance(art, str):
            # Only membership matters for a binary encoding, so a set suffices.
            present = set(art.lower().split())
            df_rows.append([1 if word in present else 0 for word in fts])
        else:
            df_rows.append([0 for _ in fts])

    return pd.DataFrame(df_rows, columns=fts)

In [49]:
#Term Freq. Encoding
def TfEncoding(fts, artText):
    """Encode each article as a row of raw term counts over the feature words.

    An article is lower-cased and split on whitespace; the count for a feature
    is the number of tokens equal to it, so punctuation attached to a word
    (e.g. "report,") is not counted towards "report".

    Parameters
    ----------
    fts : sequence of str
        Feature words. They become the column labels, in the given order.
    artText : iterable
        Article bodies. Entries that are not strings (e.g. missing values)
        produce an all-zero row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``artText`` and one column per entry of ``fts``.
    """
    print("tf Encoding")
    tf_rows = []

    for art in tqdm(artText):
        # isinstance (rather than ``type(art) == str``) so that str subclasses
        # such as numpy.str_ are encoded instead of silently becoming zero rows.
        if isinstance(art, str):
            wordsCounter = Counter(art.lower().split())
            # A Counter yields 0 for tokens it never saw, so no membership
            # test is needed before the lookup.
            tf_rows.append([wordsCounter[word] for word in fts])
        else:
            tf_rows.append([0 for _ in fts])

    return pd.DataFrame(tf_rows, columns=fts)

In [61]:
#term Frequency - inverse document frequency encoding
def tfidfEncoding(fts, artText):
    """Weight the term-frequency encoding by an inverse document frequency.

    The weight used for a term is ``1 / (df + 1)``, where ``df`` is the number
    of articles containing the term according to ``RecBinaryEncoding``. Note
    that this is a plain reciprocal, not the logarithmic idf found in most
    textbooks.

    Parameters
    ----------
    fts : sequence of str
        Feature words. They become the column labels, in the given order.
    artText : iterable
        Article bodies, passed unchanged to ``RecBinaryEncoding`` and
        ``TfEncoding``.

    Returns
    -------
    pandas.DataFrame
        One row per article and one column per feature, holding
        ``tf * 1 / (df + 1)``.

    Side effects
    ------------
    Prints progress messages and, where the platform supports it, plays a
    short beep when the computation has finished.
    """
    print("tifidf Encoding")

    # Base calculations
    binX = RecBinaryEncoding(fts, artText)
    tfX = TfEncoding(fts, artText)

    # Document frequency per term (column sums of the 0/1 matrix), then idf.
    doc_freq = binX.to_numpy(dtype=float).sum(axis=0)
    idf = 1.0 / (doc_freq + 1.0)

    # Broadcasting multiplies every article row by the per-term idf vector,
    # replacing the explicit Python loop over rows.
    tf = tfX.to_numpy(dtype=float)
    X = pd.DataFrame(tf * idf, columns=fts)

    # Audible "finished" signal. winsound exists only on Windows, and Beep can
    # raise RuntimeError when the machine cannot beep; neither case may throw
    # away the result that has just been computed, so the beep is best-effort.
    duration = 600  # millisecond
    freq = 300  # Hz
    try:
        import winsound
    except ImportError:
        winsound = None
    if winsound is not None:
        try:
            winsound.Beep(freq, duration)
        except RuntimeError:
            pass

    return X

In [51]:
def Cosinepairup(npV, rows, Y):
    """List, for each article, the (up to) three most similar other articles.

    Parameters
    ----------
    npV : numpy.ndarray
        2-D similarity matrix; ``npV[i, j]`` is the similarity between
        articles ``i`` and ``j``.
    rows : int
        Number of leading rows of ``npV`` to process; expected to equal the
        number of rows in ``Y``.
    Y : pandas.DataFrame
        Frame whose index labels the articles. Only its index is read; the
        frame itself is left untouched, so a feature matrix can be passed
        safely and the calling cell can be re-run.

    Returns
    -------
    pandas.DataFrame
        A single column ``'related_articles'``, indexed like the first
        ``rows`` entries of ``Y``, holding a ``', '``-joined string of
        positional article indices ordered from most to least similar.
    """
    related_col = []
    for i in range(rows):
        sims = npV[i, :]

        # Look at the four best scores: the article itself normally occupies
        # one of those slots, leaving three genuine neighbours. Clamp to the
        # row length so fewer than four articles does not make argpartition
        # raise.
        k = min(4, sims.size)
        top = np.argpartition(sims, -k)[-k:]
        # Order the candidates from most to least similar.
        ranked = top[np.argsort(sims[top])][::-1]

        # Drop the article itself and keep at most three neighbours.
        related = [str(j) for j in ranked if j != i][:3]
        related_col.append(', '.join(related))

    return pd.DataFrame({'related_articles': related_col}, index=Y.index[:rows])

In [52]:
def recommender(Encoded, contextTable):
    """Attach to every article the indices of its most similar articles.

    Parameters
    ----------
    Encoded : pandas.DataFrame
        Numeric feature matrix, one row per article (e.g. the output of one of
        the encoding functions). It is not modified.
    contextTable : pandas.DataFrame
        Article metadata sharing the same index as ``Encoded``.

    Returns
    -------
    pandas.DataFrame
        ``contextTable`` left-joined on its index with a
        ``'related_articles'`` column produced by ``Cosinepairup``.
    """
    # Pairwise cosine similarity between all articles.
    Csim = cosine_similarity(Encoded)

    # Make sure we are working with a plain numpy array.
    npV = np.asarray(Csim)
    rows = npV.shape[0]

    # Give Cosinepairup a throw-away frame that carries only the article
    # index. Passing ``Encoded`` itself would let a string column be written
    # into the feature matrix, and a second call would then fail when
    # cosine_similarity meets that non-numeric column.
    matchHolder = pd.DataFrame(index=Encoded.index)

    # Match most related articles by article index.
    finalMatches = Cosinepairup(npV, rows, matchHolder)
    finalTable = contextTable.join(finalMatches, how='left')

    return finalTable

In [20]:
# Build the 0/1 presence matrix for every article over the vocabulary `fts`.
binEncoded = RecBinaryEncoding(fts, artText)
#binEncoded.head()

100%|██████████| 619/619 [00:05<00:00, 123.58it/s]


In [15]:
# Build the raw term-count matrix for every article and preview the first rows.
tfEncoded = TfEncoding(fts,artText)
tfEncoded.head()

tf Encoding


100%|██████████| 619/619 [00:03<00:00, 179.02it/s]


Unnamed: 0,preview,research,report,business,insider,intelligence,premium,service,learn,click,...,legitimately,plumped,misconceived,creeping,favoured,mooted,revitalise,piecemeal,tyrie,mettle
0,1,3,10,6,6,5,2,1,2,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,7,2,0,5,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,2,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Build the tf * 1/(df+1) matrix; tfidfEncoding recomputes the binary and
# term-count encodings internally, hence both progress bars in the output.
tfidfEncoded = tfidfEncoding(fts, artText)
tfidfEncoded.head()

tifidf Encoding
bin Encoding


100%|██████████| 619/619 [00:02<00:00, 217.43it/s]


tf Encoding


100%|██████████| 619/619 [00:03<00:00, 202.56it/s]


Unnamed: 0,preview,research,report,business,insider,intelligence,premium,service,learn,click,...,legitimately,plumped,misconceived,creeping,favoured,mooted,revitalise,piecemeal,tyrie,mettle
0,0.142857,0.032258,0.062112,0.022814,0.082192,0.111111,0.117647,0.009259,0.064516,0.015152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.043478,0.007605,0.0,0.111111,0.0,0.018519,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.006211,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.006211,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.007605,0.027397,0.0,0.0,0.0,0.0,0.015152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Join each article's most similar articles (by cosine similarity of the
# tf-idf rows) onto the original article table.
finalTable = recommender(tfidfEncoded, ArticleDB)

In [60]:
# Preview: the related_articles column holds row positions of similar articles.
finalTable.head()

Unnamed: 0,nonRel,Rel,url,prediction,title,description,source,date,content,origContent,related_articles
0,0.194252,0.805748,https://www.businessinsider.com/the-us-home-he...,1,THE US HOME HEALTHCARE REPORT: How US provider...,This is a preview of a research report from Bu...,business-insider,2019-02-20T15:34:00Z,This is a preview of a research report from Bu...,This is a preview of a research report from Bu...,"12, 431, 365"
1,0.202238,0.797762,https://www.washingtonpost.com/news/powerpost/...,1,The Technology 202: The British come to Silico...,And it may not be such a friendly visit.,the-washington-post,2019-02-19T13:55:26Z,Ctrl N FILES In this file photo taken on Ma...,Ctrl + N\r\n(FILES) In this file photo taken o...,"175, 120, 519"
2,0.223153,0.776847,https://www.foxnews.com/politics/trump-venezue...,1,Trump declares 'socialism is dying' amid Venez...,Trump declares 'socialism is dying' amid Venez...,fox-news,2019-02-18T22:42:05Z,President Donald Trump speaking in a major fo...,"President Donald Trump, speaking in a major fo...","3, 482, 9"
3,0.223153,0.776847,https://www.foxnews.com/politics/trump-venezue...,1,Trump declares 'socialism is dying' amid Venez...,Trump declares 'socialism is dying' amid Venez...,fox-news,2019-02-18T22:42:05Z,President Donald Trump speaking in a major fo...,"President Donald Trump, speaking in a major fo...","2, 482, 9"
4,0.22516,0.77484,https://www.businessinsider.com/alaska-permane...,1,The political debate over Alaska's universal c...,The Alaska Permanent Fund is a $65 billion fun...,business-insider,2019-02-20T20:59:28Z,Alaska s permanent fund and its annual univers...,Alaska's permanent fund and its annual univers...,"105, 319, 59"


In [56]:
# Persist the recommendations; the file is written to the kernel's current
# working directory.
finalTable.to_excel("TFidfEncodedNewStop.xlsx")