In [47]:
import json
import pandas as pd

In [48]:
df = pd.read_csv('benzinga.csv')
newsfeeds = df.to_json(orient='records')
newsfeeds = json.loads(newsfeeds)
len(newsfeeds)

14072

In [49]:
from gensim.models import KeyedVectors
from simhash import Simhash, SimhashIndex
import numpy as np

In [50]:
# omit the loeading messages, unlike the class exercise
model_w2v_AP = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [51]:
# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))
    
    output = vectors.n_similarity(s1words, s2words)
    return output

In [52]:
# Get a subset of 100 Webhose titles assign index values
feeds = []
i = 0
for feed in newsfeeds[:]:
    feed['id'] = i
    #print(feed['id'], str(feed['title']))
    i += 1
    feeds.append(feed)

In [53]:
import logging
logging.getLogger('simhash').setLevel(logging.CRITICAL)

hamming_distance = 10
objs = [(str(feed['id']), Simhash(str(feed['title']))) for feed in feeds]
index = SimhashIndex(objs, k=hamming_distance)

In [54]:
# this list keeps the first occurance of duplicated titles
list_dupi = set()
i = 0

# loop to find all the duplicate
while i < len(feeds):
    feed_hash = Simhash(str(feeds[i]['title']))
    dup_indices = index.get_near_dups(feed_hash)
    if dup_indices:
        dup_indices = sorted(set([int(ind) for ind in dup_indices]))
        list_dupi.update( dup_indices[1:] )
    i+=1
    if i in list_dupi:
        i+=1

In [55]:
len(list_dupi)

1130

In [66]:
counter = 0
max_prints = 10

for id in list_dupi:
    original = str(feeds[id]['title'])
    duplicates = [int(dup_id) for dup_id in index.get_near_dups(Simhash(original)) if int(dup_id) < len(feeds)]
    if not duplicates:
        continue
    
    duplicate_id = duplicates[0]
    duplicate = str(feeds[duplicate_id]['title'])
    
    similarity_score = calc_similarity(original, duplicate, model_w2v_AP)

    print(f"Original ({id}): {original}")
    print(f"Duplicate ({duplicate_id}): {duplicate}")
    print(f"Similarity Score: {similarity_score:.2f}, Distance in id: {abs(id-duplicate_id)}")
    print("-" * 60)
    
    counter += 1
    
    if counter >= max_prints:
        break


Original (4103): 7 Stocks To Watch For October 27, 2020
Duplicate (7559): 7 Stocks To Watch For October 27, 2021
Similarity Score: 1.00, Distance in id: 3456
------------------------------------------------------------
Original (4104): What Does Microsoft's Debt Look Like?
Duplicate (3532): How Does Microsoft's Debt Look?
Similarity Score: 0.91, Distance in id: 572
------------------------------------------------------------
Original (10249): Citigroup, Bank of America And 48 Stocks Moving In Friday's Mid-Day Session
Duplicate (3496): 70 Stocks Moving In Friday's Mid-Day Session
Similarity Score: 0.88, Distance in id: 6753
------------------------------------------------------------
Original (4108): Earnings Scheduled For October 27, 2020
Duplicate (4027): Earnings Scheduled For October 14, 2020
Similarity Score: 1.00, Distance in id: 81
------------------------------------------------------------
Original (2061): Stocks That Hit 52-Week Highs On Wednesday
Duplicate (2448): Stocks That

Observations:

- The titles identified as duplicates have high similarity scores, often > 0.8. 
- However, a closer examination of the titles shows that they often refer to different dates or different subjects. 
  For example, the titles '7 Stocks To Watch For October 27, 2020' and '7 Stocks To Watch For October 27, 2021' are 
  almost identical but refer to different years. Similarly, 'PreMarket Prep Stock Of The Day: Shopify' and 
  'PreMarket Prep Stock Of The Day: Apple' are about different stocks.
- Additionally, the distance in ID between purported duplicates is often substantial. Large gaps in IDs suggest 
  that the titles are from articles published at significantly different times, reinforcing the idea that these 
  are not true duplicates but rather recurrent topics covered by Benzinga over time.

Conclusion:

The original dataset obtained from the Benzinga API seems to be well-organized. Although there are titles with 
high similarity scores, the content differences and the ID gaps suggest that the dataset has been competently 
deduplicated. Users can have confidence in the uniqueness of articles sourced from this API.

In [58]:
# Since nothing needs to be deduplicated, below is not used

# feeds = [feed for feed in feeds if int(feed['id']) not in list_dupi]
# print(len(feeds))

# file_path = "benzinga.json"

# with open(file_path, "w") as file:
#     json.dump(feeds, file)