In [37]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
from sentence_transformers import SentenceTransformer
import json

In [38]:
model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.6B.300d')

In [2]:
article1 = pd.read_csv("data/articles1.csv")
article2 = pd.read_csv("data/articles2.csv")
article3 = pd.read_csv("data/articles3.csv")

In [3]:
article1.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [4]:
article2.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,53293,73471,Patriots Day Is Best When It Digs Past the Her...,Atlantic,David Sims,2017-01-11,2017.0,1.0,,"Patriots Day, Peter Berg’s new thriller that r..."
1,53294,73472,A Break in the Search for the Origin of Comple...,Atlantic,Ed Yong,2017-01-11,2017.0,1.0,,"In Norse mythology, humans and our world were ..."
2,53295,73474,Obama’s Ingenious Mention of Atticus Finch,Atlantic,Spencer Kornhaber,2017-01-11,2017.0,1.0,,“If our democracy is to work in this increasin...
3,53296,73475,"Donald Trump Meets, and Assails, the Press",Atlantic,David A. Graham,2017-01-11,2017.0,1.0,,Updated on January 11 at 5:05 p. m. In his fir...
4,53297,73476,Trump: ’I Think’ Hacking Was Russian,Atlantic,Kaveh Waddell,2017-01-11,2017.0,1.0,,Updated at 12:25 p. m. After months of equivoc...


In [5]:
article3.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,103459,151908,Alton Sterling’s son: ’Everyone needs to prote...,Guardian,Jessica Glenza,2016-07-13,2016.0,7.0,https://www.theguardian.com/us-news/2016/jul/1...,The son of a Louisiana man whose father was sh...
1,103460,151909,Shakespeare’s first four folios sell at auctio...,Guardian,,2016-05-25,2016.0,5.0,https://www.theguardian.com/culture/2016/may/2...,Copies of William Shakespeare’s first four boo...
2,103461,151910,My grandmother’s death saved me from a life of...,Guardian,Robert Pendry,2016-10-31,2016.0,10.0,https://www.theguardian.com/commentisfree/2016...,"Debt: $20, 000, Source: College, credit cards,..."
3,103462,151911,I feared my life lacked meaning. Cancer pushed...,Guardian,Bradford Frost,2016-11-26,2016.0,11.0,https://www.theguardian.com/commentisfree/2016...,"It was late. I was drunk, nearing my 35th birt..."
4,103463,151912,Texas man serving life sentence innocent of do...,Guardian,,2016-08-20,2016.0,8.0,https://www.theguardian.com/us-news/2016/aug/2...,A central Texas man serving a life sentence fo...


In [12]:
df1 = article1.copy()
df2 = article2.copy()
df3 = article3.copy()

In [13]:
df1 = df1.drop(columns=['Unnamed: 0',"date","year","month"])
df2 = df2.drop(columns=['Unnamed: 0',"date","year","month"])
df3 = df3.drop(columns=['Unnamed: 0',"date","year","month"])

In [16]:
df1.set_index("id",inplace=True)
df2.set_index("id",inplace=True)
df3.set_index("id",inplace=True)

In [39]:
# Preprocessing function 
def preprocess_doc(doc):
    # Convert to lowercase
    doc = doc.lower()
    # Remove numbers and punctuation
    doc = re.sub(r'\d+', '', doc)
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the document
    tokens = nltk.word_tokenize(doc)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if not token in stop_words]
    # Stemming the tokens
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(token) for token in tokens]
    # Join the tokens back into a string
    preprocessed_doc = ' '.join(tokens)
    return preprocessed_doc

In [20]:
vector1 = [preprocess_doc(text) for text in df1["content"]]

In [21]:
vector2 = [preprocess_doc(text) for text in df2["content"]]

In [22]:
vector3 = [preprocess_doc(text) for text in df3["content"]]

In [30]:
encoded_vector1 = model.encode(vector1)

In [31]:
encoded_vector2 = model.encode(vector2)

In [32]:
encoded_vector3 = model.encode(vector3)

In [40]:
df1["content_vector"] = encoded_vector1.tolist()
df2["content_vector"] = encoded_vector2.tolist()
df3["content_vector"] = encoded_vector3.tolist()

In [44]:
combined_data = pd.concat([df1,df2,df3])

In [49]:
combined_data.dtypes

title             object
publication       object
author            object
url               object
content           object
content_vector    object
dtype: object

In [40]:
combined_data.to_parquet("data/combined_data.parquet",index=True)

NameError: name 'combined_data' is not defined

In [41]:
combined_data = pd.read_parquet("data/combined_data.parquet")


In [None]:
combined_data.isna().sum()

title                 2
publication           0
author            15876
url               57011
content               0
content_vector        0
dtype: int64

In [42]:
df = combined_data.copy()
df = df.dropna(subset=['title','author'])

In [43]:
df.head()

Unnamed: 0_level_0,title,publication,author,url,content,content_vector
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,,WASHINGTON — Congressional Republicans have...,"[-0.03487817198038101, -0.030575549229979515, ..."
17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,,"After the bullet shells get counted, the blood...","[-0.0900963693857193, 0.048336535692214966, -0..."
17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,,"When Walt Disney’s “Bambi” opened in 1942, cri...","[-0.05581660568714142, -0.0249489676207304, -0..."
17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,,"Death may be the great equalizer, but it isn’t...","[-0.03903885558247566, 0.012085803784430027, 0..."
17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,,"SEOUL, South Korea — North Korea’s leader, ...","[-0.12088580429553986, 0.08755343407392502, 0...."


In [52]:
api_key = "8c5d315d-5722-47cf-94b6-71325e3a1c3f"
environment = "us-west4-gcp"
index_name = "plagiarism-search"

In [53]:
import pinecone
pinecone.init(api_key=api_key, environment=environment)

In [48]:
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, dimension=300)

MaxRetryError: HTTPSConnectionPool(host='controller.us-west4-gcp.pinecone.io', port=443): Max retries exceeded with url: /databases (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000282222D6CA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [54]:
index = pinecone.Index(index_name)

In [None]:
# Format for insertion (id, vectors, metadata)

In [51]:
insertion_data = []
count = 0
for i, row in df.iterrows():
    count += 1
    insertion_data.append(
        {
            "id" : str(row.name),
            "values" : row['content_vector'].tolist(),
            "metadata" : {
                "title" : row['title'],
                "author" : row["author"],
                "publication" : row["publication"]
            }
        }
    )
    if count >= 1000 :
        index.upsert(insertion_data)
        insertion_data = []
        count = 0
index.upsert(insertion_data)


MaxRetryError: HTTPSConnectionPool(host='plagiarism-search-unknown.svc.us-west4-gcp.pinecone.io', port=443): Max retries exceeded with url: /vectors/upsert (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000282789F9F10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [11]:
upload_data = pd.DataFrame(insertion_data)
upload_data.head()

NameError: name 'insertion_data' is not defined

In [12]:
query_sent = "'washington — congressional republicans new fear comes health care lawsuit obama administration might win incoming trump administration could choose longer defend executive branch suit challenges administration ’ authority spend billions dollars health insurance subsidies americans handing house republicans big victory issues sudden loss disputed subsidies could conceivably cause health care program implode leaving millions people without access health insurance republicans prepared replacement could lead chaos insurance market spur political backlash republicans gain full control government stave outcome republicans could find awkward position appropriating huge sums temporarily prop obama health care law angering conservative voters demanding end law years another twist donald j trump ’ administration worried preserving executive branch prerogatives could choose fight republican allies house central questions dispute eager avoid ugly political pileup republicans capitol hill trump transition team gaming handle lawsuit election put limbo least late february united states court appeals district columbia circuit yet ready divulge strategy “ given pending litigation involves obama administration congress would inappropriate comment ” said phillip j blando spokesman trump transition effort “ upon taking office trump administration evaluate case related aspects affordable care act ” potentially decision judge rosemary collyer ruled house republicans standing sue executive branch spending dispute obama administration distributing health insurance subsidies violation constitution without approval congress justice department confident judge collyer ’ decision would reversed quickly appealed subsidies remained place appeal successfully seeking temporary halt proceedings mr trump house republicans last month told court “ ’ transition team currently discussing potential options resolution matter take effect ’ inauguration jan ” suspension case house lawyers said “ provide future administration time consider whether continue prosecuting otherwise resolve appeal ” republican leadership officials house acknowledge possibility “ cascading effects ” payments totaled estimated billion suddenly stopped insurers receive subsidies exchange paying costs deductibles eligible consumers could race drop coverage since would losing money loss subsidies could destabilize entire program cause lack confidence leads insurers seek quick exit well anticipating trump administration might inclined mount vigorous fight house republicans given ’ dim view health care law team lawyers month sought intervene case behalf two participants health care program request lawyers predicted deal house republicans new administration dismiss settle case “ produce devastating consequences individuals receive reductions well nation ’ health insurance health care systems generally ” matter happens house republicans say want prevail two overarching concepts congressional power purse right congress sue executive branch violates constitution regarding spending power house republicans contend congress never appropriated money subsidies required constitution suit initially championed john boehner house speaker time later house committee reports republicans asserted administration desperate funding required treasury department provide despite widespread internal skepticism spending proper white house said spending permanent part law passed annual appropriation required — even though administration initially sought one important house republicans judge collyer found congress standing sue white house issue — ruling many legal experts said flawed — want precedent set restore congressional leverage executive branch spending power standing trump administration may come pressure advocates presidential authority fight house matter shared views health care since precedents could broad repercussions complicated set dynamics illustrating quick legal victory house trump era might come costs republicans never anticipated took obama white house',"
preprocessed_sentence = preprocess_doc(query_sent)
encoded_sent = model.encode(preprocessed_sentence).tolist()

In [29]:
result = index.query(encoded_sent, top_k=5, include_metadata=True)
result

{'matches': [{'id': '17283',
              'metadata': {'author': 'Carl Hulse',
                           'publication': 'New York Times',
                           'title': 'House Republicans Fret About Winning '
                                    'Their Health Care Suit - The New York '
                                    'Times'},
              'score': 0.99999994,
              'values': []},
             {'id': '89566',
              'metadata': {'author': 'Alice Ollstein',
                           'publication': 'Talking Points Memo',
                           'title': 'Republicans Buy Time In Case That Could '
                                    'Destabilize Obamacare'},
              'score': 0.982497156,
              'values': []},
             {'id': '77320',
              'metadata': {'author': 'Russell Berman',
                           'publication': 'Atlantic',
                           'title': '\u200bHow Republicans Finally Got a '
                             

In [30]:
scores = []
for res in result["matches"]:
    scores.append(str(res["score"]*100) + " %")
scores

['99.999994 %', '98.2497156 %', '98.1007457 %', '97.8566706 %', '97.7684855 %']

In [36]:
id, title, authors = [], [], []
for res in result["matches"]:
        id.append(res['id'])
        title.append(res['metadata']['title'])
        authors.append(res['metadata']['publication'])
print(id)


['17283', '89566', '77320', '198371', '189202']


In [19]:
for res in  result["matches"]:
    print(f"ID : {res['id']} \n Title : {res['metadata']['title']} \n Author : {res['metadata']['author']} \n Publication : {res['metadata']['publication']} \n Score : {res['score']*100 : .2f}")

ID : 17283 
 Title : House Republicans Fret About Winning Their Health Care Suit - The New York Times 
 Author : Carl Hulse 
 Publication : New York Times 
 Score :  100.00
ID : 89566 
 Title : Republicans Buy Time In Case That Could Destabilize Obamacare 
 Author : Alice Ollstein 
 Publication : Talking Points Memo 
 Score :  98.25
ID : 77320 
 Title : ​How Republicans Finally Got a Victory on Obamacare 
 Author : Russell Berman 
 Publication : Atlantic 
 Score :  98.10


In [25]:
def query_topk(input_sentence, topk = 10):
    preprocessed_Sent = preprocess_doc(input_sentence)
    encoded_query = model.encode(preprocessed_Sent).tolist()
    results = index.query(encoded_query, top_k = topk, include_metadata=True)
    id = titles = authors = publications = scores = []
    for res in results["matches"]:
        id.append(res['id'])
        titles.append(res['metadata']['title'])
        authors.append(res['metadata']['publication'])
        publications.append(res['metadata']['author'])
        scores.append(str(res['score']*100) + " %")
    column_names = ["ID", "Title", "Publication", "Author", "Similarity Percentage"]
    query_results = [column_names, id, titles, publications, authors, scores]
    return query_results

In [15]:
query_sent = "'washington — congressional republicans new fear comes health care lawsuit obama administration might win incoming trump administration could choose longer defend executive branch suit challenges administration ’ authority spend billions dollars health insurance subsidies americans handing house republicans big victory issues sudden loss disputed subsidies could conceivably cause health care program implode leaving millions people without access health insurance republicans prepared replacement could lead chaos insurance market spur political backlash republicans gain full control government stave outcome republicans could find awkward position appropriating huge sums temporarily prop obama health care law angering conservative voters demanding end law years another twist donald j trump ’ administration worried preserving executive branch prerogatives could choose fight republican allies house central questions dispute eager avoid ugly political pileup republicans capitol hill trump transition team gaming handle lawsuit election put limbo least late february united states court appeals district columbia circuit yet ready divulge strategy “ given pending litigation involves obama administration congress would inappropriate comment ” said phillip j blando spokesman trump transition effort “ upon taking office trump administration evaluate case related aspects affordable care act ” potentially decision judge rosemary collyer ruled house republicans standing sue executive branch spending dispute obama administration distributing health insurance subsidies violation constitution without approval congress justice department confident judge collyer ’ decision would reversed quickly appealed subsidies remained place appeal successfully seeking temporary halt proceedings mr trump house republicans last month told court “ ’ transition team currently discussing potential options resolution matter take effect ’ inauguration jan ” suspension case house lawyers said “ provide future administration time consider whether continue prosecuting otherwise resolve appeal ” republican leadership officials house acknowledge possibility “ cascading effects ” payments totaled estimated billion suddenly stopped insurers receive subsidies exchange paying costs deductibles eligible consumers could race drop coverage since would losing money loss subsidies could destabilize entire program cause lack confidence leads insurers seek quick exit well anticipating trump administration might inclined mount vigorous fight house republicans given ’ dim view health care law team lawyers month sought intervene case behalf two participants health care program request lawyers predicted deal house republicans new administration dismiss settle case “ produce devastating consequences individuals receive reductions well nation ’ health insurance health care systems generally ” matter happens house republicans say want prevail two overarching concepts congressional power purse right congress sue executive branch violates constitution regarding spending power house republicans contend congress never appropriated money subsidies required constitution suit initially championed john boehner house speaker time later house committee reports republicans asserted administration desperate funding required treasury department provide despite widespread internal skepticism spending proper white house said spending permanent part law passed annual appropriation required — even though administration initially sought one important house republicans judge collyer found congress standing sue white house issue — ruling many legal experts said flawed — want precedent set restore congressional leverage executive branch spending power standing trump administration may come pressure advocates presidential authority fight house matter shared views health care since precedents could broad repercussions complicated set dynamics illustrating quick legal victory house trump era might come costs republicans never anticipated took obama white house',"


In [26]:
results = query_topk(query_sent)
results

[['ID', 'Title', 'Publication', 'Author', 'Similarity Percentage'],
 ['17283',
  'House Republicans Fret About Winning Their Health Care Suit - The New York Times',
  'New York Times',
  'Carl Hulse',
  '99.999994 %',
  '89566',
  'Republicans Buy Time In Case That Could Destabilize Obamacare',
  'Talking Points Memo',
  'Alice Ollstein',
  '98.2497156 %',
  '77320',
  '\u200bHow Republicans Finally Got a Victory on Obamacare',
  'Atlantic',
  'Russell Berman',
  '98.1007457 %',
  '198371',
  'Trump’s ominous threat to withhold payment from health insurers, explained',
  'Vox',
  'Nicholas Bagley',
  '97.8566706 %',
  '189202',
  'Republicans win Obamacare legal challenge, add to insurer concerns',
  'Reuters',
  'Lawrence Hurley',
  '97.7684855 %',
  '91569',
  'Insurer Group Says HHS Statement On Continuing Obamacare Subsidies Is Not Enough',
  'Talking Points Memo',
  'Tierney Sneed',
  '97.719872 %',
  '216738',
  'Obamacare’s future in critical condition with Trump’s victory',
  '