In [1]:
# Import the requird modules
# App's dependencies
from dotenv import load_dotenv
from flask import Flask
from flask import render_template
from flask import request
from flask import url_for
import json
import os
import pandas as pd
import pinecone
import re
import requests
from sentence_transformers import SentenceTransformer
from statistics import mean
import swifter

In [2]:

!pip install -U pinecone-client

Requirement already up-to-date: pinecone-client in c:\users\umrah\anaconda3\lib\site-packages (2.0.13)


In [3]:
pip install --ignore-installed PyYAML

Collecting PyYAML
  Using cached PyYAML-6.0-cp38-cp38-win_amd64.whl (155 kB)
Installing collected packages: PyYAML
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'C:\\Users\\umrah\\anaconda3\\Lib\\site-packages\\yaml\\_yaml.cp38-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [None]:
conda install -c conda-forge swifter

In [5]:
# Boilerplate code to  tell Flask the name of the app
app = Flask(__name__)

In [6]:
# Define constants that would be used in the app
PINECONE_INDEX_NAME = "plagiarism-checker" # Pinecone index
DATA_FILE = "articles1.csv" # File name of the dataset
NROWS = 20000 # Number of rows to read from the CSV file

In [28]:
# Use initialize_pinecone method and the API key is used to initialise Pinecone
def initialize_pinecone():
    PINECONE_API_KEY = '51d9d229-57a8-445b-880e-6613be475afc'
    pinecone.init(api_key=PINECONE_API_KEY)

In [8]:
# The delete_existing_pinecone_index method searches Pinecone instance for indexes with 
# the same name as the one that's being used(“plagiarism-checker”). If an existing index is found, it is deleted.
def delete_existing_pinecone_index():
    if PINECONE_INDEX_NAME in pinecone.list_indexes():
        pinecone.delete_index(PINECONE_INDEX_NAME)

In [13]:
#  The create_pinecone_index method creates a new index using the name we chose (“plagiarism-checker”), 
# the “cosine” proximity metric, and only one shard.
def create_pinecone_index():
    pinecone.create_index(name=PINECONE_INDEX_NAME, metric="cosine", shards=1)
    pinecone_index = pinecone.Index(name=PINECONE_INDEX_NAME)
    return pinecone_index


In [14]:
# The our create_model method uses the sentence_transformers library to work with the Average Word Embeddings Model. We’ll 
# encode the vector embeddings using this model later
def create_model():
    model = SentenceTransformer('average_word_embeddings_komninos')

    return model

In [20]:
# The prepare_data method adjusts the dataset by renaming the first "id" column and dropping the "date" column
# It then combines the article title with the article content into a single field.
# This combined field would be used when creating vector embeddings
def prepare_data(data):
    # rename id column and remove unnecessary columns
    data.rename(columns={"Unnamed: 0": "article_id"}, inplace = True)
    data.drop(columns=['date'], inplace = True)
     # combine the article title and content into a single field
    data['content'] = data['content'].fillna('')
    data['content'] = data.content.swifter.apply(lambda x: ' '.join(re.split(r'(?<=[.:;])\s', x)))
    data['title_and_content'] = data['title'] + ' ' + data['content']
    # create a vector embedding based on title and article content
    encoded_articles = model.encode(data['title_and_content'], show_progress_bar=True)
    data['article_vector'] = pd.Series(encoded_articles.tolist())

    return data

In [21]:
# The upload_items method creates a vector embedding for each article by encoding it using our model
# The vector embeddings are then inserted into the Pinecone index
def upload_items(data):
    items_to_upload = [(row.id, row.article_vector) for i, row in data.iterrows()]
    pinecone_index.upsert(items=items_to_upload)

In [22]:
# The process_file method reads the CSV file and then calls the prepare_data and upload_items methods on it
def process_file(filename):
    data = pd.read_csv(filename, nrows=NROWS)
    data = prepare_data(data)
    upload_items(data)
    pinecone_index.info()

    return data

In [23]:
# The map_titles and map_publications methods create some dictionaries of the titles and and publications names to make it easier to find articles by IDs later 
def map_titles(data):
    return dict(zip(uploaded_data.id, uploaded_data.title))

def map_publications(data):
    return dict(zip(uploaded_data.id, uploaded_data.publication))


In [24]:
# The query_pinecone method takes the user's article content input, converts it into a vector embedding,and then queries the Pinecone index to find similar articles
# This method is called when the /api/search endpoint is hit, which occurs any time the user submits a new search query
def query_pinecone(originalContent):
    query_content = str(originalContent)
    query_vectors = [model.encode(query_content)]

    query_results = pinecone_index.query(queries=query_vectors, top_k=10)
    res = query_results[0]

    results_list = []

    for idx, _id in enumerate(res.ids):
        results_list.append({
            "id": _id,
            "title": titles_mapped[int(_id)],
            "publication": publications_mapped[int(_id)],
            "score": res.scores[idx],
        })

    return json.dumps(results_list)

In [29]:
# Each of the methods described so far are called when the backend app is started
# This would prepare for the final step of actually querying the Pinecone index based on user input
initialize_pinecone()
delete_existing_pinecone_index()
model = create_model()
uploaded_data = process_file(filename=DATA_FILE)
titles_mapped = map_titles(uploaded_data)
publications_mapped = map_publications(uploaded_data)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=690.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=266766827.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2589750.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=164.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=190.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2135.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=122.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=248.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=20000.0), HTML(value='')))




AttributeError: 'NoneType' object has no attribute 'encode'