In [18]:
from dotenv import load_dotenv
from flask import Flask
from flask import render_template
from flask import request
from flask import url_for
import json
import os
import pandas as pd
import pinecone
import re
import requests
from sentence_transformers import SentenceTransformer
from statistics import mean
import swifter

app = Flask(__name__)

PINECONE_INDEX_NAME = "plagiarism-checker"
DATA_FILE = "articles.csv"
NROWS = 20000

def initialize_pinecone():
    load_dotenv()
    os.environ["PINECONE_API_KEY"] = "16210325-84fb-490b-bb1b-8e4be075eb38"
    pinecone.init(api_key="16210325-84fb-490b-bb1b-8e4be075eb38", environment="us-east1-gcp")

def delete_existing_pinecone_index():
    if PINECONE_INDEX_NAME in pinecone.list_indexes():
        pinecone.delete_index(PINECONE_INDEX_NAME)

def create_pinecone_index():
    pinecone.create_index(PINECONE_INDEX_NAME, dimension=8, metric="cosine", shards=1)
    pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)

    return pinecone_index

def create_model():
    model = SentenceTransformer('average_word_embeddings_komninos')

    return model

def prepare_data(data):
    # rename id column and remove unnecessary columns
    data.rename(columns={"Unnamed: 0": "article_id"}, inplace = True)
    data.drop(columns=['date'], inplace = True)

    # combine the article title and content into a single field
    data['content'] = data['content'].fillna('')
    data['content'] = data.content.swifter.apply(lambda x: ' '.join(re.split(r'(?<=[.:;])\s', x)))
    data['title_and_content'] = data['title'] + ' ' + data['content']

    # create a vector embedding based on title and article content
    encoded_articles = model.encode(data['title_and_content'], show_progress_bar=True)
    data['article_vector'] = pd.Series(encoded_articles.tolist())

    return data


#def upload_items(data):
    #items_to_upload = [(row.id, row.article_vector) for i, row in data.iterrows()]
    #vectors = [Vector(id=_id, values=vector) for _id, vector in items_to_upload]
    #vectors = [vector for _id, vector in items_to_upload]
    #pinecone_index.upsert(items=items_to_upload,  vectors=vectors)
    
#def upload_items(data):
    #items_to_upload = list(zip(data.index, data.vector))
    #vectors = []
    #for _id, vector in items_to_upload:
      #  vectors.append(Vector(id=_id, values=vector))
   # pinecone_index.upsert(items=items_to_upload, vectors=vectors)

def upload_items(data):
    if "vector" not in data.columns:
        raise ValueError("DataFrame is missing the 'vector' column.")
    items_to_upload = list(zip(data.index, data.vector))
    vectors = []
    for _id, vector in items_to_upload:
        vectors.append(vector)
    pinecone_index.add(vectors)

#def process_file(filename):
    #data = pd.read_csv(filename, nrows=NROWS)
    #data = prepare_data(data)
    #upload_items(data)
    #pinecone_index.info()

    #return data
    
def process_file(filename):
    if not column_exists(filename, "vector"):
        raise ValueError("The input file does not contain the required 'vector' column.")
    data = pd.read_csv(filename, nrows=NROWS, encoding='UTF-16')
    data = prepare_data(data)
    upload_items(data)
    
    return data

def column_exists(filename, column_name):
    with open(filename, "r") as file:
        header = file.readline().strip().split(",")
        return column_name in header
    
def map_titles(data):
    return dict(zip(uploaded_data.id, uploaded_data.title))

def map_publications(data):
    return dict(zip(uploaded_data.id, uploaded_data.publication))

def query_pinecone(originalContent):
    query_content = str(originalContent)
    query_vectors = [model.encode(query_content)]

    query_results = pinecone_index.query(queries=query_vectors, top_k=10)
    res = query_results[0]

    results_list = []

    for idx, _id in enumerate(res.ids):
        results_list.append({
            "id": _id,
            "title": titles_mapped[int(_id)],
            "publication": publications_mapped[int(_id)],
            "score": res.scores[idx],
        })

    return json.dumps(results_list)

initialize_pinecone()
delete_existing_pinecone_index()
pinecone_index = create_pinecone_index()
model = create_model()
uploaded_data = process_file(filename=DATA_FILE)
titles_mapped = map_titles(uploaded_data)
publications_mapped = map_publications(uploaded_data)

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/api/search", methods=["POST", "GET"])
def search():
    if request.method == "POST":
        return query_pinecone(request.form.get("originalContent", ""))
    if request.method == "GET":
        return query_pinecone(request.args.get("originalContent", ""))
    return "Only GET and POST methods are allowed for this endpoint"

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1898: character maps to <undefined>

In [10]:
!pip install pandas



In [3]:
!pip install flask



In [4]:
!pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.1


In [6]:
!pip install pinecone-client

Collecting pinecone-client
  Using cached pinecone_client-2.1.0-py3-none-any.whl (170 kB)
Collecting tqdm>=4.64.1
  Using cached tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
Collecting dnspython>=2.0.0
  Using cached dnspython-2.3.0-py3-none-any.whl (283 kB)
Collecting loguru>=0.5.0
  Using cached loguru-0.6.0-py3-none-any.whl (58 kB)
Collecting win32-setctime>=1.0.0
  Using cached win32_setctime-1.1.0-py3-none-any.whl (3.6 kB)
Installing collected packages: win32-setctime, tqdm, loguru, dnspython, pinecone-client
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.64.0
    Uninstalling tqdm-4.64.0:
      Successfully uninstalled tqdm-4.64.0
Successfully installed dnspython-2.3.0 loguru-0.6.0 pinecone-client-2.1.0 tqdm-4.64.1 win32-setctime-1.1.0


In [8]:
!pip install sentence_transformers

Collecting sentence_transformers
  Using cached sentence-transformers-2.2.2.tar.gz (85 kB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.26.0-py3-none-any.whl (6.3 MB)
Collecting torch>=1.6.0
  Downloading torch-1.13.1-cp39-cp39-win_amd64.whl (162.5 MB)
Collecting torchvision
  Downloading torchvision-0.14.1-cp39-cp39-win_amd64.whl (1.1 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
Collecting huggingface-hub>=0.4.0
  Using cached huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-win_amd64.whl (3.3 MB)
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125940 sha256=10593f1c23

In [10]:
!pip install 

Collecting swifter
  Using cached swifter-1.3.4.tar.gz (830 kB)
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py): started
  Building wheel for swifter (setup.py): finished with status 'done'
  Created wheel for swifter: filename=swifter-1.3.4-py3-none-any.whl size=16321 sha256=d13943d86d0484166264c736bdc1a9ba639193b502d3b98cc158928a85b03371
  Stored in directory: c:\users\elitebook\appdata\local\pip\cache\wheels\2b\5e\f2\3931524f702ffd03309e96d35ee2fbf9c61c27377511ee8d4c
Successfully built swifter


Installing collected packages: swifter
Successfully installed swifter-1.3.4


In [39]:
!pip install my_module

Collecting my_module
  Downloading my_module-1.6.2.zip (967 bytes)
Building wheels for collected packages: my-module
  Building wheel for my-module (setup.py): started
  Building wheel for my-module (setup.py): finished with status 'done'
  Created wheel for my-module: filename=my_module-1.6.2-py3-none-any.whl size=1427 sha256=12d287fa025c1f43bf92ea54643e9a0965ad8a9d34007f43aeaa29a227ca279e
  Stored in directory: c:\users\elitebook\appdata\local\pip\cache\wheels\f7\ed\98\51de11048a1c870d648ccd0344bd11c620a8c020ba3b0016f3
Successfully built my-module
Installing collected packages: my-module
Successfully installed my-module-1.6.2


In [41]:
!pip install Vector

Collecting Vector
  Downloading vector-0.11.0-py3-none-any.whl (168 kB)
Installing collected packages: Vector
Successfully installed Vector-0.11.0


In [11]:
!pip install Vector



In [14]:
!pip install sentence_transformers

