In [9]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import nltk
import pdfplumber
from sentence_transformers import SentenceTransformer, util
import re


In [10]:
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

<flask_cors.extension.CORS at 0x1c5b3e22e00>

In [11]:
# Download NLTK data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:

def model(user_ques_1, user_ques_2):
    # Stop word removal
    stop = stopwords.words('english')
    ques_1_nstop = ' '.join([word for word in user_ques_1.split() if word not in stop])
    ques_2_nstop = ' '.join([word for word in user_ques_2.split() if word not in stop])

    # Tokenize the questions
    tok_ques_1 = nltk.word_tokenize(ques_1_nstop)
    tok_ques_2 = nltk.word_tokenize(ques_2_nstop)
    tok_ques_1_str = ' '.join(map(str, tok_ques_1))
    tok_ques_2_str = ' '.join(map(str, tok_ques_2))

    # Lemmatize data
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    def lemmatize_text(text):
        return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

    ques_1_lemm_str = lemmatize_text(tok_ques_1_str)
    ques_2_lemm_str = lemmatize_text(tok_ques_2_str)

    # Convert lemmatized data to lower case
    ques_1_lemm_str = ques_1_lemm_str.lower()
    ques_2_lemm_str = ques_2_lemm_str.lower()

    # Text similarity scores obtained using 'paraphrase-MiniLM-L3-v2' BERT model
    st = time.time()
    embd1 = model1.encode(ques_1_lemm_str, convert_to_tensor=True)
    embd2 = model1.encode(ques_2_lemm_str, convert_to_tensor=True)
    cosine_scores1 = util.pytorch_cos_sim(embd1, embd2)
    et = time.time()
    elapsed_time = et - st

    # Text similarity scores obtained using BERT model 'all-distilroberta-v1'
    st = time.time()
    embd1 = model2.encode(ques_1_lemm_str, convert_to_tensor=True)
    embd2 = model2.encode(ques_2_lemm_str, convert_to_tensor=True)
    cosine_scores2 = util.pytorch_cos_sim(embd1, embd2)
    et = time.time()
    elapsed_time = et - st

    # Text similarity scores obtained using BERT model 'multi-qa-distilbert-cos-v1'
    st = time.time()
    embd1 = model3.encode(ques_1_lemm_str, convert_to_tensor=True)
    embd2 = model3.encode(ques_2_lemm_str, convert_to_tensor=True)
    cosine_scores3 = util.pytorch_cos_sim(embd1, embd2)
    et = time.time()
    elapsed_time = et - st

    # Calculate average similarity score
    avg_similarity_score = (cosine_scores1.item() + cosine_scores2.item() + cosine_scores3.item()) / 3

    return avg_similarity_score


In [13]:
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

In [14]:
@app.route('/process_pdfs', methods=['POST'])
def process_pdfs():
    try:
        # Assuming you are sending the PDF files in the request
        file1 = request.files['file1']
        file2 = request.files['file2']

        # Extract text from both PDF files
        question_paper1_text = extract_text_from_pdf(file1)
        question_paper2_text = extract_text_from_pdf(file2)

        # Use regular expressions to split the text into individual questions
        arr1 = re.split(r'\d+\.', question_paper1_text)
        arr2 = re.split(r'\d+\.', question_paper2_text)

        # Remove any leading or trailing whitespace from the questions
        arr1 = [question.strip() for question in arr1 if question.strip()]
        arr2 = [question.strip() for question in arr2 if question.strip()]

        new_paper = []

        for q1 in arr1:
            for q2 in arr2:
                similarity = model(q1, q2)
                if similarity > 0.7:
                    new_paper.append(q1)
                    break  # Break to avoid adding duplicates

        # Add questions from arr2 that didn't have matches
        for q2 in arr2:
            if q2 not in new_paper:
                new_paper.append(q2)

        # Add questions from arr1 that didn't have matches
        for q1 in arr1:
            if q1 not in new_paper:
                new_paper.append(q1)

        # Respond with the result
        return jsonify({"result": new_paper})

    except Exception as e:
        return jsonify({"error": str(e)})

In [None]:

if __name__ == '__main__':
    app.run(debug=True, port=5000, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
