In [1]:
import spacy
import json
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokens import Doc
# from fuzzywuzzy import fuzz

In [2]:
# Load the "en_core_web_md" spaCy pipeline once. Every later cell calls `nlp(...)`
# to parse texts, and the scoring cell compares the parsed results with `.similarity`.
nlp = spacy.load("en_core_web_md")

In [3]:
# Load the USER ANSWERS records from the JSON file
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Parse each answer text with spaCy, keeping the record id next to the parsed result
user_data = [
    {'id': record['id'], 'text': nlp(record['text'])}
    for record in data
]

In [4]:
# Load the DATABASE records (reference questions and answers) from the JSON file
with open('dummy.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Parse both the question and the answer of every record with spaCy
database_data = [
    {
        'id': record['id'],
        'question': nlp(record['question']),
        'answer': nlp(record['answer']),
    }
    for record in data
]

# database_data now holds the parsed question/answer pairs from the JSON file,
# ready for further analysis or similarity comparison

In [5]:
# Display the parsed user answers (one dict per record with 'id' and the parsed 'text').
user_data

[{'id': 1, 'text': France's capital is Paris},
 {'id': 2, 'text': William Shakespeare},
 {'id': 3, 'text': Saturn},
 {'id': 4, 'text': There are seven continents},
 {'id': 5, 'text': Avacado is the main ingredient.}]

In [6]:
# Display the parsed database records (one dict per record with 'id', 'question', 'answer').
database_data

[{'id': 1,
  'question': What is the capital of France?,
  'answer': The capital of France is Paris.},
 {'id': 2,
  'question': Who wrote 'Romeo and Juliet'?,
  'answer': William Shakespeare wrote 'Romeo and Juliet'.},
 {'id': 3,
  'question': What is the largest planet in our solar system?,
  'answer': Jupiter is the largest planet in our solar system.},
 {'id': 4,
  'question': How many continents are there on Earth?,
  'answer': There are seven continents on Earth.},
 {'id': 5,
  'question': What is the main ingredient in guacamole?,
  'answer': The main ingredient in guacamole is avocado.}]

In [7]:
# Strip stop words from every user answer and from the database answer with the
# same 'id' - AN ATTEMPT TO INCREASE WORD SIMILARITY

def _without_stop_words(doc):
    """Return the texts of the non-stop-word tokens of `doc`, joined by single spaces."""
    return ' '.join(token.text for token in doc if not token.is_stop)


for user_answer in user_data:
    # Stop-word-free version of the user's answer, stored as a plain string
    user_answer["filtered_user_answer"] = _without_stop_words(nlp(user_answer["text"]))

    # First database entry sharing this answer's 'id' (None when there is no match)
    db_answer = next((item for item in database_data if item["id"] == user_answer["id"]), None)
    if not db_answer:
        continue

    # Stop-word-free version of the reference answer, stored on the database entry
    db_answer["filtered_db_answer"] = _without_stop_words(nlp(db_answer["answer"]))

In [8]:
# Remove from the user answer and the database answer every word that already
# appears in the database question, so only the informative part is compared.

def _novel_words(doc, excluded_texts):
    """Return the distinct, non-punctuation token texts of `doc` not in `excluded_texts`.

    Tokens keep their first-occurrence order and are joined by single spaces.
    Duplicates are tracked by token *text*: comparing a text against a list of
    Token objects (as the previous version did) never matches, so repeated words
    were silently kept.
    """
    seen_texts = set()
    kept = []
    for token in doc:
        text = token.text
        if text in excluded_texts or text in seen_texts:
            continue
        seen_texts.add(text)
        # Punctuation still counts as "seen" but is left out of the final string.
        if not token.is_punct:
            kept.append(text)
    return ' '.join(kept)


for user_answer in user_data:
    # First database entry sharing this answer's 'id' (None when there is no match)
    db_answer = next((item for item in database_data if item["id"] == user_answer["id"]), None)
    if not db_answer:
        continue

    # Exact (case-sensitive) token texts of the question; a set gives O(1) lookups.
    question_words = {token.text for token in nlp(db_answer["question"])}

    db_answer["final_answer"] = _novel_words(nlp(db_answer["filtered_db_answer"]), question_words)
    user_answer["final_answer"] = _novel_words(nlp(user_answer["filtered_user_answer"]), question_words)

In [9]:
# # removing words from the database answer that are contained in the database question - AN ATTEMPT TO INCREASE WORD SIMILARITY
# for db_answer in database_data:
#     final_answer = []
#     db_qn_doc = nlp(db_answer["question"])
#     db_ans_doc = nlp(db_answer["filtered_db_answer"])

#     # for avoid in db_qn_doc:
#     #     for word in db_ans_doc:
#     #         if (word != avoid):
#     #             if not (word in final_answer) :
#     #                 final_answer.append(word)

#     qn_words_list = [token.text for token in db_qn_doc]
#     for word in db_ans_doc:
#         if not (word.text in qn_words_list):
#             if not (word.text in final_answer):
#                 final_answer.append(word)

In [11]:
# Function to calculate similarity between two documents
def calculate_similarity(user_doc, database_doc):
    """Return the similarity of `user_doc` to `database_doc`.

    Thin wrapper: the value is whatever ``user_doc.similarity(database_doc)``
    returns, passed through unchanged.
    """
    score = user_doc.similarity(database_doc)
    return score

# Score each user answer against the database answer with the same 'id'
for user_answer in user_data:
    # Look the reference entry up BEFORE reading "final_answer": the word-removal
    # cell only sets that key when a matching database entry exists, so reading it
    # first would raise KeyError for unmatched answers and the guard below would
    # never get a chance to skip them.
    db_answer = next((item for item in database_data if item["id"] == user_answer["id"]), None)
    if not db_answer:
        continue

    user_doc = nlp(user_answer["final_answer"])
    database_doc = nlp(db_answer["final_answer"])

    # Store the similarity score on the user entry for later aggregation
    user_answer["similarity_score"] = calculate_similarity(user_doc, database_doc)

# Now user_data contains a similarity score for each answer that has a database match

  return user_doc.similarity(database_doc)


In [12]:
# Display the user answers again, now with the filtered/final answers and similarity scores added.
user_data

[{'id': 1,
  'text': France's capital is Paris,
  'filtered_user_answer': 'France capital Paris',
  'final_answer': 'Paris',
  'similarity_score': 1.0},
 {'id': 2,
  'text': William Shakespeare,
  'filtered_user_answer': 'William Shakespeare',
  'final_answer': 'William Shakespeare',
  'similarity_score': 1.0},
 {'id': 3,
  'text': Saturn,
  'filtered_user_answer': 'Saturn',
  'final_answer': 'Saturn',
  'similarity_score': 0.7763198775373695},
 {'id': 4,
  'text': There are seven continents,
  'filtered_user_answer': 'seven continents',
  'final_answer': 'seven',
  'similarity_score': 1.0},
 {'id': 5,
  'text': Avacado is the main ingredient.,
  'filtered_user_answer': 'Avacado main ingredient .',
  'final_answer': 'Avacado',
  'similarity_score': 0.0}]

In [13]:
# Display the database records again, now with the filtered and final answers added.
database_data

[{'id': 1,
  'question': What is the capital of France?,
  'answer': The capital of France is Paris.,
  'filtered_db_answer': 'capital France Paris .',
  'final_answer': 'Paris'},
 {'id': 2,
  'question': Who wrote 'Romeo and Juliet'?,
  'answer': William Shakespeare wrote 'Romeo and Juliet'.,
  'filtered_db_answer': "William Shakespeare wrote ' Romeo Juliet ' .",
  'final_answer': 'William Shakespeare'},
 {'id': 3,
  'question': What is the largest planet in our solar system?,
  'answer': Jupiter is the largest planet in our solar system.,
  'filtered_db_answer': 'Jupiter largest planet solar system .',
  'final_answer': 'Jupiter'},
 {'id': 4,
  'question': How many continents are there on Earth?,
  'answer': There are seven continents on Earth.,
  'filtered_db_answer': 'seven continents Earth .',
  'final_answer': 'seven'},
 {'id': 5,
  'question': What is the main ingredient in guacamole?,
  'answer': The main ingredient in guacamole is avocado.,
  'filtered_db_answer': 'main ingred

In [14]:
# Calculating the average of the scores.
# Only entries that actually carry a "similarity_score" are averaged: the scoring
# loop stores a score only when a matching database entry exists, so indexing the
# key unconditionally would raise KeyError for unmatched answers.
scores = [
    user_entry["similarity_score"]
    for user_entry in user_data
    if "similarity_score" in user_entry
]

user_similarity = 0
for score in scores:
    user_similarity += score

# With nothing to average, report 0.0 instead of dividing by zero.
avg_score = user_similarity / len(scores) if scores else 0.0

In [15]:
# Writing the average similarity score into a json file.
# The encoding is stated explicitly, matching the utf-8 reads of the input files,
# so the output does not depend on the platform's default locale encoding.
with open("user_final_score.json", "w", encoding="utf-8") as write_file:
    json.dump(avg_score, write_file)

----------------------------------------------------------------