In [3]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import spacy
import pandas as pd
import nltk

In [10]:
nltk.download('stopwords')
nltk.download('punkt')

nlp = spacy.load("en_core_web_sm")

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!














In [13]:
data = pd.read_csv('Dataset.csv')
data.head()

Unnamed: 0,id,Domain,Difficulty level,Questions,Answers
0,1,SQL,Easy,"What does SQL stand for, and what is its prima...",SQL stands for Structured Query Language. Its ...
1,2,SQL,Easy,What is a database schema?,A database schema is a blueprint that defines ...
2,3,SQL,Easy,"Explain the difference between a table, a reco...",Table : A Table is A collection of data organi...
3,4,SQL,Easy,What is a primary key in a database table?,A primary key is a unique identifier for each ...
4,5,SQL,Easy,"Define the term ""foreign key"" and its signific...",A foreign key is a column or a set of columns ...


In [19]:
# Preprocessing
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


# def preprocess_text(text):
# #     text = text
#     text = text.lower()
#     text = "".join([char for char in text if char not in string.punctuation])
#     tokens = word_tokenize(text)
#     tokens = [word for word in tokens if word not in stop_words]
#     return " ".join(tokens)
#     return text

def preprocess_text(text):
    text = text.lower()
    doc = nlp(text)
    print(doc)
    tokens = [token.text for token in doc if not token.is_punct and not token.is_stop]
    return " ".join(tokens)

In [14]:
user_question = "What does SQL stand for, and what is its primary purpose?"
# Function to get actual answer based on question
def get_actual_answer(user_question):
    return data[data["Questions"] == user_question]["Answers"]

actual_answer = get_actual_answer(user_question).iloc[0]
user_input = '''SQL stands for Structured query language which is a programming language which is used for storing
                and processing information in a relational database'''


In [15]:
def evaluate(actual_answer,user_answer):
    if actual_answer:
        # Preprocess both the user's answer and the actual answer
        preprocessed_user_input = preprocess_text(user_answer)
        preprocessed_actual_answer = preprocess_text(actual_answer)
        
#         # Tokenize the preprocessed answers
#         user_tokens = set(preprocessed_user_input.split())
#         actual_tokens = set(preprocessed_actual_answer.split())

#         # Compute missing points
#         missing_points = actual_tokens - user_tokens
            
        # Generate embeddings for user's answer and actual answer
        user_embedding = embed([preprocessed_user_input])[0]
    #     print(user_embedding)
        actual_answer_embedding = embed([preprocessed_actual_answer])[0]
    #     print(actual_answer_embedding)

        # Compute similarity using cosine similarity
        similarity = cosine_similarity([actual_answer_embedding],[user_embedding])[0, 0]
#         missing_phrases_sentence = ", ".join(missing_phrases) if missing_phrases else "None"

        print(f"Similarity between user's answer and actual answer: {similarity}")
#         print(f"Missing points in user's answer: {missing_points}")
    else:
        print("Question not found in dataset.")


In [16]:
def load_data(domain, easy=2, medium=1, hard=1):
    domain_data = data[data['Domain'] == domain]
    easy = domain_data[domain_data['Difficulty level'] == 'Easy'].sample(easy)
    medium = domain_data[domain_data['Difficulty level'] == 'Medium'].sample(medium)
    hard = domain_data[domain_data['Difficulty level'] == 'Hard'].sample(hard)
    return pd.concat([easy, medium, hard])

In [17]:
def answer_questions(questions):
    print("Select a question to answer:")
    for index, row in questions.iterrows():
        print(f"{row['Questions']}")
        actual_answer = row["Answers"]
        user_answer = input("Your answer: ")
        evaluate(actual_answer,user_answer)
        

In [20]:
domain = input("Select a domain")
questions = load_data(domain)
answer_questions(questions)

Select a domainWeb Development
Select a question to answer:
What is JavaScript, and how is it used in web development?
Your answer: javascript is a scripting language which is used for interactivity on web page
javascript is a scripting language which is used for interactivity on web page
javascript is a scripting language used to add interactivity, dynamic behavior, and client-side functionality to web pages, including form validation and event handling.
Similarity between user's answer and actual answer: 0.7291901707649231
What is the role of the <head> element in an HTML document?
Your answer: 

the <head> element contains metadata and links to external resources, such as css stylesheets and javascript files, that are essential for defining the document's properties and behavior.
Similarity between user's answer and actual answer: -0.029971623793244362
Explain the concept of semantic HTML and its benefits in web development.
Your answer: 

semantic html uses meaningful tags (e.g., <


[38;5;1mâœ˜ Can't find wheel path: en_ner_fashion-0.0.0-py3-none-any.whl[0m



SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [9]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 393.8 kB/s eta 0:00:33
      --------------------------------------- 0.2/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.4/12.8 MB 2.2 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 2.7 MB/s eta 0:00:05
     -- ------------------------------------- 0.8/12.8 MB 3.1 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 3.5 MB/s eta 0:00:04
     --- ------------------------------------ 1.2/12.8 MB 3.3 MB/s eta 0:00:04
     ---- ----------------------------------- 1.4/12.8 MB 3.4 MB/s eta 0:00:04
     ---- ----------------------------------- 1