In [4]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

# Text preprocessing function
def preprocess_text(text):
    """Normalize *text* before it is embedded for similarity matching.

    The text is lowercased, split into tokens with NLTK's ``word_tokenize``,
    stripped of English stop words, reduced to Porter stems, and finally
    re-joined with single spaces.

    Args:
        text: The raw string to normalize.

    Returns:
        The space-separated stemmed tokens that survived stop-word removal.
    """
    tokens = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()
    # Filter and stem in a single pass; the stop-word test is applied to the
    # lowercased token, before stemming.
    return ' '.join(
        porter.stem(token) for token in tokens if token not in english_stops
    )

# Load dataset
# The CSV is read as ISO-8859-1. If decoding still fails, the error is
# reported and re-raised: every statement below needs `ds`, so continuing
# would only surface a misleading NameError instead of the real cause.
try:
    ds = pd.read_csv('D:/My WorkSpace/Summer-Internship/05-August-2024/questions_answers.csv', encoding='ISO-8859-1')
except UnicodeDecodeError as err:
    print(f"Error: {err}")
    raise
else:
    # Quick preview of the first two rows to confirm the load worked.
    print(ds.head(2))

# Handle missing values
# Every missing cell (in any column) becomes the placeholder string, so
# preprocess_text below always receives a str.
ds.fillna("Not Mentioned", inplace=True)
# Normalize the 'ans1' column in place; suggest_sections embeds this column.
ds['ans1'] = ds['ans1'].apply(preprocess_text)

# Initialize the model
# Module-level sentence-embedding model shared by suggest_sections.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to suggest sections
def suggest_sections(ans, ds, min_suggestions=5):
    """Return the dataset rows whose reference answer best matches *ans*.

    The answer is normalized with ``preprocess_text``, embedded with the
    module-level ``model``, and compared by cosine similarity against the
    embeddings of ``ds['ans1']``. Rows scoring above a threshold are kept;
    the threshold starts at 0.2 and is lowered in steps of 0.05 until at
    least ``min_suggestions`` rows qualify or the threshold is exhausted.
    All qualifying rows are returned, so the result may contain more (or,
    if too few rows are similar, fewer) than ``min_suggestions`` entries.

    Args:
        ans: The free-text answer to match.
        ds: DataFrame providing the 'question', 'ans' and 'ans1' columns.
        min_suggestions: Number of matches to aim for before the threshold
            stops being relaxed. Values <= 0 yield a single pass at the
            initial threshold.

    Returns:
        A list of dicts with keys 'index' (positional row index),
        'question', 'ans' and 'similarity_score' (float), ordered from most
        to least similar. Empty when ``ds`` has no rows or nothing scores
        above the final threshold.
    """
    # Nothing to compare against: skip the encoding work entirely.
    if ds.empty:
        return []

    preprocessed_ans = preprocess_text(ans)
    ans_embedding = model.encode(preprocessed_ans)
    section_embeddings = model.encode(ds['ans1'].tolist())
    # pytorch_cos_sim returns a (1, n_rows) matrix here; take its only row.
    similarities = util.pytorch_cos_sim(ans_embedding, section_embeddings)[0]

    similarity_threshold = 0.2
    threshold_step = 0.05  # Adjust step size if needed

    # Always run at least one pass so the result is defined even when
    # min_suggestions <= 0, then keep relaxing the threshold while there are
    # too few matches. NOTE: repeated float subtraction can leave a tiny
    # positive residue instead of exactly 0, in which case a final pass at a
    # threshold of ~0 takes place before the loop stops.
    while True:
        relevant_indices = [
            i for i, sim in enumerate(similarities) if sim > similarity_threshold
        ]
        similarity_threshold -= threshold_step
        if len(relevant_indices) >= min_suggestions or similarity_threshold <= 0:
            break

    # Rank and build the result once, after the final threshold is settled.
    sorted_indices = sorted(
        relevant_indices, key=lambda i: similarities[i], reverse=True
    )

    return [
        {
            'index': i,
            'question': ds.iloc[i]['question'],
            'ans': ds.iloc[i]['ans'],
            'similarity_score': similarities[i].item(),  # Convert tensor to float
        }
        for i in sorted_indices
    ]

# Example usage
# Query the dataset with a sample answer and report every match found.
ans = 'Python is a general purpose programming language'
suggestions = suggest_sections(ans, ds, min_suggestions=1)

if not suggestions:
    print("No record is found")
else:
    for suggestion in suggestions:
        # One report per match; each field goes on its own line, followed by
        # a separator rule and a blank line.
        print(
            f"S_no: {suggestion['index']}",
            f"Question: {suggestion['question']}",
            f"Your answer: {ans}",
            f"Expected answer : {suggestion['ans']}",
            f"Answer: {suggestion['ans']}",
            f"Similarity Score: {suggestion['similarity_score']*100}",
            "_________________________________________________________________________________________\n",
            sep="\n",
        )


  sr_no                                           question  \
0     1  What is Python? List some popular applications...   
1     2  What are the benefits of using Python language...   

                                                 ans  \
0  Python is a widely-used general-purpose, high-...   
1  Object-Oriented Language\nHigh-Level Language\...   

                                                ans1  
0  Python is a widely-used general-purpose, high-...  
1  Object-Oriented Language\nHigh-Level Language\...  
S_no: 0
Question: What is Python? List some popular applications of Python in the world of technology.
Your answer: Python is a general purpose programming language
Expected answer : Python is a widely-used general-purpose, high-level programming language. It was created by Guido van Rossum in 1991 and further developed by the Python Software Foundation. It was designed with an emphasis on code readability, and its syntax allows programmers to express their concepts in fewer