In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download the NLTK resources used below (a no-op when already up to date).
# 'stopwords' backs stopwords.words('english'); 'punkt' backs word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.8.2 and later) load the tokenizer tables from
# 'punkt_tab' rather than the pickled 'punkt' models; without it,
# word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\RAMESH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\RAMESH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Function to preprocess text

In [2]:
# Cache for the English stop-word set so the corpus file is read only once
# instead of once per processed row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise one question for TF-IDF vectorisation.

    Steps: lowercase, strip ASCII punctuation (``string.punctuation``),
    tokenise with ``word_tokenize``, drop English stop words, and re-join
    the surviving tokens with single spaces.

    Parameters
    ----------
    text : str or scalar
        Raw question text. ``None`` and float NaN (what pandas yields for an
        empty CSV cell) are treated as missing. Any other non-string scalar
        is converted with ``str`` before processing.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is missing.
    """
    # Missing values: an empty string lets callers filter these rows out later.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ''

    # A numeric-looking cell may be parsed as int/float by read_csv; coerce it
    # rather than failing on .lower().
    if not isinstance(text, str):
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords (set is cached across calls)
    stop_words = _english_stop_words()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Join tokens back into a single string
    return ' '.join(filtered_tokens)

# Load data from CSV files

In [3]:
# Locations of the two input CSVs.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run as-is on the machine that has these files.
prev_year_csv_path = "C:\\Users\\RAMESH KUMAR\\Downloads\\questions_only_text.csv"
random_csv_path = "C:\\Users\\RAMESH KUMAR\\Downloads\\next_random_question_only_text.csv"

# Read both files with the default pandas CSV settings, previous-year file first.
prev_year_df, random_df = (
    pd.read_csv(csv_path) for csv_path in (prev_year_csv_path, random_csv_path)
)

#Print dataframes for debugging

In [4]:
# Preview the first rows of each raw frame as a quick sanity check.
print("Previous Year Questions DataFrame:", prev_year_df.head(), sep="\n")
print("\nRandom Questions DataFrame:", random_df.head(), sep="\n")

Previous Year Questions DataFrame:
                                            question
0  The IUPAC name of the copound having the formu...
1  Which nomenclature is not according to IUPAC s...
2  Structure of the compound whose IUPAC name is ...
3  Pick out the correct statement with respect to...
4  Which one of the following statements is not c...

Random Questions DataFrame:
                                            question
0  The cell wall of the cells of an alga has cell...
1  From the velocity-time graph of a particle giv...
2  The graph between the square root of the frequ...
3  Assertion: A charged particle cannot be accele...
4  If we mix a pentavalent impurity in a crystal ...


# Preprocess and tokenize the text data

In [5]:
# Store a cleaned copy of every question next to the raw text in both frames.
prev_year_df['question_processed'] = prev_year_df['question'].map(preprocess_text)
random_df['question_processed'] = random_df['question'].map(preprocess_text)

#Drop rows where question_processed is empty

In [6]:
# Keep only previous-year rows whose cleaned text is non-empty.
# (random_df is not filtered here.)
prev_year_df = prev_year_df.loc[prev_year_df['question_processed'].ne('')]

#Printing preprocessed dataframes for debugging

In [7]:
# Preview both frames again, now including the question_processed column.
print("\nPreprocessed Previous Year Questions DataFrame:", prev_year_df.head(), sep="\n")
print("\nPreprocessed Random Questions DataFrame:", random_df.head(), sep="\n")


Preprocessed Previous Year Questions DataFrame:
                                            question  \
0  The IUPAC name of the copound having the formu...   
1  Which nomenclature is not according to IUPAC s...   
2  Structure of the compound whose IUPAC name is ...   
3  Pick out the correct statement with respect to...   
4  Which one of the following statements is not c...   

                                  question_processed  
0  iupac name copound formula ch≡cchch2 is1 3bute...  
1  nomenclature according iupac system1 brch2chch...  
2  structure compound whose iupac name 3ethyl2hyd...  
3  pick correct statement respect mncn63 1 sp3d2 ...  
4  one following statements correct1 catalyst ini...  

Preprocessed Random Questions DataFrame:
                                            question  \
0  The cell wall of the cells of an alga has cell...   
1  From the velocity-time graph of a particle giv...   
2  The graph between the square root of the frequ...   
3  Assertion: A ch

# Compute TF-IDF vectors

In [8]:
# Fit the vectorizer on the previous-year questions only, then reuse that
# fitted vectorizer to transform the random questions so both matrices share
# the same feature columns.
vectorizer = TfidfVectorizer()
prev_year_tfidf = vectorizer.fit_transform(prev_year_df['question_processed'])
random_tfidf = vectorizer.transform(random_df['question_processed'])

# Calculate cosine similarity

In [9]:
# Pairwise cosine similarity: one row per random question, one column per
# previous-year question (argument order determines the orientation).
similarity_matrix = cosine_similarity(random_tfidf, prev_year_tfidf)

#Print similarity matrix for debugging

In [10]:
# Show the (abbreviated) similarity matrix for debugging.
print("\nCosine Similarity Matrix:", similarity_matrix, sep="\n")


Cosine Similarity Matrix:
[[0.         0.         0.         ... 0.         0.06964142 0.05242263]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.01378767]
 [0.         0.         0.         ... 0.         0.         0.        ]]


# Select top 15 random questions with highest similarity to any previous year question

In [11]:
# Ask for 15 questions, but never more than the number of candidates available.
num_top_questions = min(15, len(random_df))

#Get indices of top similarities

In [12]:
# Score each random question by its best match among the previous-year
# questions (row-wise maximum of the similarity matrix).
similarity_scores = np.max(similarity_matrix, axis=1)
# Negating the scores makes argsort's ascending order rank highest similarity first.
top_indices = np.argsort(np.negative(similarity_scores))[:num_top_questions]

#Ensuring top_indices are within bounds

In [13]:
# Keep only positions that can address a row of random_df. Cast to plain
# Python ints so the list prints as [132, 79, ...] regardless of how the
# installed NumPy renders its integer scalars.
valid_indices = [int(idx) for idx in top_indices if 0 <= idx < len(random_df)]

#Printing valid indices for debugging

In [14]:
# Show which row positions of random_df were selected.
print("\nValid Indices of Top Similarities:", valid_indices, sep="\n")



Valid Indices of Top Similarities:
[132, 79, 426, 128, 317, 180, 358, 133, 488, 92, 471, 20, 429, 338, 177]


In [15]:
# Pull the raw question text at the selected row positions, best match first.
selected_questions = random_df['question'].iloc[valid_indices]

#Printing selected questions

In [16]:
import sys

# Emit the selected questions as a 1-based numbered list.
print("\nSelected Questions:")
for idx, question in enumerate(selected_questions, start=1):
    sys.stdout.write(f"{idx}. {question}\n")

# Ensure all questions are printed in case of any buffering issue
sys.stdout.flush()


Selected Questions:
1. The minimum pressure required to compress 600 dm3 of a gas at 1 bar to 150 dm3 at 40°C is
(1) 4.0 bar
(2) 0.2 bar
(3) 1.0 bar
(4) 2.5 bar
2. Viruses have : 1. DNA enclosed in a protein coat
2. Prokaryotic nucleus
3. Single Chromosome
4. Both DNA and RNA 
3. Placenta and pericarp are both edible portions in
1. Apple
2. Banana
3. Tomato
4. Potato

4. Which of the following is not correctly matched for the organism and its cell wall degrading enzyme?
1. Plant cells - Cellulase
2. Algae - Methylase
3. Fungi - Chitinase
4. Bacteria – Lysozyme

5. The correct thermodynamic conditions for the spontaneous reaction at all temperatures is :
(1) ∆H > 0 and ∆S < 0
(2) ∆H < 0 and ∆S > 0
(3) ∆H < 0 and ∆S < 0
(4) ∆H < 0 and ∆S = 0
6. Match List - I with List - II




Type of flower

Example


(A)
Zygomorphic
(I)
Mustard


(B)
Hypogynous
(II)
Plum


(C)
Perigynous
(III)
Cassia


(D)
Epigynous
(IV)
Cucumber




Choose the correct answer from the options given below:
1. (A) - (I