In [1]:
import pandas as pd

In [2]:
# Location of the raw question/answer CSV, relative to the notebook.
input_path = 'QA_dataset.csv'

# Read the file and relabel its columns in a single expression so the
# rest of the notebook can rely on the names 'question' and 'answer'.
df = pd.read_csv(input_path).set_axis(['question', 'answer'], axis=1)
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the Punkt sentence-tokenizer data that word_tokenize depends on.
# NLTK 3.9 and later load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so request both ids to keep this cell working on a fresh
# environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split every question into word and punctuation tokens.
df['tokens'] = df['question'].apply(word_tokenize)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,question,answer,tokens
0,What is the capital of France?,Paris,"[What, is, the, capital, of, France, ?]"
1,What is the capital of Germany?,Berlin,"[What, is, the, capital, of, Germany, ?]"
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee,"[Who, wrote, 'To, Kill, a, Mockingbird, ', ?]"
3,What is the largest planet in our solar system?,Jupiter,"[What, is, the, largest, planet, in, our, sola..."
4,What is the boiling point of water in Celsius?,100,"[What, is, the, boiling, point, of, water, in,..."


In [4]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))


def _keep_content_words(tokens):
    """Return the lower-cased tokens that are purely alphabetic and not English stop words."""
    kept = []
    for token in tokens:
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return kept


df['filtered_tokens'] = df['tokens'].apply(_keep_content_words)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,question,answer,tokens,filtered_tokens
0,What is the capital of France?,Paris,"[What, is, the, capital, of, France, ?]","[capital, france]"
1,What is the capital of Germany?,Berlin,"[What, is, the, capital, of, Germany, ?]","[capital, germany]"
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee,"[Who, wrote, 'To, Kill, a, Mockingbird, ', ?]","[wrote, kill, mockingbird]"
3,What is the largest planet in our solar system?,Jupiter,"[What, is, the, largest, planet, in, our, sola...","[largest, planet, solar, system]"
4,What is the boiling point of water in Celsius?,100,"[What, is, the, boiling, point, of, water, in,...","[boiling, point, water, celsius]"


In [5]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Data needed by pos_tag and WordNetLemmatizer.
# NLTK 3.9 and later load the tagger from 'averaged_perceptron_tagger_eng'
# rather than the pickled 'averaged_perceptron_tagger', so request both ids
# to keep pos_tag working regardless of the installed NLTK version.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

# Single shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' and 'R' map to adjective, verb, noun
    and adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_tokens(tokens):
    """POS-tag the tokens and return each one lemmatized under its tagged part of speech."""
    lemmas = []
    for token, treebank_tag in pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

# Lemmatize every row's filtered tokens, then attach the result as a new column.
lemmatized_column = df['filtered_tokens'].apply(lemmatize_tokens)
df['lemmatized_tokens'] = lemmatized_column
df.head()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vedit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vedit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,question,answer,tokens,filtered_tokens,lemmatized_tokens
0,What is the capital of France?,Paris,"[What, is, the, capital, of, France, ?]","[capital, france]","[capital, france]"
1,What is the capital of Germany?,Berlin,"[What, is, the, capital, of, Germany, ?]","[capital, germany]","[capital, germany]"
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee,"[Who, wrote, 'To, Kill, a, Mockingbird, ', ?]","[wrote, kill, mockingbird]","[write, kill, mockingbird]"
3,What is the largest planet in our solar system?,Jupiter,"[What, is, the, largest, planet, in, our, sola...","[largest, planet, solar, system]","[large, planet, solar, system]"
4,What is the boiling point of water in Celsius?,100,"[What, is, the, boiling, point, of, water, in,...","[boiling, point, water, celsius]","[boil, point, water, celsius]"


In [6]:
# Collapse each row's lemmas into one hyphen-separated keyword string.
df['keyword'] = df['lemmatized_tokens'].apply('-'.join)
df.head()

Unnamed: 0,question,answer,tokens,filtered_tokens,lemmatized_tokens,keyword
0,What is the capital of France?,Paris,"[What, is, the, capital, of, France, ?]","[capital, france]","[capital, france]",capital-france
1,What is the capital of Germany?,Berlin,"[What, is, the, capital, of, Germany, ?]","[capital, germany]","[capital, germany]",capital-germany
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee,"[Who, wrote, 'To, Kill, a, Mockingbird, ', ?]","[wrote, kill, mockingbird]","[write, kill, mockingbird]",write-kill-mockingbird
3,What is the largest planet in our solar system?,Jupiter,"[What, is, the, largest, planet, in, our, sola...","[largest, planet, solar, system]","[large, planet, solar, system]",large-planet-solar-system
4,What is the boiling point of water in Celsius?,100,"[What, is, the, boiling, point, of, water, in,...","[boiling, point, water, celsius]","[boil, point, water, celsius]",boil-point-water-celsius


In [7]:
# Keep only the two columns needed for keyword -> answer lookup.
lookup_columns = ['keyword', 'answer']
keyword_answer_df = df[lookup_columns]
keyword_answer_df.head()

Unnamed: 0,keyword,answer
0,capital-france,Paris
1,capital-germany,Berlin
2,write-kill-mockingbird,Harper-Lee
3,large-planet-solar-system,Jupiter
4,boil-point-water-celsius,100


In [17]:
# 💡 Ensure keywords and answers are strings and drop any missing values.
# Written as one chained expression that returns a new frame: assigning
# columns onto the result of dropna() can trigger pandas'
# SettingWithCopyWarning when rows were actually dropped.
keyword_answer_df = (
    keyword_answer_df
    .dropna()
    .astype({'keyword': str, 'answer': str})
)

In [18]:
# Destination CSV for the keyword/answer pairs, relative to the notebook.
output_path = 'keyword_answer_output.csv'

# Write without the positional index so the file holds only the two data columns.
keyword_answer_df.to_csv(path_or_buf=output_path, index=False)
print(f"✅ Output saved to {output_path}")

✅ Output saved to keyword_answer_output.csv


In [19]:
import difflib

# Minimum difflib similarity ratio (0-1) a stored keyword must reach to count as a match.
MATCH_CUTOFF = 0.6


def question_to_keyword(question):
    """Turn a free-text question into a hyphen-joined keyword string.

    Applies the same steps used to build the dataset keywords: tokenize,
    keep lower-cased alphabetic non-stop-words, lemmatize, then join the
    lemmas with '-'. The result is '' when no token survives filtering.
    """
    tokens = word_tokenize(question)
    content_words = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return '-'.join(lemmatize_tokens(content_words))


user_question = input("Ask a question: ")
processed_question = question_to_keyword(user_question)

# Keyword -> answer lookup; if a keyword occurs more than once the last row wins.
qa_dict = dict(zip(keyword_answer_df['keyword'], keyword_answer_df['answer']))

# An empty keyword carries no information. Compared against an empty stored
# keyword it would score a perfect 1.0 and return an arbitrary answer, so
# skip matching entirely in that case.
if processed_question:
    close_matches = difflib.get_close_matches(processed_question, qa_dict.keys(), n=1, cutoff=MATCH_CUTOFF)
else:
    close_matches = []

if close_matches:
    best_match = close_matches[0]
    print("🤖 Answer:", qa_dict[best_match])
else:
    print("🤖 I'm not sure how to answer that yet.")


Ask a question:  capital of france


🤖 Answer: Paris
