In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

# Download NLTK resources (run this once per Colab session)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /home/r1j1n/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/r1j1n/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /home/r1j1n/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Hardcoded text corpus
corpus = """
Artificial intelligence (AI) is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence.
AI is becoming more and more integrated with daily life, from virtual assistants on phones to complex algorithms used to make business decisions.
Machine Learning (ML) is a subfield of AI that focuses on algorithms that can learn from data.
Deep learning (DL) is a more specialized subfield of ML using neural networks with many layers, allowing for more complex pattern recognition.
"""



In [3]:
# Tokenize, lowercase, and remove punctuation from text
def preprocess_text(text):
  text = text.lower()
  text = ''.join([char for char in text if char not in string.punctuation]) # removing punctuation
  tokens = word_tokenize(text) # tokenizing
  return tokens

# Simple function to process a question and find the best matching sentence
def simple_qa(question, corpus):
  # Preprocess both text corpus and the question
  corpus_tokens = [preprocess_text(sentence) for sentence in corpus.split('\n') if sentence]
  question_tokens = preprocess_text(question)

  # Vectorize text corpus and question using TF-IDF
  vectorizer = TfidfVectorizer(tokenizer=lambda x:x, preprocessor=lambda x:x) # Using tokenizer to skip text processing and vectorizing pre-processed tokens
  vectorizer.fit(corpus_tokens)
  corpus_vectors = vectorizer.transform(corpus_tokens)
  question_vector = vectorizer.transform([question_tokens])

  # Finding the sentence with the highest cosine similarity
  similarity_scores = cosine_similarity(question_vector, corpus_vectors).flatten()
  best_match_index = np.argmax(similarity_scores)

  # Return the best matching sentence
  return corpus.split('\n')[best_match_index]


In [4]:
# Sample questions
questions = [
    "What is AI?",
    "Tell me about machine learning.",
    "what does Deep Learning do?",
    "What are the applications of AI?",
    "How many subfields of AI are in this text?" #testing questions beyond the text
]

# Process questions and print results
print("The Corpus used for this example is : \n", corpus)
print("\n-------------------------------------\n")
for question in questions:
    answer = simple_qa(question, corpus)
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")


The Corpus used for this example is : 
 
Artificial intelligence (AI) is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence.
AI is becoming more and more integrated with daily life, from virtual assistants on phones to complex algorithms used to make business decisions.
Machine Learning (ML) is a subfield of AI that focuses on algorithms that can learn from data.
Deep learning (DL) is a more specialized subfield of ML using neural networks with many layers, allowing for more complex pattern recognition.


-------------------------------------

Question: What is AI?
Answer: AI is becoming more and more integrated with daily life, from virtual assistants on phones to complex algorithms used to make business decisions.

Question: Tell me about machine learning.
Answer: AI is becoming more and more integrated with daily life, from virtual assistants on phones to complex algorithms used to ma

