<a href="https://colab.research.google.com/github/Swanand58/intro-to-information-retrieval/blob/main/InformationRetrievalAssignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
# Download the NLTK data this notebook relies on: the Punkt tokenizer models,
# the English stop-word list, and the WordNet corpus used by the lemmatizer.
nltk.download('punkt')
# NLTK 3.9+ loads the Punkt tokenizer from the 'punkt_tab' resource; without it
# word_tokenize raises LookupError on a fresh kernel.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Toy corpus: five short sentences, each treated as one document.
# The position in this list determines the "Document N" label used later.
document_lists = [
    "Today OpenAI released ChatGPT!!!!!! Go ChatGPT!",
    "The ChatGPT is released today.",
    "OpenAI's ChatGPT performed well.",
    "Find the ChatGPT news last week.",
    "ChatGPT is a system released by OpenAI."
]

In [None]:
# Shared normalizer instances, created once and reused by preprocess() for every document.
lemmatizer = WordNetLemmatizer()  # WordNet-based lemmatizer
stemmer = PorterStemmer()  # Porter stemmer, applied after lemmatization

In [None]:
def preprocess(document):
  """Normalize one raw document into a list of stemmed tokens.

  The stages run in this order: case folding, punctuation removal,
  tokenization, English stop-word removal, WordNet lemmatization and
  Porter stemming. The result of every stage is printed so the effect of
  each step can be inspected.

  Relies on the module-level ``lemmatizer`` and ``stemmer`` instances.

  Args:
    document: Raw text of a single document (str).

  Returns:
    The list of stemmed tokens, in their original order.
  """
  print("Original Given Document: ", document)

  # Case folding - convert the whole document to lower case.
  document = document.lower()
  print("Case Folding Removal: ", document)

  # Punctuation removal - delete every character listed in string.punctuation.
  document = document.translate(str.maketrans('', '', string.punctuation))
  print("Punctuation Removal Output: ", document)

  # Tokenization - split the cleaned text into word tokens.
  tokens = word_tokenize(document)
  print("Token output: ", tokens)

  # Stop-word removal - build the stop-word set once per call; evaluating
  # stopwords.words('english') inside the comprehension would rebuild and
  # linearly scan the list for every single token.
  english_stopwords = set(stopwords.words('english'))
  filtered_tokens = [word for word in tokens if word not in english_stopwords]
  print("Stop word removal output: ", filtered_tokens)

  # Lemmatization - reduce each token with the WordNetLemmatizer.
  lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
  print("Lemmatization output: ", lemmatized_tokens)

  # Stemming - apply the PorterStemmer to the lemmatized tokens.
  stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]
  print("Stemming output: ", stemmed_tokens)
  print("\n")

  return stemmed_tokens

In [None]:
# Run the full preprocessing pipeline over every document, preserving corpus order.
preprocessed_docs = list(map(preprocess, document_lists))

Original Given Document:  Today OpenAI released ChatGPT!!!!!! Go ChatGPT!
Case Folding Removal:  today openai released chatgpt!!!!!! go chatgpt!
Punctuation Removal Output:  today openai released chatgpt go chatgpt
Token output:  ['today', 'openai', 'released', 'chatgpt', 'go', 'chatgpt']
Stop word removal output:  ['today', 'openai', 'released', 'chatgpt', 'go', 'chatgpt']
Lemmatization output:  ['today', 'openai', 'released', 'chatgpt', 'go', 'chatgpt']
Stemming output:  ['today', 'openai', 'releas', 'chatgpt', 'go', 'chatgpt']


Original Given Document:  The ChatGPT is released today.
Case Folding Removal:  the chatgpt is released today.
Punctuation Removal Output:  the chatgpt is released today
Token output:  ['the', 'chatgpt', 'is', 'released', 'today']
Stop word removal output:  ['chatgpt', 'released', 'today']
Lemmatization output:  ['chatgpt', 'released', 'today']
Stemming output:  ['chatgpt', 'releas', 'today']


Original Given Document:  OpenAI's ChatGPT performed well.
Case 

In [None]:
# Show the final token list of every document (one inner list per document).
print(preprocessed_docs)

[['today', 'openai', 'releas', 'chatgpt', 'go', 'chatgpt'], ['chatgpt', 'releas', 'today'], ['openai', 'chatgpt', 'perform', 'well'], ['find', 'chatgpt', 'news', 'last', 'week'], ['chatgpt', 'system', 'releas', 'openai']]


In [None]:
import pandas as pd

# Vocabulary: every distinct term across the preprocessed documents, sorted alphabetically.
vocabulary = {term for doc in preprocessed_docs for term in doc}
unique_terms = sorted(vocabulary)

# One column per document, labelled "Document 1", "Document 2", ...
column_labels = [f"Document {position}" for position, _ in enumerate(preprocessed_docs, start=1)]

# Start from an all-zero terms x documents matrix.
incidence_matrix = pd.DataFrame(0, index=unique_terms, columns=column_labels)

# Flip a cell to 1 wherever the term occurs in the document.
for label, doc_terms in zip(column_labels, preprocessed_docs):
    for term in doc_terms:
        incidence_matrix.at[term, label] = 1

print("Incidence Matrix is as follows:")
print("\n")
print(incidence_matrix)

Incidence Matrix is as follows:


         Document 1  Document 2  Document 3  Document 4  Document 5
chatgpt           1           1           1           1           1
find              0           0           0           1           0
go                1           0           0           0           0
last              0           0           0           1           0
news              0           0           0           1           0
openai            1           0           1           0           1
perform           0           0           1           0           0
releas            1           1           0           0           1
system            0           0           0           0           1
today             1           1           0           0           0
week              0           0           0           1           0
well              0           0           1           0           0


In [None]:
# Implementing TF-IDF on the term-document incidence matrix.
# The idf of a term t is log(N / df_t), where N is the total number of documents and df_t is the document frequency of t.
# Because the incidence matrix is binary, the tf component used here is 0/1 rather than a raw term count.

import numpy as np

# Total number of documents in the corpus.
N = len(preprocessed_docs)

# Document frequency of each term: the number of documents (columns) whose cell is 1.
document_frequency = incidence_matrix.sum(axis=1)

# Inverse document frequency, using the natural logarithm.
idf_values = np.log(N / document_frequency)

# Scale every term row of the incidence matrix by that term's idf.
tf_idf_matrix = incidence_matrix.multiply(idf_values, axis=0)

print(tf_idf_matrix)

         Document 1  Document 2  Document 3  Document 4  Document 5
chatgpt    0.000000    0.000000    0.000000    0.000000    0.000000
find       0.000000    0.000000    0.000000    1.609438    0.000000
go         1.609438    0.000000    0.000000    0.000000    0.000000
last       0.000000    0.000000    0.000000    1.609438    0.000000
news       0.000000    0.000000    0.000000    1.609438    0.000000
openai     0.510826    0.000000    0.510826    0.000000    0.510826
perform    0.000000    0.000000    1.609438    0.000000    0.000000
releas     0.510826    0.510826    0.000000    0.000000    0.510826
system     0.000000    0.000000    0.000000    0.000000    1.609438
today      0.916291    0.916291    0.000000    0.000000    0.000000
week       0.000000    0.000000    0.000000    1.609438    0.000000
well       0.000000    0.000000    1.609438    0.000000    0.000000


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Documents are the columns of tf_idf_matrix, so transpose to get one row
# vector per document before computing pairwise cosine similarity.
cosine_sim_matrix = cosine_similarity(tf_idf_matrix.T)

# Reuse the document labels from the TF-IDF matrix instead of a hard-coded
# range(5), so the table stays valid when the number of documents changes.
doc_labels = tf_idf_matrix.columns
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=doc_labels, columns=doc_labels)

print("Cosine Similarity:")
print("\n")
print(cosine_sim_df)

Cosine Similarity:


            Document 1  Document 2  Document 3  Document 4  Document 5
Document 1    1.000000    0.527723    0.056272         0.0    0.148815
Document 2    0.527723    1.000000    0.000000         0.0    0.140998
Document 3    0.056272    0.000000    1.000000         0.0    0.063409
Document 4    0.000000    0.000000    0.000000         1.0    0.000000
Document 5    0.148815    0.140998    0.063409         0.0    1.000000
