In [None]:
# NLTK plus the data packages used by the examples below: tokenizer models
# ('punkt'), WordNet for lemmatization, the perceptron POS tagger and the
# stop-word lists.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Last expression of the cell: its return value is what the cell displays.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Tokenization example
from nltk import word_tokenize
import string
text = "I study Machine Learning. Is'nt it a GREAT day!!! "
# Normalise in one pass: lower-case the sentence, then delete every ASCII
# punctuation character through a deletion-only translation table.
text = text.lower().translate(str.maketrans('', '', string.punctuation))
print(text)
print(word_tokenize(text))

i study machine learning isnt it a great day 
['i', 'study', 'machine', 'learning', 'isnt', 'it', 'a', 'great', 'day']


In [None]:
# stopword removal example
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = """This is a sample sentence,
                  showing off the stop words filtration."""

# English stop words held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

# A token is kept only when its lower-cased form is not a stop word; the kept
# token itself retains its original casing.
filtered_sentence = [
    token for token in word_tokens
    if token.lower() not in stop_words
]

print("Word tokens : ", word_tokens)
print("Tokens after stop word removal", filtered_sentence)

Word tokens :  ['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
Tokens after stop word removal ['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [None]:
#stemming example
from nltk.stem import PorterStemmer
def porter_stemmer(text):
    """Stem every token of ``text`` and return them as one space-joined string.

    The text is split with ``word_tokenize`` and each token is passed, in
    order, through the notebook-level ``stemmer`` object, which must be
    defined before this function is called.
    """
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))
# Notebook-level stemmer; porter_stemmer() looks this name up when it is called,
# so it has to be bound before the call below.
stemmer = PorterStemmer()
ex_stem = "Programmers program with programming languages"
stem_result = porter_stemmer(ex_stem)
# Stems need not be dictionary words (see 'programm' and 'languag' in the output).
print(f"Result after stemming technique :- \n{stem_result}")


Result after stemming technique :- 
programm program with program languag


In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer

# The Snowball stemmer is constructed for one specific language
snow_stemmer = SnowballStemmer(language='english')

# Already-tokenized sample words
words = [
    'cared', 'university', 'fairly', 'easily', 'singing',
    'sings', 'sung', 'singer', 'sportingly',
]

# Stem each word, keeping the results aligned index-for-index with `words`
stem_words = [snow_stemmer.stem(word) for word in words]

# Show every word next to its stem
for original, stemmed in zip(words, stem_words):
    print(f"{original} ----> {stemmed}")

cared ----> care
university ----> univers
fairly ----> fair
easily ----> easili
singing ----> sing
sings ----> sing
sung ----> sung
singer ----> singer
sportingly ----> sport


In [5]:
#Lemmatization - without POS tag - Example-1
from nltk.stem import WordNetLemmatizer
def lemmatization(text):
    """Lemmatize each token of ``text`` and return them joined by single spaces.

    Tokens come from ``word_tokenize`` and are handed one by one to the
    notebook-level ``lemma`` object; no part-of-speech tag is supplied.
    """
    lemmas = [lemma.lemmatize(token) for token in word_tokenize(text)]
    return ' '.join(lemmas)

# Notebook-level lemmatizer; lemmatization() looks this name up when it is called.
lemma = WordNetLemmatizer()
ex_lemma = "Programmers program with programming languages"
lemma_result = lemmatization(ex_lemma)
# The input is not lower-cased, so 'Programmers' appears unchanged in the output.
print(f"Result of lemmatization \n{lemma_result}")


Result of lemmatization 
Programmers program with programming language


In [None]:
#lemmatization example-2 - with POS tag
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

text = "She jumped into the river and breathed heavily"
wordnet = WordNetLemmatizer()

# pos_tag yields Penn Treebank tags, while WordNetLemmatizer.lemmatize expects
# one of 'n' (noun), 'v' (verb), 'a' (adjective) or 'r' (adverb). Only the
# first letter of the tag is inspected. Adjective tags start with 'J'
# (JJ/JJR/JJS), so lower-casing the first letter never produces 'a'; without
# the explicit 'j' -> 'a' entry adjectives would be lemmatized as nouns.
penn_to_wordnet = {'n': 'n', 'v': 'v', 'r': 'r', 'a': 'a', 'j': 'a'}

for token, tag in pos_tag(word_tokenize(text)):
    # Every other tag class (pronouns, determiners, prepositions, ...) falls
    # back to noun.
    pos = penn_to_wordnet.get(tag[0].lower(), 'n')

    print(token, "--->", wordnet.lemmatize(token, pos))

She ---> She
jumped ---> jump
into ---> into
the ---> the
river ---> river
and ---> and
breathed ---> breathe
heavily ---> heavily


In [None]:
#Example to create inverted index from preprocessed text
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import defaultdict
import string

# Fetch the NLTK data packages this cell needs
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Toy corpus: document id -> raw text
documents = {
    1: "This is a sample document.",
    2: "This document is another sample document.",
    3: "And this is a different document.",
}

# Shared preprocessing objects; preprocess() looks these names up when called
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text, use_stemming=True):
    """Turn raw text into a list of normalized index terms.

    Steps, in order: lower-case, delete ASCII punctuation, tokenize with
    ``word_tokenize``, drop tokens found in ``stop_words``, then reduce each
    remaining token with ``stemmer`` (default) or ``lemmatizer``.

    Args:
        text: Raw document text.
        use_stemming: If true, apply ``stemmer.stem``; otherwise apply
            ``lemmatizer.lemmatize``.

    Returns:
        List of processed tokens in their original order.
    """
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
    kept = [word for word in word_tokenize(cleaned) if word not in stop_words]

    # Pick the reducer once, then apply it to every surviving token
    normalize = stemmer.stem if use_stemming else lemmatizer.lemmatize
    return [normalize(word) for word in kept]

# Create the inverted index with document frequency.
# inverted_index maps term -> list of ids of the documents containing it;
# doc_freq maps term -> number of documents containing it.
inverted_index = defaultdict(list)
doc_freq = defaultdict(int)

for doc_id, text in documents.items():
    tokens = preprocess(text)
    # De-duplicate with dict.fromkeys instead of set(): each term is still
    # counted once per document, but terms are visited in first-occurrence
    # order. A set of strings iterates in hash order, which can differ between
    # interpreter runs, so the printed index order was not reproducible.
    unique_tokens = list(dict.fromkeys(tokens))
    for token in unique_tokens:
        doc_freq[token] += 1
        inverted_index[token].append(doc_id)

# Display the inverted index with document frequency
print("Inverted Index with Document Frequency:")
for word, doc_ids in inverted_index.items():
    print(f"{word}: {doc_ids} (Document Frequency: {doc_freq[word]})")



Inverted Index with Document Frequency:
sampl: [1, 2] (Document Frequency: 2)
document: [1, 2, 3] (Document Frequency: 3)
anoth: [2] (Document Frequency: 1)
differ: [3] (Document Frequency: 1)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from collections import defaultdict
import string

# Sample documents: maps an integer document id to its raw text.
# NOTE: this rebinds the notebook-level name `documents`; the contents are the
# same three sentences used by the previous inverted-index example.
documents = {
    1: "This is a sample document.",
    2: "This document is another sample document.",
    3: "And this is a different document."
}

def preprocess(text):
    """Normalize ``text`` and split it into tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, and the result is split on runs of whitespace. No stop-word
    removal or stemming is applied.

    Returns:
        List of token strings (empty for blank input).
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(drop_punctuation).split()

def create_inverted_index(documents):
    """Build a term -> set-of-document-ids index.

    Args:
        documents: Mapping of document id to raw text; each text is run
            through ``preprocess``.

    Returns:
        ``defaultdict(set)`` whose keys are tokens and whose values are the
        ids of the documents containing that token.
    """
    postings = defaultdict(set)
    for doc_id, text in documents.items():
        for term in preprocess(text):
            postings[term].add(doc_id)
    return postings

def display_inverted_index(inverted_index):
    """Print each term of the index with its document ids in ascending order.

    Args:
        inverted_index: Mapping of term -> collection of document ids (sets
            in this notebook). The ids must be mutually orderable.

    The ids are sorted before printing because converting a set to a list
    yields them in hash-table order, which is not guaranteed to be ascending
    (for example ``list({8, 1})`` gives ``[8, 1]``).
    """
    print("Inverted Index:")
    for word, doc_ids in inverted_index.items():
        print(f"{word}: {sorted(doc_ids)}")

def boolean_query(query, inverted_index):
    """Evaluate a simple boolean query against an inverted index.

    The query is lower-cased, stripped of punctuation and split on whitespace.
    Supported forms:

    * ``"a AND b AND c"`` -- documents containing every term;
    * ``"a OR b OR c"``   -- documents containing at least one term;
    * ``"a"``             -- documents containing the term (when no operator
      is present, only the first token is looked up).

    If both operators appear, the query is evaluated as an AND query and the
    word ``or`` is treated as an ordinary term; mixed expressions are not
    parsed.

    Args:
        query: Query string.
        inverted_index: Mapping of term -> set of document ids. It is only
            read, never modified.

    Returns:
        Sorted list of matching document ids; empty when nothing matches or
        when the query contains no terms.
    """
    query = query.lower()
    query = query.translate(str.maketrans('', '', string.punctuation))
    query_tokens = query.split()

    def postings(term):
        # .get() keeps the lookup read-only: indexing a defaultdict would
        # insert an empty entry for every unknown term, and indexing a plain
        # dict would raise KeyError.
        return set(inverted_index.get(term, ()))

    if 'and' in query_tokens:
        # Drop every operator occurrence, not just the first, so a chained
        # query does not intersect with the postings of the word "and".
        terms = [token for token in query_tokens if token != 'and']
        combine = set.intersection
    elif 'or' in query_tokens:
        terms = [token for token in query_tokens if token != 'or']
        combine = set.union
    else:
        # No operator: look up the first token only (empty query -> no terms).
        terms = query_tokens[:1]
        combine = set.union

    # Empty or operator-only query: nothing to look up.
    if not terms:
        return []

    result_docs = postings(terms[0])
    for term in terms[1:]:
        result_docs = combine(result_docs, postings(term))

    # Sorted so the returned order does not depend on set iteration order.
    return sorted(result_docs)

# Main execution: build the index from `documents` and print it.
# NOTE: this rebinds the notebook-level name `inverted_index` to the
# set-valued index returned by create_inverted_index().
inverted_index = create_inverted_index(documents)
display_inverted_index(inverted_index)

# Sample boolean queries: one AND query, one OR query (its trailing space is
# discarded by the whitespace split in boolean_query) and one single-term query
queries = [
    "sample AND document",
    "different OR another ",
    "sample"
]

# Process and print the results for each query
for query in queries:
    result = boolean_query(query, inverted_index)
    print(f"Query: '{query}' => Documents: {result}")


Inverted Index:
this: [1, 2, 3]
is: [1, 2, 3]
a: [1, 3]
sample: [1, 2]
document: [1, 2, 3]
another: [2]
and: [3]
different: [3]
Query: 'sample AND document' => Documents: [1, 2]
Query: 'different OR another ' => Documents: [2, 3]
Query: 'sample' => Documents: [1, 2]
