In [5]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to preprocess text
def preprocess_text(text):
    """Tokenize *text* and drop English stopwords.

    The text is split into sentences, each sentence is split into word
    tokens, and every token whose lowercase form is an NLTK English
    stopword is discarded. No other filtering is applied.

    Returns a 3-tuple ``(processed_text, sentences, word_tokens)``:

    * ``processed_text`` -- the surviving tokens of all sentences, in
      order, joined by single spaces.
    * ``sentences`` -- the sentence strings produced by ``sent_tokenize``.
    * ``word_tokens`` -- one token list per sentence as produced by
      ``word_tokenize``; stopwords are still present in these lists.
    """
    sentences = sent_tokenize(text)
    word_tokens = list(map(word_tokenize, sentences))

    english_stopwords = set(stopwords.words("english"))

    # Collect the non-stopword tokens of every sentence into one flat list.
    kept_words = []
    for tokens in word_tokens:
        kept_words.extend(
            token for token in tokens
            if token.lower() not in english_stopwords
        )

    return " ".join(kept_words), sentences, word_tokens

# Read text data from the .txt file
def read_text_from_file(file_path, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding; pass another codec name for files stored differently.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file's bytes are not valid in *encoding*.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Example .txt file path
file_path = 'paper.txt'

# Read text from the .txt file
text = read_text_from_file(file_path)

# Preprocess the text
processed_text, sentences, word_tokens = preprocess_text(text)

# Vectorize the preprocessed text as ONE document: the result is a single
# TF-IDF row describing the whole file.
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform([processed_text])

# Calculate cosine similarity for each sentence.
# The document-level matrix above has exactly one row, so comparing it with
# itself can only ever produce [[1.0]]. To compare sentences with each other,
# every sentence is preprocessed on its own and becomes one row of a
# separate TF-IDF matrix; entry [i, j] is the similarity of sentences i and j.
sentence_texts = [preprocess_text(sentence)[0] for sentence in sentences]
sentence_vectorizer = TfidfVectorizer()
sentence_vectorized_text = sentence_vectorizer.fit_transform(sentence_texts)
sentence_cosine_sim = cosine_similarity(sentence_vectorized_text)

# Calculate cosine similarity for each word.
# word_tokens is already tokenized, so the tokenizer is the identity and
# token_pattern is set to None (it would be ignored anyway and otherwise
# triggers a scikit-learn warning). Note that word_tokens still contains
# stopwords, since preprocess_text returns the unfiltered token lists.
word_vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    lowercase=False,
    token_pattern=None,
)
word_vectorized_text = word_vectorizer.fit_transform(word_tokens)
# Rows of word_vectorized_text are sentences and columns are tokens, so the
# matrix is transposed to compare tokens (by the sentences they occur in)
# rather than sentences. Rows/columns of the result follow the feature index
# order of word_vectorizer. The result is a dense vocab x vocab array, which
# can get large for long texts.
word_cosine_sim = cosine_similarity(word_vectorized_text.T)

print("Preprocessed text:")
print(processed_text)
print("\nVectorized text:")
print(vectorized_text)
print("\nCosine similarity matrix for each sentence:")
print(sentence_cosine_sim)
print("\nCosine similarity matrix for each word:")
print(word_cosine_sim)

Preprocessed text:
Tajmehal Computer science dynamic ever-evolving field encompasses study algorithms , data structures , programming languages , theoretical foundations computing . plays pivotal role shaping modern world , driving innovation across various industries . Computer scientists analyze solve complex problems , develop cutting-edge software , design efficient algorithms enhance computational capabilities . artificial intelligence machine learning cybersecurity software engineering , computer science influences nearly every aspect daily lives . technology continues advance , field computer science remains forefront , paving way transformative breakthroughs shaping digital landscape future

Vectorized text:
  (0, 33)	0.09950371902099892
  (0, 38)	0.09950371902099892
  (0, 19)	0.09950371902099892
  (0, 6)	0.09950371902099892
  (0, 62)	0.09950371902099892
  (0, 64)	0.09950371902099892
  (0, 45)	0.09950371902099892
  (0, 31)	0.09950371902099892
  (0, 50)	0.09950371902099892
  (0,