In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from collections import defaultdict
import numpy as np

# Download necessary NLTK data
#nltk.download('punkt')
#nltk.download('stopwords')

# Sample text
text = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human languages in a valuable way. In practice, this involves various tasks such as text classification, machine translation, speech recognition, and sentiment analysis, among others.
The complexity of human language makes NLP a particularly challenging field in AI, as it requires a deep understanding of both linguistic theory and computational techniques. Modern NLP often relies on machine learning algorithms, especially deep learning models, to automatically learn the intricacies of language from large datasets.
"""

# Preprocess the text
def preprocess_text(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize the sentences into words
    words = [word_tokenize(sentence.lower()) for sentence in sentences]
    #(list of words from sentence) 
    
    # Remove stopwords and non-alphanumeric words
    stop_words = set(stopwords.words('english'))
    filtered_words = [[word for word in word_list if word.isalnum() and word not in stop_words] for word_list in words]
    
    # word.isalnum() checks if the word is alphanumeric (contains only letters and numbers). 
    # This helps filter out punctuation, special characters, etc. For example, it would remove '(' and ')' from the example above.
    
    return sentences, filtered_words

# Rank sentences based on word frequency
def rank_sentences(sentences, filtered_words):
    # Flatten the list of filtered words
    flat_words = [word for sublist in filtered_words for word in sublist]
    # Flattening the List: The code flattens filtered_words into a single list called flat_words. 
    # Instead of having a list of lists, you'll end up with a single list containing all the words.
    
    # Calculate word frequency distribution
    word_freq = FreqDist(flat_words)

    # Score each sentence by summing the frequency of the words in it
    sentence_scores = defaultdict(int)
    for i, word_list in enumerate(filtered_words):
        # sentence_scores[i] += word_freq[word]: This adds the frequency of the current word to the score of the sentence at index i in the sentence_scores dictionary.
        for word in word_list:
            sentence_scores[i] += word_freq[word]
    
    # Sort sentences by score
    ranked_sentences = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)
    # key=lambda item: item[1] above fun gives tuples as o/p,in that item[1] means selecting rank wise means key,value anuko, based on value ani
    return ranked_sentences

# Generate summary
def summarize(text, n=2):
    # Preprocess the text
    sentences, filtered_words = preprocess_text(text)
    
    # Rank sentences
    ranked_sentences = rank_sentences(sentences, filtered_words)
    
    # Extract the top n sentences for the summary
    summary_sentences = [sentences[idx] for idx, score in ranked_sentences[:n]]
    #  means top index lu evaite vasthayo aa indexes tho unna sentences ni idi summary_sentences lo peduthundi
    # Combine the summary sentences into a single string
    summary = ' '.join(summary_sentences)
    
    return summary

# Summarize the text
summary = summarize(text, n=2)
print("Original Text:\n", text)
print("\nSummary:\n", summary)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RANGUMUDRI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RANGUMUDRI\AppData\Roaming\nltk_data...


Original Text:
 
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human languages in a valuable way. In practice, this involves various tasks such as text classification, machine translation, speech recognition, and sentiment analysis, among others.
The complexity of human language makes NLP a particularly challenging field in AI, as it requires a deep understanding of both linguistic theory and computational techniques. Modern NLP often relies on machine learning algorithms, especially deep learning models, to automatically learn the intricacies of language from large datasets.


Summary:
 Modern NLP often relies on machine learning algorithms, especially deep learning models, to automatically learn the intricacies of language from large datasets. 
Natural language processing (NLP) is a

[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords

# Sample text data
documents = [
    "Natural language processing is a branch of artificial intelligence.", 
    "Machine learning enables computers to learn from data.",
    "Deep learning is a subset of machine learning.",
    "Artificial intelligence is the future of technology.",
    "Text clustering is an unsupervised learning technique.",
    "K-means is a popular clustering algorithm.",
    "Supervised learning requires labeled data.",
    "Unsupervised learning works with unlabeled data."
]

# Preprocess the text
def preprocess_text(documents):
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    processed_docs = []
    
    for doc in documents:
        words = nltk.word_tokenize(doc.lower())
        filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
        processed_docs.append(" ".join(filtered_words))
        # where the goal is often to group similar documents together based on meaning,
        # preserving the original word form can be more beneficial.
    
    return processed_docs

# Preprocess the documents
processed_docs = preprocess_text(documents) # processed_docs name was not in outside...it was only in def, so initialize it

# Convert documents to TF-IDF matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_docs)

# Apply K-Means clustering
num_clusters = 3  # You can change the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(X)

# Print the clusters and their respective documents
clusters = kmeans.labels_.tolist()
for i in range(num_clusters): # 0,1,2 are the range
    print(f"Cluster {i+1}:")  # i = 0 +1 = 1,   i = 1 +1 = 2,  i = 2 +1 = 3....for cluster numbering from 1 to so on
    for idx, label in enumerate(clusters): # [1, 2, 2, 1, 0, 0, 2, 0]
        if label == i:
            print(f" - {documents[idx]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RANGUMUDRI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cluster 1:
 - Text clustering is an unsupervised learning technique.
 - K-means is a popular clustering algorithm.
 - Unsupervised learning works with unlabeled data.
Cluster 2:
 - Natural language processing is a branch of artificial intelligence.
 - Artificial intelligence is the future of technology.
Cluster 3:
 - Machine learning enables computers to learn from data.
 - Deep learning is a subset of machine learning.
 - Supervised learning requires labeled data.
