In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Sample text documents (a mix of car-related and weather-related sentences)
documents = [
    "The car is fast and new.",
    "I love fast cars.",
    "The sun is bright and shining.",
    "I love the warm weather.",
    "Cars are great for fast transportation."
]

# Step 1: TF-IDF vectorization over single words, with scikit-learn's built-in
# English stop-word list removed. X has one row per document.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Step 2: Apply KMeans clustering to the TF-IDF rows.
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (and 1.2-1.3 emit a FutureWarning when it is omitted), so
# leaving it implicit makes the result depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X)

# Show the raw label assigned to each document, in input order
labels = kmeans.labels_
print("Cluster labels:", labels)

# Then list the documents grouped under each distinct cluster id
for cluster_id in np.unique(labels):
    print(f"\nCluster {cluster_id}:")
    for text, label in zip(documents, labels):
        if label == cluster_id:
            print(f"  {text}")


Cluster labels: [0 0 1 1 0]

Cluster 0:
  The car is fast and new.
  I love fast cars.
  Cars are great for fast transportation.

Cluster 1:
  The sun is bright and shining.
  I love the warm weather.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter
import spacy
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import string

# Fetch the NLTK stop-word corpus that `stopwords.words` reads below
nltk.download('stopwords')

# spaCy English pipeline; used later in this cell to tokenize documents
nlp = spacy.load("en_core_web_sm")

# Shared preprocessing resources:
#   stop_words        - set of English stop words for fast membership tests
#   stemmer           - Porter stemmer applied to every kept token
#   punctuation_table - translation table that deletes every ASCII punctuation char
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))

# Sample documents
documents = [
    "The team scored a goal in the game.",
    "Election results show a victory for the candidate.",
    "The player scored the winning goal.",
    "Climate change is affecting the environment.",
    "The vote count favored the candidate."
]

# Preprocessing function
def preprocess(text):
    """Normalize one document into a space-joined string of stemmed tokens.

    Steps, in order: delete punctuation via ``punctuation_table``, split on
    whitespace, lowercase each token, drop tokens found in ``stop_words``,
    and stem the survivors with ``stemmer``.

    Args:
        text: The raw document string.

    Returns:
        The processed tokens joined by single spaces (empty string if no
        token survives).
    """
    stripped = text.translate(punctuation_table)
    kept = []
    for raw_token in stripped.split():
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(stemmer.stem(lowered))
    return " ".join(kept)

# Preprocess documents.
# NOTE: this rebinds `documents` to the preprocessed text; the phrase steps
# further down in this cell read the preprocessed version.
documents = [preprocess(doc) for doc in documents]

# Step 1: Word-based document clustering using TF-IDF and KMeans.
# Stop words were already removed by preprocess(), hence stop_words=None.
vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(1, 1))
tfidf_matrix = vectorizer.fit_transform(documents)

# Cluster the documents (each row of tfidf_matrix is one document's
# unigram TF-IDF vector).
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (and 1.2-1.3 emit a FutureWarning when it is omitted), so
# leaving it implicit makes the result depend on the installed version.
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# fit_predict fits the model and returns the training labels in one pass,
# replacing the separate fit() + predict() over the same matrix.
doc_clusters = kmeans.fit_predict(tfidf_matrix)

# Print document clusters
print("Word-Based Clusters:")
for i, cluster in enumerate(doc_clusters):
    print(f"Document {i + 1}: Cluster {cluster}")

# Step 2: Extract Frequent Phrases using Spacy
def extract_phrases_spacy(documents, n=2):
    """Count every contiguous n-token sequence across the given documents.

    Each document is tokenized with the module-level spaCy pipeline ``nlp``;
    n-grams are built per document, so they never span two documents.

    Args:
        documents: Iterable of document strings.
        n: Number of tokens per phrase (default 2, i.e. bigrams).

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of token strings) to the
        number of times it occurs over all documents.
    """
    return Counter(
        gram
        for text in documents
        for gram in ngrams([tok.text for tok in nlp(text)], n)
    )

# Count bigrams (two-token phrases) over the preprocessed documents
phrase_counts = extract_phrases_spacy(documents, n=2)

# Keep the five highest-count phrases. Counter.most_common orders equal counts
# by first occurrence, so ties resolve to the earliest phrases seen.
top_phrases = phrase_counts.most_common(5)

print("\nTop Phrases:")
for words, occurrences in top_phrases:
    phrase_text = ' '.join(words)
    print(f"{phrase_text}: {occurrences}")

# Step 3: Assign documents to phrase-based clusters.
# phrase_clusters maps each top phrase (as a space-joined string) to the list
# of "Document N" labels whose text contains that phrase.
phrase_clusters = {}
for phrase, _ in top_phrases:
    phrase_text = ' '.join(phrase)
    # Pad both sides with a space so the phrase only matches on whole-word
    # boundaries; a bare substring test would also match inside longer words
    # (e.g. "score goal" inside "underscore goalie").
    padded_phrase = f" {phrase_text} "
    phrase_clusters[phrase_text] = [
        f"Document {i + 1}"
        for i, doc in enumerate(documents)
        if padded_phrase in f" {doc} "
    ]

print("\nPhrase-Based Clusters:")
for phrase, docs in phrase_clusters.items():
    print(f"Phrase '{phrase}': {', '.join(docs)}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word-Based Clusters:
Document 1: Cluster 2
Document 2: Cluster 0
Document 3: Cluster 2
Document 4: Cluster 1
Document 5: Cluster 0

Top Phrases:
team score: 1
score goal: 1
goal game: 1
elect result: 1
result show: 1

Phrase-Based Clusters:
Phrase 'team score': Document 1
Phrase 'score goal': Document 1
Phrase 'goal game': Document 1
Phrase 'elect result': Document 2
Phrase 'result show': Document 2


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

# Sample documents (raw, unprocessed text)
documents = [
    "The team scored a goal in the game.",
    "Election results show a victory for the candidate.",
    "The player scored the winning goal.",
    "Climate change is affecting the environment.",
    "The vote count favored the candidate."
]

# Small hand-written stop-word list; it is passed to TfidfVectorizer by the
# phrase-extraction step later in this cell.
stopwords = [
    'the', 'a', 'and', 'for',
    'of', 'to', 'is', 'in',
    'on', 'with', 'at', 'by',
    'as', 'this', 'that', 'from',
]

# Step 1: Word clusters.
# The clusters are defined by hand here (topic name -> set of lowercase member
# words); no co-occurrence analysis is actually computed.
word_clusters = {
    "Sports": {
        "team", "goal", "game", "player",
    },
    "Politics": {
        "election", "vote", "candidate",
    },
    "Environment": {
        "climate", "change", "environment",
    },
}

# Step 2: Represent Documents Using Word Clusters

def represent_documents(documents, word_clusters):
    """Map each document to the set of word-cluster names it touches.

    A document belongs to a cluster when at least one of its words appears in
    that cluster's word set. Words are obtained by lowercasing the document,
    splitting on whitespace and stripping leading/trailing punctuation, so a
    sentence-final "game." still matches the cluster word "game".

    Args:
        documents: Iterable of raw document strings.
        word_clusters: Dict mapping a cluster name to a set of lowercase
            member words.

    Returns:
        A list with one entry per document, in input order; each entry is the
        set of cluster names matched by that document (possibly empty).
    """
    import string  # local import: this cell's import block does not include it

    doc_representation = []
    for doc in documents:
        # Tokenize, lowercase, and trim punctuation attached to each word.
        doc_words = {word.strip(string.punctuation) for word in doc.lower().split()}
        # A token made only of punctuation strips down to ''; it is not a word.
        doc_words.discard('')
        doc_cluster = {
            cluster_name
            for cluster_name, words in word_clusters.items()
            if not doc_words.isdisjoint(words)
        }
        doc_representation.append(doc_cluster)
    return doc_representation

# One set of matched cluster names per document, in input order
doc_representation = represent_documents(documents, word_clusters)

# Print each document's matched clusters, numbering documents from 1
for doc_number, cluster_names in enumerate(doc_representation, start=1):
    print(f"Document {doc_number}: {cluster_names}")

# Step 3: Cluster Documents Based on Word Clusters

# Convert the set of clusters into a binary vector
def cluster_vector(doc_cluster, all_clusters):
    """Encode a document's cluster membership as a 0/1 list.

    Args:
        doc_cluster: Collection of cluster names the document belongs to.
        all_clusters: Ordered sequence of every cluster name; it fixes the
            position of each entry in the result.

    Returns:
        A list of ints the same length as ``all_clusters``: 1 where the
        cluster at that position is in ``doc_cluster``, otherwise 0.
    """
    return [int(name in doc_cluster) for name in all_clusters]

# Fixed cluster-name order; it defines the column order of every vector
all_clusters = list(word_clusters.keys())
doc_vectors = [cluster_vector(doc, all_clusters) for doc in doc_representation]

# Perform KMeans clustering on the binary membership vectors.
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (and 1.2-1.3 emit a FutureWarning when it is omitted), so
# leaving it implicit makes the result depend on the installed version.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(doc_vectors)

# Show clustering results
print("\nDocument Clusters (based on word clusters):")
for i, label in enumerate(kmeans.labels_):
    print(f"Document {i+1} is in Cluster {label}")

# Step 4: Phrase-Based Clustering (Frequent Phrase Mining)

# Define a function for extracting frequent phrases (bigrams) using TF-IDF
def extract_frequent_phrases(documents, n=2, top_n=5):
    """Rank n-grams by their TF-IDF weight summed over all documents.

    Despite the name, phrases are ordered by total TF-IDF weight rather than
    by raw occurrence count. Stop words are taken from the module-level
    ``stopwords`` list.

    Args:
        documents: Iterable of document strings.
        n: Phrase length in words; only n-grams of exactly this size are used.
        top_n: Maximum number of phrases to return.

    Returns:
        A list of up to ``top_n`` ``(phrase, score)`` tuples in descending
        score order. ``sorted`` is stable, so phrases with equal scores keep
        the order in which the vectorizer lists its features.
    """
    ngram_vectorizer = TfidfVectorizer(ngram_range=(n, n), stop_words=stopwords)
    weights = ngram_vectorizer.fit_transform(documents)

    # Column sums give one total weight per n-gram feature.
    totals = np.asarray(weights.sum(axis=0)).flatten()
    feature_names = ngram_vectorizer.get_feature_names_out()
    scored = dict(zip(feature_names, totals))

    ranked = sorted(scored.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Top bigrams of the raw documents, ranked by summed TF-IDF weight
frequent_phrases = extract_frequent_phrases(documents)

# Print each selected phrase alongside its score
print("\nFrequent Phrases (using TF-IDF):")
for phrase_text, tfidf_total in frequent_phrases:
    print(f"'{phrase_text}': {tfidf_total}")

# Step 5: Cluster Documents Based on Phrases
# Use a simple approach where we classify documents based on the presence of the frequent phrases




Document 1: {'Sports'}
Document 2: {'Politics'}
Document 3: {'Sports'}
Document 4: {'Environment'}
Document 5: {'Politics'}

Document Clusters (based on word clusters):
Document 1 is in Cluster 0
Document 2 is in Cluster 1
Document 3 is in Cluster 0
Document 4 is in Cluster 2
Document 5 is in Cluster 1

Frequent Phrases (using TF-IDF):
'affecting environment': 0.5773502691896258
'change affecting': 0.5773502691896258
'climate change': 0.5773502691896258
'count favored': 0.5773502691896258
'favored candidate': 0.5773502691896258

Phrase-Based Clustering Results:
