In [None]:
#tf idf
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the merged NewsAPI table; this cell reads its 'Lemmatized' and 'Title' columns.
df = pd.read_csv('merged_newsapi_data.csv')

# TF-IDF over unigrams and bigrams, English stop words dropped,
# vocabulary limited to 1000 terms.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
X = vectorizer.fit_transform(df['Lemmatized'])

# K-Means in the full TF-IDF space; the fixed seed keeps the assignment repeatable.
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)
df['Cluster'] = clusters

# Densify the TF-IDF matrix and project it onto two components, for plotting only.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.6)
ax.set_title('TF-IDF + K-Means Clustering (PCA Reduced)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(scatter, ax=ax, label='Cluster')
plt.show()

# The ten heaviest centroid coordinates name the terms that characterise each cluster.
print("\nTop Keywords per Cluster:")
feature_names = vectorizer.get_feature_names_out()
for i, centroid in enumerate(kmeans.cluster_centers_):
    cluster_words = centroid.argsort()[::-1][:10]  # descending weight, first ten
    print(f"Cluster {i}:", [feature_names[w] for w in cluster_words])

# Show the first three titles assigned to every cluster.
print("\nSample Titles per Cluster:")
for cluster_num in range(k):
    sample_titles = df.loc[df['Cluster'] == cluster_num, 'Title'].head(3)
    print(f"\nCluster {cluster_num}:")
    print(sample_titles.to_string(index=False))

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Read the article table; the 'Lemmatized' column is the text that gets vectorized.
df = pd.read_csv('merged_newsapi_data.csv')

# Unigram-only TF-IDF (English stop words removed, at most 1000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['Lemmatized'])

# Cluster the documents directly on their TF-IDF vectors.
k = 5
kmeans = KMeans(random_state=42, n_clusters=k)
clusters = kmeans.fit_predict(X)
df['Cluster'] = clusters

# Two principal components of the densified matrix are used as plot coordinates.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())

plt.figure(figsize=(10, 6))
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
plt.scatter(first_component, second_component, c=clusters, cmap='viridis', alpha=0.6)
plt.title('K-Means Clustering (TF-IDF + PCA)')
plt.show()

# For each centroid, list the ten terms with the largest weight, strongest first.
print("Top Keywords per Cluster:")
feature_names = vectorizer.get_feature_names_out()
for i in range(k):
    centroid = kmeans.cluster_centers_[i]
    cluster_words = centroid.argsort()[:-11:-1]
    keywords = [feature_names[w] for w in cluster_words]
    print(f"Cluster {i}: {keywords}")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np

# Local import: squareform is only needed by this cell.
from scipy.spatial.distance import squareform

# Load the articles; 'Lemmatized' is vectorized and 'Title' is used for the legend.
df = pd.read_csv('merged_newsapi_data.csv')

vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['Lemmatized'])

# Square (n_docs x n_docs) matrix of pairwise cosine distances.
cosine_dist = 1 - cosine_similarity(X)
# Remove floating-point noise so the matrix is a proper distance matrix:
# exact zeros on the diagonal and no tiny negative entries.
np.fill_diagonal(cosine_dist, 0.0)
cosine_dist = np.clip(cosine_dist, 0.0, None)

# linkage() interprets a 2-D array as observation vectors, NOT as distances, so
# the square matrix must be converted to condensed (upper-triangle) form first.
# checks=False tolerates last-bit asymmetry left over from the similarity product.
condensed_dist = squareform(cosine_dist, checks=False)

# Ward linkage is only defined for Euclidean distances; average linkage is valid
# for an arbitrary precomputed metric such as cosine distance.
Z = linkage(condensed_dist, method='average')

# Label leaves 1..n; the mapping from these numbers to titles is printed below.
num_labels = len(df)
numeric_labels = list(range(1, num_labels + 1))

plt.figure(figsize=(16, 6))
dendro = dendrogram(Z, labels=numeric_labels, leaf_rotation=90, leaf_font_size=7)

plt.title('Hierarchical Clustering Dendrogram (Cosine Distance)')
plt.xlabel('Document Index')
plt.ylabel('Distance')
plt.tight_layout()
plt.show()

# Legend for the numeric leaf labels used on the x-axis.
mapping = dict(zip(numeric_labels, df['Title']))

print("\n=== Document Index Mapping ===")
for idx, title in mapping.items():
    print(f"{idx}: {title}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # For 3D plotting

# Install missing packages if needed
try:
    import plotly
except ImportError:
    !pip install plotly nbformat --quiet
    import plotly

# Source table; the 'Lemmatized' column supplies the text for vectorization.
df = pd.read_csv('merged_newsapi_data.csv')

# Step 1 - unigram TF-IDF, English stop words removed, at most 1000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = vectorizer.fit_transform(df['Lemmatized'])

# Step 2 - densify and reduce to three principal components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_tfidf.toarray())

# Step 3 - K-Means is fitted on the 3-D PCA coordinates, not on the raw TF-IDF.
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Keep the assignment and the three coordinates alongside each article.
df['Cluster'] = clusters
for component_no, column in enumerate(('PCA1', 'PCA2', 'PCA3')):
    df[column] = X_pca[:, component_no]

# Step 4 - one 3-D scatter series per cluster so every cluster gets a legend entry.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
for i in range(k):
    members = df.loc[df['Cluster'] == i]
    ax.scatter(
        members['PCA1'],
        members['PCA2'],
        members['PCA3'],
        c=colors[i],
        label=f'Cluster {i}',
        alpha=0.6,
    )

ax.set(
    title='K-Means Clustering on 3D PCA-Reduced TF-IDF Vectors',
    xlabel='PCA Component 1',
    ylabel='PCA Component 2',
    zlabel='PCA Component 3',
)
ax.legend()
plt.tight_layout()
plt.show()

# 5. Print cluster keywords
# K-Means ran on PCA coordinates here, so its centroids are not in term space;
# instead the keywords are the terms with the highest mean TF-IDF weight among
# the documents assigned to each cluster.
print("\nTop Keywords per Cluster:")
feature_names = vectorizer.get_feature_names_out()
for i in range(k):
    # Positional boolean mask, so row selection does not depend on df's index labels.
    cluster_mask = (df['Cluster'] == i).to_numpy()
    # Sparse .mean(axis=0) returns a 1 x n_features matrix. Flatten it to 1-D so
    # that the slicing below acts on terms: on the 2-D matrix, [::-1] only
    # reversed the single row axis, leaving the words in ascending order and
    # printing them as one nested array.
    cluster_tfidf = np.asarray(X_tfidf[cluster_mask].mean(axis=0)).ravel()
    # Ten largest mean weights, strongest first.
    top_words = cluster_tfidf.argsort()[-10:][::-1]
    print(f"Cluster {i}: {[feature_names[w] for w in top_words]}")

# 6. Print sample titles from each cluster
print("\nSample Titles per Cluster:")
for i in range(k):
    print(f"\nCluster {i}:")
    print(df.loc[df['Cluster'] == i, 'Title'].head(3).to_string(index=False))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns

# Articles table; 'Lemmatized' is the text column that gets vectorized.
df = pd.read_csv('merged_newsapi_data.csv')

# Unigram TF-IDF (English stop words removed, vocabulary limited to 1000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = vectorizer.fit_transform(df['Lemmatized'])

# Three principal components of the densified matrix feed every model below.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_tfidf.toarray())

# Fit one K-Means model per candidate k and keep the model, the labels and the
# mean silhouette score so later steps can reuse them.
k_values = [3, 5, 7]
results = {}

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters = kmeans.fit_predict(X_pca)
    score = silhouette_score(X_pca, clusters)
    results[k] = {'model': kmeans, 'clusters': clusters, 'silhouette': score}

    # One 3-D figure per k, with a separate scatter series for each cluster.
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    for i in range(k):
        members = X_pca[clusters == i]
        ax.scatter(members[:, 0], members[:, 1], members[:, 2],
                   label=f'Cluster {i}', alpha=0.6)

    ax.set_title(f'K-Means Clustering (k={k})\nSilhouette: {score:.3f}')
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    ax.set_zlabel('PCA 3')
    ax.legend()
    plt.tight_layout()
    plt.show()

# Silhouette diagram: one stacked panel per candidate k. The number of rows is
# taken from k_values so that editing that list cannot overflow a fixed grid.
plt.figure(figsize=(12, 8))
n_panels = len(k_values)
for i, k in enumerate(k_values):
    plt.subplot(n_panels, 1, i + 1)

    cluster_labels = results[k]['clusters']
    silhouette_vals = silhouette_samples(X_pca, cluster_labels)

    # Clusters are drawn as horizontal bands, separated by a 10-row gap.
    y_lower = 10
    for j in range(k):
        # Per-sample silhouette values of cluster j, sorted ascending so the
        # band has the usual knife-shaped outline.
        jth_cluster_silhouette = np.sort(silhouette_vals[cluster_labels == j])
        band_height = jth_cluster_silhouette.shape[0]

        y_upper = y_lower + band_height

        color = plt.cm.nipy_spectral(float(j) / k)
        plt.fill_betweenx(np.arange(y_lower, y_upper),
                          0, jth_cluster_silhouette,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Cluster number placed at the vertical middle of its band.
        plt.text(-0.05, y_lower + 0.5 * band_height, str(j))
        y_lower = y_upper + 10

    plt.title(f'Silhouette Plot for k={k}')
    plt.xlabel("Silhouette coefficient values")
    plt.ylabel("Cluster label")
    # Dashed red line marks the mean silhouette score for this k.
    plt.axvline(x=results[k]['silhouette'], color="red", linestyle="--")
    plt.yticks([])

plt.tight_layout()
plt.show()

# Ward linkage computed on the 3-D PCA coordinates (rows are observations).
Z = linkage(X_pca, method='ward')

# Draw only the last 20 merged clusters of the tree.
plt.figure(figsize=(15, 7))
dendrogram(Z, p=20, truncate_mode='lastp', show_leaf_counts=True)
axes = plt.gca()
axes.set_title('Hierarchical Clustering Dendrogram')
axes.set_xlabel('Document Index')
axes.set_ylabel('Distance')
plt.show()
# k is fixed by hand here; it is not derived from the silhouette scores stored in `results`.
best_k = 3
kmeans = results[best_k]['model']
feature_names = vectorizer.get_feature_names_out()
best_clusters = results[best_k]['clusters']

# The models were fitted on PCA coordinates, so keywords come from the mean
# TF-IDF weight of each cluster's documents rather than from the centroids.
print(f"\nTop Keywords per Cluster (k={best_k}):")
for i in range(best_k):
    cluster_mask = (best_clusters == i)
    # Sparse .mean(axis=0) returns a 1 x n_features matrix. Flatten it to 1-D so
    # the slice/reverse below orders terms: on the 2-D matrix, [::-1] reversed
    # only the single row axis, so words were listed weakest-first and printed
    # as one nested array.
    cluster_tfidf = np.asarray(X_tfidf[cluster_mask].mean(axis=0)).ravel()
    # Ten largest mean weights, strongest first.
    top_words = cluster_tfidf.argsort()[-10:][::-1]
    print(f"Cluster {i}: {[feature_names[w] for w in top_words]}")

    print("Sample Titles:")
    print(df.loc[cluster_mask, 'Title'].head(3).to_string(index=False))
    print("\n" + "-"*50 + "\n")