In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [6]:


# Fetch every NLTK resource the notebook needs: the stop-word corpus and
# both punkt tokenizer packages.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Read the pipe-delimited table of book titles
df = pd.read_csv('./book_titles.csv', sep='|')

# Pull out the 'simplified_title' column, substituting '' for missing values
# so that downstream string operations never receive NaN
titles = df.loc[:, 'simplified_title'].fillna('')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/francesco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/francesco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/francesco/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:

# Preprocessing helpers

# Lazily-filled cache for the objects that are identical for every title.
# Building them per call meant re-reading the stop-word corpus once per row.
_TEXT_TOOLS = {}


def _text_tools():
    """Return ``(stop_words, stemmer)``, constructing them on first use only."""
    if not _TEXT_TOOLS:
        # Build both before storing so a failed corpus lookup leaves the
        # cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        _TEXT_TOOLS['stop_words'] = stop_words
        _TEXT_TOOLS['stemmer'] = stemmer
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise one title into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; delete every character that is neither a
    word character nor whitespace; delete digit runs; tokenize with
    ``nltk.word_tokenize``; drop English stop words; Porter-stem what is left.

    Parameters
    ----------
    text : str
        The raw title. Must be a string (``.lower()`` is called on it).

    Returns
    -------
    str
        The surviving stems joined by single spaces; ``''`` if none survive.
    """
    stop_words, stemmer = _text_tools()

    # Lowercase
    text = text.lower()
    # Remove punctuation and numbers. Characters are deleted, not replaced by
    # a space, so "self-help" becomes "selfhelp". Underscores count as word
    # characters for \w and are therefore kept.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize, drop stop words, and stem in a single pass
    return ' '.join(
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    )

# Run every title through the preprocessing function, element by element
processed_titles = titles.map(preprocess_text)


LookupError: 
**********************************************************************
  Resource punkt_tab not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt_tab')

  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt_tab/english/

  Searched in:
    - '/home/francesco/nltk_data'
    - '/home/francesco/miniconda3/envs/py310_data_analysis/nltk_data'
    - '/home/francesco/miniconda3/envs/py310_data_analysis/share/nltk_data'
    - '/home/francesco/miniconda3/envs/py310_data_analysis/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:

# Tunable parameters, named once so the model, the plot title and the
# reporting loop cannot drift out of sync with each other.
N_CLUSTERS = 20      # number of K-Means clusters ("topics")
MAX_FEATURES = 1000  # vocabulary cap for TF-IDF, to limit matrix width
TOP_TERMS = 10       # terms reported per cluster
SEED = 42            # shared seed for every stochastic step

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X = vectorizer.fit_transform(processed_titles)

# Apply K-Means clustering.
# n_init is pinned explicitly: scikit-learn's default changed from 10 to
# 'auto' in 1.4 (with a FutureWarning in 1.2-1.3), so relying on the default
# gives different results depending on the installed version.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=SEED)
clusters = kmeans.fit_predict(X)

# Add cluster labels to the dataframe
df['cluster'] = clusters

# Optional: Visualize clusters using PCA (reduce to 2D).
# The sparse TF-IDF matrix is densified for PCA. random_state is set because
# PCA's automatic solver choice may select the randomized SVD solver.
pca = PCA(n_components=2, random_state=SEED)
X_pca = pca.fit_transform(X.toarray())

fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis')
ax.set_title(f'Book Titles Clustered into {N_CLUSTERS} Topics (PCA Visualization)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

# Print cluster assignments (first 10 for brevity)
print(df[['simplified_title', 'cluster']].head(10))

# To see top terms per cluster: rank each centroid's TF-IDF weights, highest
# first. Iterating the centroids directly keeps the loop tied to the fitted
# model rather than to a separately hard-coded cluster count.
terms = vectorizer.get_feature_names_out()
for cluster_id, cluster_center in enumerate(kmeans.cluster_centers_):
    top_terms_idx = cluster_center.argsort()[::-1][:TOP_TERMS]
    top_terms = [terms[idx] for idx in top_terms_idx]
    print(f"Cluster {cluster_id}: {', '.join(top_terms)}")