In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from tqdm import tqdm
import seaborn as sns

In [None]:
# Path of the sentence-level feature table (CSV), relative to the notebook
INPUT_FILE = '../data/processed/sentence_features.csv'

# Read the whole table into a DataFrame; the later cells work on `df`
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)

In [None]:
# Inspect pairwise correlations between the features used for clustering
feature_cols = [
    'num_tokens',
    'avg_word_length',
    'ttr',
    'noun_ratio',
    'verb_ratio',
    'adj_ratio',
]
feature_data = df[feature_cols]
corr_matrix = feature_data.corr()

# Draw the annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title("Feature Correlation Matrix (Cleaned Features)")
plt.show()

In [None]:
# Build the feature matrix: the selected feature columns as a NumPy array
# (rows stay in the same order as `df`)
X = df.loc[:, feature_cols].to_numpy()

In [None]:
# Standardize the features: fit the scaler on X, then apply it to the same data
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Dimensionality reduction (PCA)
# Project the scaled features onto their first two principal components.
# random_state is pinned (same seed as the KMeans step) because, with the
# default svd_solver='auto', scikit-learn may pick the randomized SVD solver,
# which would otherwise make the projected coordinates vary between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Clustering (KMeans) on the scaled feature matrix
# Number of clusters; can be changed accordingly
k = 5

# Seeded estimator with 10 initializations; after fitting, the per-row
# cluster labels are read from the fitted model
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

In [None]:
# Attach the cluster label and the two PCA coordinates to the DataFrame,
# one new column per entry (insertion order: cluster, pca_x, pca_y)
new_columns = {
    'cluster': clusters,
    'pca_x': X_pca[:, 0],
    'pca_y': X_pca[:, 1],
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values

In [None]:
# Plot the sentences in the 2-D PCA space, colored by cluster label
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(df['pca_x'], df['pca_y'], c=df['cluster'], cmap='viridis', alpha=0.6)
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title(f'Sentence Clusters (K={k})')

# Cluster ids are categorical, so show a discrete legend with one entry per
# cluster instead of a continuous colorbar, which would suggest in-between
# values. num=None asks for one handle per unique label.
handles, labels = scatter.legend_elements(num=None)
# A fixed location avoids the potentially slow 'best' placement search
ax.legend(handles, labels, title='Cluster', loc='upper right')
plt.show()

In [None]:
# Persist the DataFrame (including the cluster and PCA columns) as CSV,
# without writing the index, then report completion
df.to_csv(
    path_or_buf='../data/processed/sentence_clusters.csv',
    index=False,
)
print("✅ Clustering complete. Results saved to sentence_clusters.csv")