In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.svm import SVC
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the pre-computed feature table. Only the clustering dataset is used;
# merging in 'features_generated_known_data.csv' via pd.concat is disabled.
df1 = pd.read_csv('features_generated_for_clustering.csv')
result = df1

# Keep the 'File' identifiers aside; every remaining column is a feature.
file_names = result['File']
features = result.drop(columns=['File'])

# Two-stage scaling: standardise each column, then min-max normalise the result.
scaler = StandardScaler()
normalizer = MinMaxScaler()
features_scaled = normalizer.fit_transform(scaler.fit_transform(features))

# Wrap the scaled array in a DataFrame that carries the original column names.
features_scaled_df = pd.DataFrame(features_scaled, columns=features.columns)


In [None]:
# Report the (rows, columns) shape, then display the scaled feature table.
print(features_scaled_df.shape)
features_scaled_df


In [None]:
# Display the 'File' column that was separated from the feature columns.
file_names

In [None]:
# Pairwise correlation (pandas default method) between all scaled feature columns.
correlation_matrix_features_scaled = features_scaled_df.corr()

In [None]:
# Annotated correlation heatmap of the scaled features, before any pruning.
fig, ax = plt.subplots(figsize=(100, 100))
sns.heatmap(
    correlation_matrix_features_scaled,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap (before reomving high-correlation features)")
plt.show()

In [None]:

# Prune features that are almost collinear with another feature.

# Absolute values, so a strong negative correlation counts as well.
corr_matrix = correlation_matrix_features_scaled.abs()

# Keep only entries strictly above the diagonal so each pair is examined once;
# everything else becomes NaN.
above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(above_diagonal)

# A column is dropped when its correlation with any earlier column exceeds 0.95.
is_redundant = (upper_triangle > 0.95).any(axis=0)
to_drop = upper_triangle.columns[is_redundant.to_numpy()].tolist()
features_reduced_df = features_scaled_df.drop(columns=to_drop)


In [None]:

# Annotated correlation heatmap of the features that survived the pruning step.
reduced_correlations = features_reduced_df.corr()
fig, ax = plt.subplots(figsize=(100, 100))
sns.heatmap(
    reduced_correlations,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap (with high-correlation features removed)")
plt.show()


In [None]:
# Report the (rows, columns) shape after pruning, then display the reduced table.
print(features_reduced_df.shape)
features_reduced_df

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:

# # Calculate VIF for each feature in the DataFrame
# vif_data = pd.DataFrame()
# vif_data['feature'] = features_reduced_df.columns
# vif_data['VIF'] = [variance_inflation_factor(features_reduced_df.values, i) 
#                    for i in range(features_reduced_df.shape[1])]

# # Plot VIF scores for all features
# plt.figure(figsize=(10, 8))
# plt.barh(vif_data['feature'], vif_data['VIF'], color="skyblue")
# plt.xlabel('VIF Score')
# plt.ylabel('Features')
# plt.title('VIF Scores for All Features')
# plt.legend()
# plt.gca().invert_yaxis()
# plt.show()


In [None]:
# from sklearn.neighbors import LocalOutlierFactor
# from sklearn.ensemble import IsolationForest
# from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns


In [None]:

# # Step 1: Initialize Models
# lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
# isoforest = IsolationForest(contamination=0.1, random_state=42)

# # Step 2: Apply LOF
# lof_outliers = lof.fit_predict(features_reduced_df)
# lof_scores = -lof.negative_outlier_factor_  # LOF scores, higher means more outlier-like

# # Step 3: Apply Isolation Forest
# isoforest.fit(features_reduced_df)
# iso_outliers = isoforest.predict(features_reduced_df)
# iso_scores = isoforest.decision_function(features_reduced_df)  # Higher means less outlier-like


In [None]:

# # Converting LOF results to 0s and 1s (1 for inliers, -1 for outliers)
# lof_outliers = np.where(lof_outliers == 1, 0, 1)
# iso_outliers = np.where(iso_outliers == 1, 0, 1)

# # Step 4: Compare Outlier Detection Counts
# print(f"LOF Outliers: {np.sum(lof_outliers)}")
# print(f"Isolation Forest Outliers: {np.sum(iso_outliers)}")


In [None]:

# # Step 5: Plot the distribution of scores
# plt.figure(figsize=(12, 6))

# # LOF Scores
# plt.subplot(1, 2, 1)
# sns.histplot(lof_scores, bins=30, kde=True)
# plt.title("LOF Outlier Scores Distribution")
# plt.xlabel("LOF Score (Higher is more outlier-like)")

# # Isolation Forest Scores
# plt.subplot(1, 2, 2)
# sns.histplot(iso_scores, bins=30, kde=True)
# plt.title("Isolation Forest Scores Distribution")
# plt.xlabel("Isolation Score (Lower is more outlier-like)")

# plt.tight_layout()
# plt.show()

# # Step 6: (Optional) Precision, Recall, F1 Score
# # If you have labeled data for outliers, use these metrics to evaluate model performance:
# # Assume `true_labels` contains 0 for inliers, 1 for actual outliers (if available)
# # print("LOF Precision, Recall, F1:", precision_score(true_labels, lof_outliers),
# #       recall_score(true_labels, lof_outliers), f1_score(true_labels, lof_outliers))
# # print("Isolation Forest Precision, Recall, F1:", precision_score(true_labels, iso_outliers),
# #       recall_score(true_labels, iso_outliers), f1_score(true_labels, iso_outliers))


In [None]:

# Display the correlation-pruned feature table.
# Optional (disabled): put the 'File' column back as the first column.
# features_reduced_df.insert(0, 'File', file_names)
features_reduced_df


In [None]:
import umap
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE

In [None]:
# Feature engineering, step 1: PCA on the scaled feature array.
# The input is `features_scaled` (all scaled columns), not the
# correlation-pruned `features_reduced_df`.
pca = PCA(n_components=0.95)  # a float asks PCA to keep enough components for 95% of the variance
features_pca = pca.fit_transform(features_scaled)

# Per-component share of the variance and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# First position where the running total reaches 0.95 (+1 turns the index into a count).
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

# Cumulative explained-variance curve with the 95% threshold drawn in red.
component_counts = range(1, len(cumulative_variance) + 1)
plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.axhline(y=0.95, color='r', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance in PCA')
plt.show()
print(f"Number of components that retain 95% of the variance: {n_components_95}")



In [None]:

# Candidate feature representations. Only the UMAP embedding is handed to the
# clustering cells; the other representations are computed for comparison.

# 2. Feature selection
# No class labels are available, so a placeholder binary target is used.
# Both the placeholder target and the mutual-information estimator are seeded
# so that Restart & Run All reproduces the same selected columns.
# NOTE(review): scores against a random target carry no real signal; treat
# `features_selected` as a placeholder until a real target exists.
rng = np.random.default_rng(42)
placeholder_target = rng.integers(0, 2, size=features.shape[0])
# k must not exceed the number of available columns.
n_selected = min(200, features_scaled.shape[1])
selector = SelectKBest(
    lambda X, y: mutual_info_classif(X, y, random_state=42),
    k=n_selected,
)
features_selected = selector.fit_transform(features_scaled, placeholder_target)

# 3. t-SNE (2 components) on the scaled features
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
features_tsne = tsne.fit_transform(features_scaled)

# 4. UMAP: 55-dimensional embedding of the PCA output
umap_reducer = umap.UMAP(n_components=55, random_state=42)
features_umap = umap_reducer.fit_transform(features_pca)

# 5. Kernel PCA (RBF kernel) on the scaled features
kpca = KernelPCA(n_components=50, kernel='rbf', gamma=0.1)
features_kpca = kpca.fit_transform(features_scaled)

# Representation used by every clustering cell below. An earlier heuristic
# (pick SelectKBest or PCA, whichever has fewer columns) is no longer applied.
features_for_clustering = features_umap


In [None]:
from scipy.spatial.distance import pdist, squareform
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:

# Pairwise Euclidean distances between samples: pdist returns the condensed
# vector, squareform expands it into the full square matrix.
condensed_distances = pdist(features_for_clustering, metric='euclidean')
distance_matrix = squareform(condensed_distances)

# Heatmap of the sample-by-sample distance matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(distance_matrix, cmap='viridis', cbar=True, ax=ax)
ax.set_title("Euclidean Distance Matrix")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Sample Index")
plt.show()


In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score


In [None]:
# Number of clusters requested from the KMeans and agglomerative models below.
clusters_needed = 6

In [None]:

# Clustering, part 1: KMeans with a fixed seed.
kmeans = KMeans(n_clusters=clusters_needed, random_state=42)
kmeans.fit(features_for_clustering)
kmeans_labels = kmeans.labels_  # per-sample cluster assignment from the fitted model

print("KMeans Clustering:")

# Internal validity metrics; these names are read again by the summary cell.
silhouette_kmeans, dbi_kmeans, ch_kmeans = (
    metric(features_for_clustering, kmeans_labels)
    for metric in (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
)

print("K-means Clustering Metrics:")
print(f"Silhouette Score: {silhouette_kmeans:.4f}")
print(f"Davies-Bouldin Index: {dbi_kmeans:.4f}")
print(f"Calinski-Harabasz Index: {ch_kmeans:.4f}")


In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score


In [None]:

# for linkage in ['ward', 'complete', 'average', 'single']:
#     hierarchical = AgglomerativeClustering(n_clusters=clusters_needed, linkage=linkage)
#     labels = hierarchical.fit_predict(features_for_clustering)
    
#     print(f"Linkage: {linkage}")
#     print(f"Silhouette Score: {silhouette_score(features_for_clustering, labels):.4f}")
#     print(f"Davies-Bouldin Index: {davies_bouldin_score(features_for_clustering, labels):.4f}")
#     print(f"Calinski-Harabasz Index: {calinski_harabasz_score(features_for_clustering, labels):.4f}\n")

#     # Plot the Dendrogram
#     plt.figure(figsize=(10, 7))
#     dendrogram = sch.dendrogram(sch.linkage(features_for_clustering, method=linkage))

#     # Add a horizontal line to show where the dendrogram is being cut for 6 clusters
#     plt.axhline(y=150, color='r', linestyle='--')  # Adjust the height where the cut is made
#     plt.title(f'Hierarchical Clustering Dendrogram: {linkage}')
#     plt.xlabel('Sample Index')
#     plt.ylabel('Distance')
#     plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import seaborn as sns

# 2-D projections used only for plotting. They do not depend on the linkage
# method, so they are computed once, outside the loop.
# NOTE: this rebinds `pca` and `tsne`, which earlier cells also assign.
pca = PCA(n_components=2)
tsne = TSNE(n_components=2, random_state=42)
features_pca_2d = pca.fit_transform(features_for_clustering)
features_tsne_2d = tsne.fit_transform(features_for_clustering)

# For each linkage criterion: cluster, print the validity metrics, draw the
# dendrogram with the cut that yields `clusters_needed` clusters, then show
# the labels on the PCA and t-SNE projections.
for linkage in ['ward', 'complete', 'average', 'single']:
    hierarchical = AgglomerativeClustering(n_clusters=clusters_needed, linkage=linkage)
    labels = hierarchical.fit_predict(features_for_clustering)

    print(f"Linkage: {linkage}")
    print(f"Silhouette Score: {silhouette_score(features_for_clustering, labels):.4f}")
    print(f"Davies-Bouldin Index: {davies_bouldin_score(features_for_clustering, labels):.4f}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz_score(features_for_clustering, labels):.4f}\n")

    # Dendrogram for the same linkage method.
    linkage_matrix = sch.linkage(features_for_clustering, method=linkage)
    plt.figure(figsize=(10, 7))
    dendrogram = sch.dendrogram(linkage_matrix)

    # Column 2 of the linkage matrix holds the merge distances. Cutting midway
    # between the merge that leaves `clusters_needed` clusters and the next
    # merge marks where the tree is split, whatever the distance scale of the
    # linkage method is (a fixed height cannot fit all four methods).
    if 2 <= clusters_needed <= len(linkage_matrix):
        cut_height = (
            linkage_matrix[-clusters_needed, 2]
            + linkage_matrix[-(clusters_needed - 1), 2]
        ) / 2
        plt.axhline(y=cut_height, color='r', linestyle='--')

    plt.title(f'Hierarchical Clustering Dendrogram: {linkage}')
    plt.xlabel('Sample Index')
    plt.ylabel('Distance')
    plt.show()

    # Cluster labels on the 2-D PCA projection (left) and t-SNE projection (right).
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    sns.scatterplot(x=features_pca_2d[:, 0], y=features_pca_2d[:, 1], hue=labels, palette='viridis')
    plt.title(f'2D PCA: {linkage} Linkage')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')

    plt.subplot(1, 2, 2)
    sns.scatterplot(x=features_tsne_2d[:, 0], y=features_tsne_2d[:, 1], hue=labels, palette='viridis')
    plt.title(f'2D t-SNE: {linkage} Linkage')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')

    plt.tight_layout()
    plt.show()


In [None]:
import scipy.cluster.hierarchy as sch

In [None]:

# Clustering, part 2: agglomerative clustering with complete linkage.
hierarchical = AgglomerativeClustering(n_clusters=clusters_needed, linkage="complete")
hierarchical_labels = hierarchical.fit_predict(features_for_clustering)

# Internal validity metrics; these names are read again by the summary cell.
print("\nHierarchical Clustering:")
silhouette_hierarchical = silhouette_score(features_for_clustering, hierarchical_labels)
dbi_hierarchical = davies_bouldin_score(features_for_clustering, hierarchical_labels)
ch_hierarchical = calinski_harabasz_score(features_for_clustering, hierarchical_labels)
print("\nHierarchical Clustering Metrics:")
print(f"Silhouette Score: {silhouette_hierarchical:.4f}")
print(f"Davies-Bouldin Index: {dbi_hierarchical:.4f}")
print(f"Calinski-Harabasz Index: {ch_hierarchical:.4f}")

# Dendrogram for the same (complete) linkage.
linkage_matrix = sch.linkage(features_for_clustering, method='complete')
plt.figure(figsize=(10, 7))
dendrogram = sch.dendrogram(linkage_matrix)

# Column 2 of the linkage matrix holds the merge distances. The line is drawn
# midway between the merge that leaves `clusters_needed` clusters and the
# next merge, so it always marks the actual cut instead of a fixed height.
if 2 <= clusters_needed <= len(linkage_matrix):
    cut_height = (
        linkage_matrix[-clusters_needed, 2]
        + linkage_matrix[-(clusters_needed - 1), 2]
    ) / 2
    plt.axhline(y=cut_height, color='r', linestyle='--')

plt.title('Hierarchical Clustering Dendrogram: complete')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.show()


In [None]:
# from sklearn.cluster import DBSCAN

# dbscan = DBSCAN(eps=12, min_samples=3)
# dbscan_labels = dbscan.fit_predict(features_for_clustering)

# print("DBSCAN metrics")
# print(dbscan_labels)

# # Filter out noise points for metrics calculation
# core_samples_mask = dbscan_labels != -1
# core_features = features_for_clustering[core_samples_mask]
# core_labels = dbscan_labels[core_samples_mask]

# # Calculate metrics only on core points
# if len(np.unique(core_labels)) > 1:  # Ensure there's more than 1 cluster
#     silhouette = silhouette_score(core_features, core_labels)
#     davies_bouldin = davies_bouldin_score(core_features, core_labels)
#     calinski_harabasz = calinski_harabasz_score(core_features, core_labels)
#     print("\nDBScan Clustering Metrics:")
#     print(f"Silhouette Score (core points): {silhouette:.4f}")
#     print(f"Davies-Bouldin Index (core points): {davies_bouldin:.4f}")
#     print(f"Calinski-Harabasz Index (core points): {calinski_harabasz:.4f}")
# else:
#     print("Not enough clusters for metric calculation.")



In [None]:

# # 3. SVC-based Clustering (using linear kernel for simplicity)
# svc = SVC(kernel='linear')
# svc.fit(features_for_clustering, kmeans_labels)  # Fit SVC to predict cluster labels
# svc_labels = svc.predict(features_for_clustering)

# # Evaluate SVC-based clustering
# print("\nSVC-based Clustering:")
# silhouette_svc = silhouette_score(features_for_clustering, svc_labels)
# dbi_svc = davies_bouldin_score(features_for_clustering, svc_labels)
# ch_svc = calinski_harabasz_score(features_for_clustering, svc_labels)
# print(f"Silhouette Score: {silhouette_score(features_for_clustering, svc_labels)}")


In [None]:
# Summary of the validity metrics for the KMeans and hierarchical models.
metric_summary = (
    ("K-means Clustering Metrics:", silhouette_kmeans, dbi_kmeans, ch_kmeans),
    ("\nHierarchical Clustering Metrics:", silhouette_hierarchical, dbi_hierarchical, ch_hierarchical),
)
for heading, silhouette, dbi, ch in metric_summary:
    print(heading)
    print(f"Silhouette Score: {silhouette:.4f}")
    print(f"Davies-Bouldin Index: {dbi:.4f}")
    print(f"Calinski-Harabasz Index: {ch:.4f}")

# Calculate metrics

# print("\nSVC Clustering Metrics:")
# print(f"Silhouette Score: {silhouette_svc:.4f}")
# print(f"Davies-Bouldin Index: {dbi_svc:.4f}")
# print(f"Calinski-Harabasz Index: {ch_svc:.4f}")

In [None]:
# Collect the per-file cluster assignments of both models into one table
# (the SVC label column is disabled) and write it to 'labels.csv'.
results_df = file_names.to_frame(name='File').assign(
    KMeans_Label=kmeans_labels,
    Hierarchical_Label=hierarchical_labels,
)

results_df.to_csv('labels.csv', index=False)


# print("\nClustering Results:")
# print(results_df.head())

# # Visualization of clustering (Optional)
# plt.figure(figsize=(12, 6))
# plt.subplot(1, 3, 1)
# sns.scatterplot(x=features_for_clustering[:, 0], y=features_for_clustering[:, 1], hue=kmeans_labels, palette='viridis')
# plt.title('KMeans Clustering')
# plt.subplot(1, 3, 2)
# sns.scatterplot(x=features_for_clustering[:, 0], y=features_for_clustering[:, 1], hue=hierarchical_labels, palette='viridis')
# plt.title('Hierarchical Clustering')
# plt.subplot(1, 3, 3)
# sns.scatterplot(x=features_for_clustering[:, 0], y=features_for_clustering[:, 1], hue=svc_labels, palette='viridis')
# plt.title('SVC-based Clustering')
# plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.svm import SVC
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns


I have a labels array `kmeans_labels` that holds the cluster label of each file in the `file_names` array. I now want to overwrite the (integer) labels in `kmeans_labels` for files whose names start with 'Jana-Gana' with 6, 'Michael-Jackson' with 7, 'Asha-Bhosle' with 8, 'Kishore-Kumar' with 8, 'Bhavgeet' with 9, and 'Lavni' with 10.

In [None]:
# # Iterate over file_names and update kmeans_labels based on conditions
# for idx, file_name in enumerate(file_names):
#     if file_name.startswith('Jana-Gana'):
#         kmeans_labels[idx] = 6
#         hierarchical_labels[idx] = 6
#     elif file_name.startswith('Michael-Jackson'):
#         kmeans_labels[idx] = 7
#         hierarchical_labels[idx] = 7
#     elif file_name.startswith('Asha-Bhosle') or file_name.startswith('Kishore-Kumar'):
#         kmeans_labels[idx] = 8
#         hierarchical_labels[idx] = 8
#     elif file_name.startswith('Bhavgeet'):
#         kmeans_labels[idx] = 9
#         hierarchical_labels[idx] = 9
#     elif file_name.startswith('Lavni'):
#         kmeans_labels[idx] = 10
#         hierarchical_labels[idx] = 10

# # kmeans_labels now has the updated labels

In [None]:
from matplotlib.colors import ListedColormap

In [None]:

# Eleven hex colours wrapped in a seaborn palette.
# NOTE(review): presumably one colour per label 0-10 of the manual relabelling;
# the plotting cells visible here pass palette='viridis', so `custom_palette`
# looks unused — confirm before removing.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', 
          '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#ff9896']
custom_palette = sns.color_palette(colors)

In [None]:

# Project the clustering features onto two principal components for plotting.
pca_2d = PCA(n_components=2, random_state=42)
features_pca_2d = pca_2d.fit_transform(features_for_clustering)

# One panel per model; both panels share the same 2-D coordinates.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
panels = (
    (kmeans_labels, 'KMeans Clustering (2D PCA)'),
    (hierarchical_labels, 'Hierarchical Clustering (2D PCA)'),
)
for ax, (cluster_labels, panel_title) in zip(axes, panels):
    sns.scatterplot(
        x=features_pca_2d[:, 0],
        y=features_pca_2d[:, 1],
        hue=cluster_labels,
        palette='viridis',
        ax=ax,
    )
    ax.set_title(panel_title)
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')

plt.tight_layout()
plt.show()


In [None]:
# 2-D t-SNE embedding of the clustering features, coloured by each model's labels.
tsne_2d = TSNE(n_components=2, random_state=10)
features_tsne_2d = tsne_2d.fit_transform(features_for_clustering)

plt.figure(figsize=(18, 6))

# Left panel: KMeans labels.
plt.subplot(1, 2, 1)
sns.scatterplot(x=features_tsne_2d[:, 0], y=features_tsne_2d[:, 1], hue=kmeans_labels, palette='viridis')
plt.title('KMeans Clustering (2D t-SNE)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')

# Right panel: hierarchical labels.
plt.subplot(1, 2, 2)
sns.scatterplot(x=features_tsne_2d[:, 0], y=features_tsne_2d[:, 1], hue=hierarchical_labels, palette='viridis')
plt.title('Hierarchical Clustering (2D t-SNE)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')

# Finish the figure explicitly so the cell does not echo the repr of the last
# pyplot call, matching the other plotting cells.
plt.tight_layout()
plt.show()


I have a labels array `kmeans_labels` that holds the cluster label of each file in the `file_names` array. I now want to overwrite the (integer) labels in `kmeans_labels` for files whose names start with:

- 'Jana-Gana' → 6
- 'Michael-Jackson' → 7
- 'Asha-Bhosle' → 8
- 'Kishore-Kumar' → 8
- 'Bhavgeet' → 9
- 'Lavni' → 10

In [None]:

# 3-D t-SNE embedding of the clustering features, coloured by each model's labels.
tsne_3d = TSNE(n_components=3, random_state=42)
features_tsne_3d = tsne_3d.fit_transform(features_for_clustering)
x_coords, y_coords, z_coords = features_tsne_3d.T

fig = plt.figure(figsize=(18, 6))

# Slots 1 and 2 of a 1x3 grid are used; slot 3 belonged to the SVC-based
# panel, which is disabled.
ax1 = fig.add_subplot(1, 3, 1, projection='3d')
ax1.scatter(x_coords, y_coords, z_coords, c=kmeans_labels, cmap='viridis')
ax1.set_title('KMeans Clustering (3D t-SNE)')

ax2 = fig.add_subplot(1, 3, 2, projection='3d')
ax2.scatter(x_coords, y_coords, z_coords, c=hierarchical_labels, cmap='viridis')
ax2.set_title('Hierarchical Clustering (3D t-SNE)')

plt.show()