In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the merged spreadsheet into a DataFrame.
excel_file = '/home/rudra/Documents/GitHub/Articulation-Meter/Project Files/merged_excel_file.xlsx'
df = pd.read_excel(excel_file)

# Preprocessing: every MFCCs cell is expected to be a bracketed,
# comma-separated string. Strip the surrounding brackets, split on commas
# and convert each piece to float so the column holds lists of floats.
df['MFCCs'] = df['MFCCs'].apply(
    lambda cell: list(map(float, cell.strip('[]').split(',')))
)

# Remove outliers using IQR method
def remove_outliers_iqr(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. Values equal
    to a fence are kept.

    Args:
        df: Frame to filter. It is not modified.
        column: Name of the column whose values decide which rows are kept.
        factor: Multiplier applied to the IQR when building the fences.
            The default of 1.5 reproduces the previously hard-coded value.

    Returns:
        A new, independent DataFrame containing only the rows within the
        fences, with their original index labels.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # NaN compares False against both fences, so rows with a missing value
    # in this column are dropped as well.
    in_range = (values >= lower_bound) & (values <= upper_bound)
    # .copy() detaches the result from the source frame so callers can add
    # columns to it without pandas raising SettingWithCopyWarning.
    return df[in_range].copy()

# Apply the IQR filter to the numerical columns one at a time; each pass
# works on the rows that survived the previous passes.
numerical_cols = [
    'duration',
    'likes',
    'views',
    'Energy',
    'Pitch',
    'Speech Rate Variation',
    'Articulation Rate',
    'Frequency',
]
for feature in numerical_cols:
    df = remove_outliers_iqr(df, column=feature)

# Feature scaling: standardize every column except the MFCC lists.
# NOTE(review): this assumes all remaining columns are numeric — confirm
# against the spreadsheet schema before relying on it.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.drop(columns=['MFCCs']))

# Clustering
# Choose the number of clusters (you can tune this)
n_clusters = 3
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (with a FutureWarning in between), so leaving it implicit makes the
# result depend on the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# assign() builds a new frame instead of writing a column into the filtered
# one, which avoids pandas' SettingWithCopyWarning.
df = df.assign(cluster=kmeans.fit_predict(df_scaled))

# Dimensionality reduction: project the scaled features onto two principal
# components, used below for the 2-D scatter plot.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)

# Visualization (optional): scatter of the two PCA components, coloured by
# the cluster label of each row.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    df_pca[:, 0],
    df_pca[:, 1],
    c=df['cluster'],
    cmap='viridis',
    marker='o',
    alpha=0.5,
)
ax.set_title('Clustering of Data')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

# Write the frame, including its cluster column, to a new workbook without
# the row index.
output_file = 'clustered_data.xlsx'
df.to_excel(output_file, index=False)
