In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Function to load the data from Excel
def load_data(file_path):
    """Read the 'ENGLISH' column of an Excel workbook as an array of strings.

    Missing cells are dropped *before* the string conversion. Casting first
    would turn them into the literal tokens 'nan' / 'None', which would then
    be treated as real words by any downstream text vectorizer.

    Args:
        file_path: Location of the workbook; passed unchanged to
            ``pd.read_excel``.

    Returns:
        numpy.ndarray of ``str`` with one entry per non-missing 'ENGLISH'
        cell, in the original row order.

    Raises:
        KeyError: If the loaded sheet has no 'ENGLISH' column.
    """
    df = pd.read_excel(file_path)
    # dropna() must precede astype(str): once cast, NaN is just the text "nan".
    english_texts = df['ENGLISH'].dropna().astype(str).values
    return english_texts

# Function to convert text data to numerical features using TF-IDF
def vectorize_text(texts):
    """Turn a collection of documents into a dense TF-IDF feature matrix.

    A new ``TfidfVectorizer`` with default settings is fitted on ``texts``.
    The fitted vectorizer is not kept; only the feature matrix is returned.

    Args:
        texts: Iterable of raw text documents.

    Returns:
        The TF-IDF matrix produced by ``fit_transform``, converted to a
        dense array with ``toarray()``.
    """
    tfidf = TfidfVectorizer()
    sparse_features = tfidf.fit_transform(texts)
    # Densified on purpose; note that memory then scales with
    # (number of documents) x (vocabulary size).
    return sparse_features.toarray()

# Function to perform K-means clustering
def perform_kmeans_clustering(data, n_clusters=2):
    """Fit K-means on ``data`` and return the assignments and centroids.

    Args:
        data: Feature matrix to cluster.
        n_clusters: Number of clusters to form. Defaults to 2.

    Returns:
        Tuple ``(labels, centers)`` holding the fitted model's ``labels_``
        and ``cluster_centers_`` attributes.
    """
    # random_state is pinned so repeated runs produce the same partition.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(data)
    return model.labels_, model.cluster_centers_

# Main function to execute the code
def main(file_path='Book1.xlsx', n_clusters=2):
    """Cluster the workbook's English texts with K-means and print quality metrics.

    Pipeline: load texts -> TF-IDF features -> K-means -> silhouette,
    Calinski-Harabasz and Davies-Bouldin scores printed to stdout.

    Args:
        file_path: Workbook to read. Defaults to 'Book1.xlsx', the path that
            was previously hard-coded, so ``main()`` behaves as before.
        n_clusters: Number of K-means clusters. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        None. Results are reported via ``print`` only.
    """
    # Load the data
    english_texts = load_data(file_path)

    # Convert text to numerical features using TF-IDF
    x = vectorize_text(english_texts)

    # Perform K-means clustering; the centroids are not needed for the metrics
    kmeans_labels, _ = perform_kmeans_clustering(x, n_clusters=n_clusters)

    # Calculate clustering metrics
    silhouette_avg = silhouette_score(x, kmeans_labels)
    calinski_harabasz = calinski_harabasz_score(x, kmeans_labels)
    davies_bouldin = davies_bouldin_score(x, kmeans_labels)

    # Print clustering metrics
    print("\nK-means Clustering Metrics:")
    print(f"Silhouette Score: {silhouette_avg}")
    print(f"Calinski-Harabasz Score: {calinski_harabasz}")
    print(f"Davies-Bouldin Score: {davies_bouldin}")

# Entry point: run the full pipeline with its defaults when this code is
# executed directly (as it is when the notebook cell runs), not when imported.
if __name__ == "__main__":
    main()



K-means Clustering Metrics:
Silhouette Score: 0.006594347664685507
Calinski-Harabasz Score: 24.405015805544664
Davies-Bouldin Score: 9.267423933444435
