In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Function to load the data from Excel
def load_data(file_path):
    """Load the 'ENGLISH' column of an Excel workbook as an array of strings.

    Args:
        file_path: Location of the workbook, passed unchanged to
            ``pandas.read_excel``.

    Returns:
        The values of the 'ENGLISH' column, one string per row, in row
        order. Empty cells are returned as empty strings.

    Raises:
        KeyError: If the workbook has no 'ENGLISH' column.
    """
    df = pd.read_excel(file_path)

    # Blank cells are read as NaN. Replace them with '' *before* converting to
    # str; otherwise astype(str) can turn them into the literal text 'nan',
    # which TF-IDF would then treat as a real word in the vocabulary.
    english_column = df['ENGLISH'].fillna('')
    return english_column.astype(str).values

# Function to convert text data to numerical features using TF-IDF
def vectorize_text(texts):
    """Turn a collection of text documents into a dense TF-IDF matrix.

    A fresh ``TfidfVectorizer`` with default settings is fitted on ``texts``
    and the resulting features are expanded with ``toarray()``.

    Args:
        texts: Iterable of documents (strings) to fit on and transform.

    Returns:
        Dense array of TF-IDF weights produced by ``fit_transform``.
    """
    tfidf = TfidfVectorizer()
    sparse_features = tfidf.fit_transform(texts)
    # Densify so callers receive a plain array rather than a sparse matrix.
    return sparse_features.toarray()

# Function to perform K-means clustering
def perform_kmeans_clustering(data, n_clusters=2):
    """Fit K-means on ``data`` and report the resulting partition.

    Args:
        data: Feature matrix to cluster, passed straight to ``KMeans.fit``.
        n_clusters: Number of clusters to form (default 2).

    Returns:
        A ``(labels, centers)`` tuple taken from the fitted model's
        ``labels_`` and ``cluster_centers_`` attributes.
    """
    # random_state is pinned to 0 so repeated runs use the same seed.
    model = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto')
    fitted = model.fit(data)
    return fitted.labels_, fitted.cluster_centers_

# Main function to execute the code
def main(file_path='Book1.xlsx', n_clusters=2):
    """Run the whole pipeline: load texts, vectorize, cluster, print results.

    Calling ``main()`` with no arguments reads 'Book1.xlsx' and forms two
    clusters; both settings can be overridden by the caller.

    Args:
        file_path: Excel workbook whose 'ENGLISH' column is clustered.
        n_clusters: Number of K-means clusters to form.
    """
    # Load the texts from the workbook
    english_texts = load_data(file_path)

    # Convert text to numerical features using TF-IDF
    x = vectorize_text(english_texts)

    # Perform K-means clustering
    kmeans_labels, kmeans_centers = perform_kmeans_clustering(
        x, n_clusters=n_clusters
    )

    # Print clustering results
    print("\nK-means Clustering Results:")
    print("Cluster Labels:", kmeans_labels)
    print("Cluster Centers:\n", kmeans_centers)

# Run the clustering pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



K-means Clustering Results:
Cluster Labels: [1 1 1 ... 1 1 1]
Cluster Centers:
 [[1.40946282e-18 1.62630326e-18 8.62450428e-04 ... 1.44198889e-17
  2.38524478e-18 2.52077005e-18]
 [3.79412550e-04 1.69384551e-04 5.48566003e-04 ... 1.20136209e-03
  4.50062213e-04 2.83569288e-04]]
