In [1]:
import pandas as pd

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the text of every document stored in ``df['documents']``.

    Each cell of the ``documents`` column is expected to hold a list of
    strings (one string per document). Every document is lowercased,
    stripped of characters other than letters, digits and ``! . ?``,
    split into whitespace-separated tokens, filtered against the NLTK
    English stopword list and lemmatized token by token, then re-joined
    with single spaces.

    The number of documents per row is always preserved (a document that
    consists solely of stopwords becomes an empty string) so that cleaned
    documents stay positionally aligned with their raw counterparts.

    Args:
        df: Frame with a ``documents`` column of lists of strings. The
            column is overwritten in place.

    Returns:
        The same ``df`` object, with the ``documents`` column cleaned.
    """
    # A set gives O(1) membership tests; the list returned by NLTK is O(n).
    stop_words = frozenset(stopwords.words('english'))
    lmtzr = WordNetLemmatizer()

    def _clean_document(text: str) -> str:
        # Lowercase, then blank out everything except letters, digits and ! . ?
        text = re.sub(r'[^a-zA-Z0-9!.?]', ' ', text.lower())
        # str.split() with no argument also collapses runs of spaces.
        # Stopword removal and lemmatization must run per word: applied to
        # the whole document string they would never match anything.
        # Tokens that still carry trailing punctuation (e.g. "dogs.") are
        # compared and lemmatized as-is.
        tokens = [lmtzr.lemmatize(tok) for tok in text.split() if tok not in stop_words]
        return ' '.join(tokens)

    df['documents'] = df['documents'].apply(
        lambda docs: [_clean_document(doc) for doc in docs]
    )
    return df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def get_similar(clean_df: pd.DataFrame, df: pd.DataFrame, filename: str, num_clusters: int) -> pd.DataFrame:
    """Keep, for every row, only the documents of its dominant KMeans cluster.

    A TF-IDF vectorizer and a KMeans model are fitted on the distinct
    cleaned documents of all rows. Every row's cleaned documents are then
    assigned to clusters, and the raw documents belonging to the cluster
    that holds the most documents of that row are kept (ties go to the
    lowest cluster id).

    Args:
        clean_df: Frame whose ``documents`` column holds lists of cleaned
            document strings. A ``topic`` column with the predicted
            cluster labels is written onto it.
        df: Frame whose ``documents`` column holds the raw documents,
            aligned row-by-row and position-by-position with ``clean_df``.
        filename: Suffix of the output file name.
        num_clusters: Number of KMeans clusters.

    Returns:
        A deep copy of ``df`` whose ``documents`` column contains only the
        dominant-cluster documents, plus a ``num_documents`` column with
        their count. The same frame is written to
        ``tf_idf_method_1_{filename}`` in the current working directory.
    """
    tfidf = TfidfVectorizer()

    # Collect the distinct documents across all rows
    documents = set()
    for one_topic in clean_df['documents']:
        documents.update(one_topic)

    # A set of strings has no stable iteration order between interpreter
    # runs (hash randomisation); sorting makes the row order of the matrix,
    # and therefore the fitted clusters, reproducible.
    tfidf_matrix = tfidf.fit_transform(sorted(documents))

    # Apply KMeans clustering to the TF-IDF matrix
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(tfidf_matrix)

    def _predict_labels(docs):
        # An empty batch cannot be vectorised and predicted; a row without
        # documents simply has no labels.
        if len(docs) == 0:
            return []
        return kmeans.predict(tfidf.transform(docs))

    # Predict the cluster labels for the documents and store them in 'topic' column
    clean_df['topic'] = clean_df['documents'].apply(_predict_labels)

    groups = []
    similar_df = df.copy(deep=True)
    # Iterate positionally so the pairing does not depend on index labels
    for labels, raw_docs in zip(clean_df['topic'], df['documents']):
        # Group the raw documents by the cluster label of their cleaned version
        grouped_doc = {cluster: [] for cluster in range(num_clusters)}
        for pos, label in enumerate(labels):
            grouped_doc[int(label)].append(raw_docs[pos])
        # Keep the cluster with the most documents (first one wins on ties)
        max_topic = max(grouped_doc, key=lambda cluster: len(grouped_doc[cluster]))
        groups.append(grouped_doc[max_topic])

    # Replace 'documents' column with the grouped documents and count them
    similar_df['documents'] = groups
    similar_df['num_documents'] = similar_df['documents'].apply(len)

    # Save the DataFrame to a CSV file
    similar_df.to_csv(f'tf_idf_method_1_{filename}')

    return similar_df

In [None]:
import ast
import os

num_clusters = 3  # Number of clusters for KMeans

dir_path = '../../multi_news/'  # Directory path for the files
files = ['sample_train.csv', 'sample_validation.csv', 'sample_test.csv']  # List of files to process

for file in files:
    df = pd.read_csv(os.path.join(dir_path, file))  # Read the file into a DataFrame
    # The 'documents' column is stored as the text form of a Python list.
    # ast.literal_eval parses literals only, so unlike eval() it cannot
    # execute arbitrary code embedded in the CSV.
    df['documents'] = df['documents'].apply(ast.literal_eval)
    clean_df = clean(df.copy(deep=True))  # Clean a copy so the raw documents stay untouched
    get_similar(clean_df, df, file, num_clusters)  # Apply KMeans clustering and save the result