In [1]:
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
import re

def clean(df: pd.DataFrame) -> pd.DataFrame:
    # convert to lowercase
    df['documents'] = df['documents'].apply(lambda x: [i.lower() for i in x])

    # remove special characters
    df['documents'] = df['documents'].apply(lambda x: [re.sub(r'[^a-zA-Z0-9!.?]', ' ', i) for i in x])

    # remove multiple spaces
    df['documents'] = df['documents'].apply(lambda x: [re.sub(r' +', ' ', i) for i in x])

    # remove stopwords
    stop_words = stopwords.words('english')
    df['documents'] = df['documents'].apply(lambda x: [i for i in x if i not in stop_words])

    # Lemmatize
    lmtzr = WordNetLemmatizer()
    df['documents'] = df['documents'].apply(lambda x: [lmtzr.lemmatize(i) for i in x])

    return df

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

def get_similar(clean_df: pd.DataFrame, df: pd.DataFrame, filename: str, num_clusters: int) -> pd.DataFrame:
    tf = CountVectorizer()

    clean_df['topic'] = clean_df['documents'].apply(lambda x: KMeans(n_clusters=num_clusters, random_state=0).fit(tf.fit_transform(x)).labels_)

    groups = []
    similar_df = df.copy(deep=True)
    for i in range(clean_df.shape[0]):
        grouped_doc = {i: [] for i in range(num_clusters)}
        for idx, j in enumerate(clean_df['topic'][i]):
            grouped_doc[j].append(df['documents'][i][idx])
        max_topic = max(grouped_doc, key=lambda x: len(grouped_doc[x]))
        groups.append(grouped_doc[max_topic])
    similar_df['documents'] = groups
    similar_df['num_documents'] = similar_df['documents'].apply(lambda x: len(x))

    similar_df.to_csv(f'tf_{filename}')

In [17]:
import os

num_clusters = 3

dir_path = '../../multi_news/'
files = ['sample_train.csv', 'sample_validation.csv', 'sample_test.csv']

for file in files:
    df = pd.read_csv(os.path.join(dir_path, file))
    df['documents'] = df['documents'].apply(lambda x: eval(x))
    clean_df = clean(df.copy(deep=True))
    get_similar(clean_df, df, file, num_clusters)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
