In [2]:
import numpy as np
import pandas as pd

# Read the data into a pandas DataFrame; every column is loaded as text
df = pd.read_excel('gof-problem-domain.xlsx', dtype='string')

# Merge different parts of the problem domain (intent, motivation, etc.)
# into a single space-separated document per row
series = df.agg(' '.join, axis=1).astype('string')

# Replace all non-alphabetical characters with whitespace
series = series.str.replace('[^A-Za-z]', repl=' ', regex=True)
# Replace all repeated whitespace with a single whitespace character.
# The pattern is a raw string so that `\s` reaches the regex engine untouched;
# in a plain string literal it is an invalid escape sequence that Python warns about.
series = series.str.replace(r'\s+', repl=' ', regex=True)
# Lowercase the input
series = series.str.lower()
# Remove leading and trailing whitespace
series = series.str.strip()

# Preview the first five cleaned documents
series[:5]

0    abstract factory provide an interface for crea...
1    builder separate the construction of a complex...
2    factory method define an interface for creatin...
3    prototype specify the kinds of objects to crea...
4    singleton ensure a class only has one instance...
dtype: string

In [7]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# A set gives fast membership checks while filtering tokens
stop_words = set(stopwords.words('english'))
# Snowball stemmer, chosen to replicate Dr. Hussain's stemming
stemmer = SnowballStemmer('english')


def _stem_document(text):
    """Drop stop words from `text` and join the stems of the remaining tokens."""
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(stemmer.stem(token) for token in kept_tokens)


# NOTE: this rebinds `series`, so running the cell twice stems already-stemmed text
series = series.apply(_stem_document).astype('string')

# Preview the first five stemmed documents
series[:5]

0    abstract factori provid interfac creat famili ...
1    builder separ construct complex object repr co...
2    factori method defin interfac creat object let...
3    prototyp specifi kind object creat use prototy...
4    singleton ensur class one instanc provid globa...
dtype: string

In [124]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary from `series` and build the TF-IDF matrix in one step
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(series)

# Show the first ten vocabulary terms, then the matrix shape
display(vectorizer.get_feature_names_out()[:10].tolist())
print(X.shape)

['abl',
 'abouttoopendocu',
 'abstract',
 'abstractlist',
 'accept',
 'access',
 'accommod',
 'accomplish',
 'accord',
 'account']

(23, 1080)


In [125]:
from sklearn.cluster import KMeans

# Cluster the rows of X into three groups; random_state is fixed so reruns
# give the same assignment
km = KMeans(n_clusters=3, n_init='auto', random_state=0)
km.fit(X)

# Cluster id assigned to each row, as a plain Python list
km.labels_.tolist()

[2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 0, 2, 2, 0, 1, 1, 1, 2, 2, 2]

In [126]:
from itertools import permutations

from sklearn.metrics import f1_score

# Sizes of the three consecutive row groups that share a reference label
# (presumably the three GoF pattern categories -- confirm against the spreadsheet).
GROUP_SIZES = (5, 7, 11)

# Cluster ids produced by k-means are arbitrary, so enumerate every way of
# assigning the ids 0, 1, 2 to the three groups. permutations() yields them in
# lexicographic order: (0,1,2), (0,2,1), (1,0,2), (1,2,0), (2,0,1), (2,1,0),
# which is the same order as the hand-written true_1 ... true_6 lists.
true_labelings = [
    [label for label, size in zip(perm, GROUP_SIZES) for _ in range(size)]
    for perm in permutations(range(len(GROUP_SIZES)))
]
# Keep the individual names bound for any other cell that refers to them.
true_1, true_2, true_3, true_4, true_5, true_6 = true_labelings

# Score the k-means assignment against every candidate labeling
predicted = km.labels_.tolist()
fscores = [f1_score(truth, predicted, average='micro') for truth in true_labelings]

# Report the best-matching labeling; indexing the list replaces the former
# globals() name lookup, and argmax is computed only once.
best_index = int(np.argmax(fscores))
km_best = np.around(fscores[best_index], 3)
print('Best fscore is:', km_best, 'from true_' + str(best_index + 1))
display(true_labelings[best_index])

Best fscore is: 0.435 from true_2


[0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]