In [1]:
import os
import re
import sys

import numpy as np
import pandas as pd
from fcmeans import FCM
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn import cluster
from sklearn.cluster import AgglomerativeClustering, BisectingKMeans, KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn_extra.cluster import KMedoids

In [2]:
def preprocess(series: pd.Series, verbs: bool=False) -> pd.Series:
    """Normalise a column of free text into space-joined stemmed tokens.

    Steps, in order: lowercase, replace every non-alphabetical character
    with a space, split on whitespace, drop English stop words, optionally
    keep only verbs, Porter-stem what remains and join with single spaces.

    Parameters
    ----------
    series : pd.Series
        Text values to clean. Cells that are not strings (e.g. NaN for a
        missing description) become the empty string instead of raising.
    verbs : bool, default False
        When True, keep only words that NLTK tags as ``"VERB"`` in the
        universal tagset. Each word is tagged on its own (a one-word list is
        passed to ``pos_tag``), so no sentence context is used.

    Returns
    -------
    pd.Series
        Same index as the input, one cleaned string per row.

    Side effects
    ------------
    Prints which mode is active and displays the first five cleaned rows.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    def keep(word: str) -> bool:
        # Stop words are rejected first so the tagger is never invoked on them.
        if word in stop_words:
            return False
        return (not verbs) or pos_tag([word], tagset="universal")[0][1] == "VERB"

    def clean(text) -> str:
        # The .str accessors below pass missing values through untouched;
        # turn them into an empty document rather than failing on .split().
        if not isinstance(text, str):
            return ""
        return " ".join(stemmer.stem(word) for word in text.split() if keep(word))

    # Lowercase the string
    series = series.str.lower()

    # Replace all non-alphabetical characters with whitespace
    series = series.str.replace("[^A-Za-z]", repl=" ", regex=True)

    # Remove stop words (and, optionally, non-verbs) and stem
    print("verbs only" if verbs else "no verbs change")
    series = series.map(clean)
    display(series[:5])

    return series

In [3]:
FILENAME = "GOF Patterns (2.0).csv"

# `__file__` is normally not defined inside a notebook kernel, so a fresh
# "Restart & Run All" would stop here with a NameError. Fall back to the
# kernel's working directory in that case; when `__file__` does exist the
# resolved path is exactly what it was before.
try:
    _base_dir = os.path.dirname(__file__)
except NameError:
    _base_dir = os.getcwd()
file_path = os.path.join(_base_dir, f"../data/{FILENAME}")

# Load the pattern catalogue and drop the `correct_category` column in the
# same step, so `df` never exists in a half-prepared state.
df = pd.read_csv(file_path).drop(columns=["correct_category"])

# ADAPTER
# Free-text design problem; the heading above suggests the expected pattern
# is Adapter.
# NOTE(review): "te graphics" looks like a typo for "the graphics"; left as-is
# because the text is input data and editing it would change the tokens.
design_problem = "Design a drawing editor. A design is composed of te graphics (lines, rectangles and roses), positioned at precise positions. Each graphic form must be modeled by a class that provides a method draw(): void. A rose is a complex graphic designed by a black-box class component. This component performs this drawing in memory, and provides access through a method getRose(): int that returns the address of the drawing. It is probable that the system evolves in order to draw circles"

df

Unnamed: 0,name,overview
0,abstract_factory,Provide an interface for creating families of ...
1,builder,Separate the construction of a complex object ...
2,factory_method,Define an interface for creating an object but...
3,prototype,Specify the kinds of objects to create using a...
4,singleton,Ensure a class only has one instance and provi...
5,adapter,Convert the interface of a class into another ...
6,bridge,Decouple an abstraction from its implementatio...
7,composite,Compose objects into tree structures to repres...
8,decorator,Attach additional responsibilities to an objec...
9,facade,Provide a unified interface to a set of interf...


In [4]:
# A single vectorizer is fitted twice on purpose: after this cell it holds
# the vocabulary of the verbs-only corpus (the second fit replaces the first).
vectorizer = TfidfVectorizer()

# TF-IDF matrix over all stemmed, non-stop-word tokens of each overview.
# The positional argument is evaluated before `columns=`, so the feature
# names always belong to the fit that has just happened.
df_tfidf = pd.DataFrame.sparse.from_spmatrix(
    vectorizer.fit_transform(preprocess(df["overview"])),
    columns=vectorizer.get_feature_names_out(),
).sparse.to_dense()

# Same representation, restricted to tokens tagged as verbs.
df_tfidf2 = pd.DataFrame.sparse.from_spmatrix(
    vectorizer.fit_transform(preprocess(df["overview"], verbs=True)),
    columns=vectorizer.get_feature_names_out(),
).sparse.to_dense()

no verbs change


0    provid interfac creat famili relat depend obje...
1    separ construct complex object represent const...
2    defin interfac creat object let subclass decid...
3    specifi kind object creat use prototyp instanc...
4    ensur class one instanc provid global point ac...
Name: overview, dtype: object

verbs only


0    creat specifi consid instanti make defin creat...
1    might edit end add modifi perform repres speci...
2    creat let creat consid realiz draw manag requi...
3    use could custom ad may ad would select move m...
4    ensur dedic serv ensur make keep instanti make...
Name: overview, dtype: object

In [5]:
# KMeans cluster ids are arbitrary, so the reference grouping is written out
# under every possible assignment of the ids {0, 1, 2} to the three groups
# (3! = 6 labelings); the best-matching one is the one reported.
# Group sizes are 5, 7 and 11 rows (23 in total) -- presumably the
# creational / structural / behavioural patterns in file order; confirm
# against the CSV.
true_1 = [0]*5 + [1]*7 + [2]*11
true_2 = [0]*5 + [2]*7 + [1]*11
true_3 = [1]*5 + [0]*7 + [2]*11
true_4 = [1]*5 + [2]*7 + [0]*11
true_5 = [2]*5 + [0]*7 + [1]*11
true_6 = [2]*5 + [1]*7 + [0]*11

# Ordered so that position i corresponds to the name `true_{i + 1}`.
true_labelings = [true_1, true_2, true_3, true_4, true_5, true_6]


def report_best_fscore(title, labels, labelings):
    """Print a clustering's labels and its best micro-F1 over `labelings`.

    Parameters
    ----------
    title : str
        Banner line printed before the report.
    labels : array-like
        Predicted cluster id per row (anything with ``.tolist()``).
    labelings : list of list of int
        Candidate ground-truth labelings; element ``i`` is reported under
        the name ``true_{i + 1}``.

    Returns
    -------
    (list, float)
        The micro-averaged F1 score against every candidate, and the highest
        of those scores rounded to three decimals.
    """
    predicted = labels.tolist()

    print(title)
    print('Predicted labels:')
    display(predicted)

    scores = [f1_score(truth, predicted, average='micro') for truth in labelings]
    best_idx = int(np.argmax(scores))
    best = np.around(max(scores), 3)

    print('\nBest fscore is:', best, 'from true_' + str(best_idx + 1))
    # Index the list directly instead of looking the name up in globals().
    display(labelings[best_idx])

    return scores, best


km = KMeans(n_clusters=3,n_init='auto',random_state=0).fit(df_tfidf)
km_verbs = KMeans(n_clusters=3,n_init='auto',random_state=0).fit(df_tfidf2)

fscores, km_best = report_best_fscore(
    '===========KMEANS no change ===========', km.labels_, true_labelings
)

fscores_verbs, km_verbs_best = report_best_fscore(
    '===========KMEANS verbs only ===========', km_verbs.labels_, true_labelings
)

Predicted labels:


[2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 0, 2, 2, 0, 1, 1, 1, 2, 2, 2]


Best fscore is: 0.435 from true_2


[0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Predicted labels:


[1, 1, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 1, 0, 0, 1, 0, 1, 1, 2, 2]


Best fscore is: 0.435 from true_5


[2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]