In [1]:
from IPython.display import display
import pandas as pd

# Let text cells show up to 300 characters before pandas truncates them.
pd.set_option('display.max_colwidth', 300)

# Read the workbook with every column typed as pandas' nullable string dtype.
df = pd.read_excel('gof-problem-domain.xlsx', dtype='string')

# Preview the first two rows.
df.head(2)

Unnamed: 0,name,intent,motivation,applicability
0,abstract factory,Provide an interface for creating families of related or dependent objects without specifying their concrete classes.,"Consider a user interface toolkit that supports multiple look-and-feel standards, such as Motif and Presentation Manager. Different look-and-feels define different appearances and behaviors for user interface “widgets” like scroll bars, windows, and buttons. To be portable across look-and-feel s...","Use the Abstract Factory pattern when • a system should be independent of how its products are created, composed, and represented. • a system should be configured with one of multiple families of products. • a family of related product objects is designed to be used together, and you need to ..."
1,builder,Separate the construction of a complex object from its representation so that the same construction process can create different representations.,"A reader for the RTF (Rich Text Format) document exchange format should be able to convert RTF to many text formats. The reader might convert RTF documents into plain ASCII text or into a text widget that can be edited interactively. The problem, however, is that the number of possible conversio...",Use the Builder pattern when • the algorithm for creating a complex object should be independent of the parts that make up the object and how they’re assembled. • the construction process must allow different representations for the object that’s constructed.


In [2]:
# Normalise the text in two steps: lower-case every column, then swap each
# non-word character (anything outside \w) for a single space.
# Apostrophes are replaced as well, so a contraction such as "they're" is
# split into the two tokens "they" and "re".
df = (
    df.apply(lambda column: column.str.lower())
    .replace(to_replace=r'[^\w]', value=' ', regex=True)
)

df.iloc[:2]

Unnamed: 0,name,intent,motivation,applicability
0,abstract factory,provide an interface for creating families of related or dependent objects without specifying their concrete classes,consider a user interface toolkit that supports multiple look and feel standards such as motif and presentation manager different look and feels define different appearances and behaviors for user interface widgets like scroll bars windows and buttons to be portable across look and feel s...,use the abstract factory pattern when a system should be independent of how its products are created composed and represented a system should be configured with one of multiple families of products a family of related product objects is designed to be used together and you need to ...
1,builder,separate the construction of a complex object from its representation so that the same construction process can create different representations,a reader for the rtf rich text format document exchange format should be able to convert rtf to many text formats the reader might convert rtf documents into plain ascii text or into a text widget that can be edited interactively the problem however is that the number of possible conversio...,use the builder pattern when the algorithm for creating a complex object should be independent of the parts that make up the object and how they re assembled the construction process must allow different representations for the object that s constructed


In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def _stem_without_stop_words(text):
    """Drop English stop words from ``text`` and Porter-stem what is left.

    ``text`` is split on whitespace; every token that is not in
    ``stop_words`` is stemmed, and the stems are re-joined with single
    spaces. Filtering and stemming happen in one pass over the tokens.
    """
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)


# Don't transform the name column.
# Series.map is applied column by column instead of DataFrame.applymap,
# which pandas deprecated in 2.1; this spelling works on old and new versions.
df.loc[:, 'intent':'applicability'] = \
    df.loc[:, 'intent':'applicability'].apply(lambda column: column.map(_stem_without_stop_words))

df.iloc[:2]

Unnamed: 0,name,intent,motivation,applicability
0,abstract factory,provid interfac creat famili relat depend object without specifi concret class,consid user interfac toolkit support multipl look feel standard motif present manag differ look feel defin differ appear behavior user interfac widget like scroll bar window button portabl across look feel standard applic hard code widget particular look feel instanti look feel specif class widg...,use abstract factori pattern system independ product creat compos repres system configur one multipl famili product famili relat product object design use togeth need enforc constraint want provid class librari product want reveal interfac implement
1,builder,separ construct complex object represent construct process creat differ represent,reader rtf rich text format document exchang format abl convert rtf mani text format reader might convert rtf document plain ascii text text widget edit interact problem howev number possibl convers open end easi add new convers without modifi reader solut configur rtfreader class textconvert ob...,use builder pattern algorithm creat complex object independ part make object assembl construct process must allow differ represent object construct


In [4]:
# Collect every distinct whitespace-separated token found in the three
# problem-domain columns. A set comprehension is used instead of calling
# applymap purely for its side effect, which built and discarded a frame of
# None values (and applymap is deprecated since pandas 2.1).
problem_text = df.loc[:, 'intent':'applicability']
unique = {
    word
    for column in problem_text      # iterating a DataFrame yields its column labels
    for text in problem_text[column]
    for word in text.split()
}

print('Vocabulary size is', len(unique))

Vocabulary size is 1101


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import sys

# Merge all the problem domain columns into one space-joined string per row
problem_domain = df.loc[:, 'intent':'applicability'].agg(' '.join, axis=1).tolist()

count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(problem_domain)

# Here's a quick demonstration of how the CountVectorizer works.
# - The first output is a truncated list of the words in the vocabulary, i.e., the features.
# - The second output is the truncated counts of those words in the problem domain for
#   Abstract Factory (row 0). Observe that 'abstract' is counted 5 times.
# - The third output is the manual count of the token 'abstract' in the same document,
#   which is indeed 5! The text is split into tokens first because str.count matches
#   substrings, so it would also count 'abstract' inside a longer word such as 'abstractlist'.
display(count_vectorizer.get_feature_names_out()[:10])
display(count_matrix.toarray()[0][:10])
display(problem_domain[0].split().count('abstract'))


array(['10', '100', '107', '88', '91', 'abl', 'abouttoopendocu',
       'abstract', 'abstractlist', 'accept'], dtype=object)

array([0, 0, 0, 0, 0, 0, 0, 5, 0, 0])

5

In [None]:
import sys

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit the TF-IDF model here: nothing earlier in the notebook defines
# tfidf_vectorizer or tfidf_matrix, so this cell (and the clustering cells that
# follow) raised NameError on a fresh kernel.
# NOTE(review): default TfidfVectorizer settings are assumed - confirm whether
# min_df / max_df / ngram_range were meant to be tuned.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(problem_domain)

terms = tfidf_vectorizer.get_feature_names_out()

# Print arrays in full instead of summarising them with "...".
np.set_printoptions(threshold=sys.maxsize)

display(terms[:100])

# Cosine distance between every pair of documents (1 - cosine similarity).
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:

from sklearn.cluster import KMeans

num_clusters = 3

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
import joblib

# Save the fitted model, then read it straight back so the following cells
# work with exactly what is stored on disk.
# To reuse a previously saved model instead of the one fitted above, comment
# out the joblib.dump line.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only load model files you created yourself.
model_path = 'doc_cluster1.pkl'

joblib.dump(km, model_path)

km = joblib.load(model_path)
clusters = km.labels_.tolist()

In [None]:
# Gather the pattern columns and the cluster assignment into one frame whose
# index is the cluster id, so rows can be looked up per cluster.
pattern_columns = ['name', 'intent', 'motivation', 'applicability']

films = {column: df[column].tolist() for column in pattern_columns}
films['cluster'] = clusters

frame = pd.DataFrame(films, index=[clusters], columns=pattern_columns + ['cluster'])

In [None]:
# Show how many rows fall into each cluster, then the full frame.
display(frame['cluster'].value_counts()) # number of patterns per cluster (num_clusters is 3, so ids run from 0 to 2)

frame

In [None]:
from __future__ import print_function

top_n = 6  # number of highest-weighted terms to list per cluster

print("Top terms per cluster:")
print()
# For every centroid, order the feature indices from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    # The vocabulary entries in `terms` are printed directly: the vocab_frame
    # lookup used before is never defined in this notebook, and calling
    # .encode() made Python 3 print bytes reprs such as b'word'.
    top_terms = [terms[ind] for ind in order_centroids[i, :top_n]]
    print("Cluster %d words: %s" % (i, ', '.join(top_terms)))
    print()  # blank line

    # Select rows through the 'cluster' column rather than by index label, so
    # the lookup does not depend on how the frame's index was built and also
    # works for a cluster with a single member.
    names = frame.loc[frame['cluster'] == i, 'name'].tolist()
    print("Cluster %d names: %s" % (i, ', '.join(map(str, names))))
    print()  # blank line

print()
print()

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# Use two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# Column 0 holds the x coordinates, column 1 the y coordinates.
xs, ys = pos[:, 0], pos[:, 1]

In [None]:
# Colour for each cluster id; ids are the positions in this list.
cluster_colors = dict(enumerate([
    '#1b9e77',
    '#d95f02',
    '#7570b3',
    '#e7298a',
    '#66a61e',
]))

# Legend label for each cluster id; ids are the positions in this list.
cluster_names = dict(enumerate([
    'graphical, request, interface, user, drawing, editor',
    'document, application, creating, subclasses, request, text',
    'interface, clients, instances, define, change, abstract',
]))

In [None]:

#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=df['title'].tolist())) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)