# Initialization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string

In [None]:
# List the course data folder to see which input files are available
# (presumably a mounted Google Drive path; verify in the runtime).
! ls 'drive/My Drive/Akademik/Tahun 4/KOM431'

cluster.csv  cluster.gsheet  data.csv  stopwords.txt


In [None]:
# Read the labelled abstracts; the file is decoded as ISO-8859-1 (Latin-1).
csv_path = 'drive/My Drive/Akademik/Tahun 4/KOM431/cluster.csv'
df = pd.read_csv(csv_path, encoding="ISO-8859-1")
df.head()

Unnamed: 0,ID,CLASS,ABSTRACT
0,D001,HSO,Multiple sequence alignment (MSA) is a central...
1,D002,HSO,Although the heuristic search algorithm A* is ...
2,D003,HSO,The increased demand for distributed computati...
3,D004,HSO,Declarative logic programs (LP) based on the w...
4,D005,HSO,Recent years have witnessed the success of has...


In [None]:
# Keep the label column on its own and show how many documents each class has.
class_list = df.CLASS
class_list.value_counts()

Machine Learning            49
Multiagent Systems          31
Other                       25
Knowledge                   22
HSO                         14
NLP                          8
Multidisciplinary Topics     1
Name: CLASS, dtype: int64

In [None]:
# Load the stop-word list. Judging by the printed raw text, the file holds
# comma-separated words wrapped in double quotes.
with open('drive/My Drive/Akademik/Tahun 4/KOM431/stopwords.txt', "r") as f:
  stopwords = f.read()

print(stopwords)
# Remove the quotes, split on commas, and trim every entry so that line
# breaks or irregular spacing in the file cannot leave whitespace glued to a
# word (a plain split(", ") keeps e.g. a trailing "\n" on the last word,
# which then never matches any token). Empty entries are discarded.
stopwords = re.sub(r"\"", "", stopwords)
stopwords = [word.strip() for word in stopwords.split(",")]
stopwords = [word for word in stopwords if word]
print(stopwords)

"a, about, above, across, after, again, against, all, almost, alone, along, already, also, although, always, am, among, an, and, another, any, anybody, anyone, anything, anywhere, are, area, areas, aren't, around, as, ask, asked, asking, asks, at, away, b, back, backed, backing, backs, be, became, because, become, becomes, been, before, began, behind, being, beings, below, best, better, between, big, both, but, by, c, came, can, cannot, can't, case, cases, certain, certainly, clear, clearly, come, could, couldn't, d, did, didn't, differ, different, differently, do, does, doesn't, doing, done, don't, down, downed, downing, downs, during, e, each, early, either, end, ended, ending, ends, enough, even, evenly, ever, every, everybody, everyone, everything, everywhere, f, face, faces, fact, facts, far, felt, few, find, finds, first, for, four, from, full, fully, further, furthered, furthering, furthers, g, gave, general, generally, get, gets, give, given, gives, go, going, good, goods, got,

# Preprocess Text

In [None]:
import re
import string

def preprocess_text(x, stop_words=None):
  """Normalise one abstract into lowercase, space-separated content words.

  Steps: lowercase, drop non-ASCII characters, remove stop words, replace
  punctuation and digit runs with spaces, remove stop words again, and join
  the remaining tokens with single spaces.

  Stop words are filtered twice on purpose:
    * before punctuation is stripped, so contractions such as "don't" still
      match as whole tokens;
    * after punctuation/digits are stripped, so words that were glued to
      punctuation ("the,", "(a)", "a*") are removed as well. Filtering only
      once, before stripping, lets those slip into the vocabulary.

  Args:
    x: the raw text (str).
    stop_words: optional iterable of lowercase stop words. When omitted, the
      module-level `stopwords` list is used.

  Returns:
    The cleaned text, with no leading/trailing whitespace.
  """
  if stop_words is None:
    stop_words = stopwords
  # Set membership keeps the per-token check O(1).
  stop_words = set(stop_words)

  x = x.lower()
  x = x.encode('ascii', 'ignore').decode()
  x = ' '.join(tok for tok in x.split() if tok not in stop_words)
  x = re.sub(r"[%s]" % re.escape(string.punctuation), ' ', x)
  x = re.sub(r"\d+", " ", x)
  # split()/join also collapses newlines and repeated spaces.
  x = ' '.join(tok for tok in x.split() if tok not in stop_words)
  return x

# Store the cleaned text next to the raw abstract and preview the result.
df['clean_abstract'] = df['ABSTRACT'].map(preprocess_text)
df.head()

Unnamed: 0,ID,CLASS,ABSTRACT,clean_abstract
0,D001,HSO,Multiple sequence alignment (MSA) is a central...,multiple sequence alignment msa central comput...
1,D002,HSO,Although the heuristic search algorithm A* is ...,heuristic search algorithm a well known optima...
2,D003,HSO,The increased demand for distributed computati...,increased demand distributed computations data...
3,D004,HSO,Declarative logic programs (LP) based on the w...,declarative logic programs lp based well found...
4,D005,HSO,Recent years have witnessed the success of has...,recent witnessed success hashing techniques ap...


In [None]:
# Class distribution of the documents.
df['CLASS'].value_counts()

Machine Learning            49
Multiagent Systems          31
Other                       25
Knowledge                   22
HSO                         14
NLP                          8
Multidisciplinary Topics     1
Name: CLASS, dtype: int64

# Create Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: one row per abstract, one column per
# vocabulary word, raw term counts as values.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['clean_abstract'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()
df_new = pd.DataFrame(X.toarray(), columns=feature_names)
print(df_new.shape)
df_new.head()

(150, 3307)


Unnamed: 0,aam,abduction,abilities,ability,able,abs,absent,absorb,abstract,abstraction,accelerated,accelerates,access,accommodate,accommodates,accomplish,accomplished,according,account,accounts,accumulate,accuracy,accurate,accurately,achievable,achieve,achieved,achievement,achieves,achieving,acquire,acquires,acquisition,action,actions,active,actively,activities,activity,actual,...,wfs,wggs,whenever,whereby,which,whole,whom,wide,widely,widespread,width,will,willingness,winner,winners,winning,wise,wishes,with,witnessed,word,wordnet,words,work,workers,workflow,workflows,world,worlds,worst,wr,wsat,xsb,years,yes,yield,yielding,yields,zilberstein,zilbersteins
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.metrics.cluster import contingency_matrix

def purity_score(y_true, y_pred):
    """Return the purity of a clustering against reference labels.

    Builds the contingency table of `y_true` versus `y_pred`, takes the
    largest count in each column, and divides the sum of those maxima by
    the total of all counts in the table.
    """
    counts = contingency_matrix(y_true, y_pred)
    dominant_per_column = counts.max(axis=0)
    return dominant_per_column.sum() / counts.sum()

from sklearn.preprocessing import LabelEncoder
# Encode the class names as integer labels; `le` keeps the fitted mapping.
le = LabelEncoder()
y_true = le.fit_transform(df.CLASS)

from sklearn.cluster import AgglomerativeClustering

In [None]:
import inspect

# scikit-learn renamed AgglomerativeClustering's `affinity` argument to
# `metric` (deprecated in 1.2, removed in 1.4), so pass whichever keyword the
# installed version accepts instead of hard-coding the old name.
_init_params = inspect.signature(AgglomerativeClustering.__init__).parameters
distance_kwarg = 'metric' if 'metric' in _init_params else 'affinity'

# Single linkage on cosine distance, cut into 7 clusters.
model = AgglomerativeClustering(n_clusters=7, linkage='single', **{distance_kwarg: 'cosine'})

y_pred_single = model.fit_predict(df_new.values)

print("Purity value of Single Linkage =", purity_score(y_true, y_pred_single))

# Rows are the encoded true classes, columns are the cluster ids.
pd.DataFrame(contingency_matrix(y_true, y_pred_single))

Purity value of Single Linkage = 0.36


Unnamed: 0,0,1,2,3,4,5,6
0,14,0,0,0,0,0,0
1,21,0,0,0,0,1,0
2,48,0,0,1,0,0,0
3,28,0,1,0,1,0,1
4,1,0,0,0,0,0,0
5,7,0,1,0,0,0,0
6,24,1,0,0,0,0,0


In [None]:
from scipy.optimize import linear_sum_assignment

table_single = pd.DataFrame(contingency_matrix(y_true, y_pred_single))
# Cluster ids are arbitrary, so cell (i, i) does not mean "class i predicted
# correctly". Pair every class with a distinct cluster so that the total
# number of matched documents is maximal (Hungarian algorithm; the table is
# negated because linear_sum_assignment minimises cost), then read the
# matched counts. This also works when the table is not exactly 7x7.
class_idx, cluster_idx = linear_sum_assignment(-table_single.values)
true = [table_single.iloc[r, c] for r, c in zip(class_idx, cluster_idx)]

print(np.array(true))
print(np.array(true).sum() / table_single.values.sum())

[14  0  0  0  0  0  0]
0.09333333333333334


In [None]:
import inspect

# scikit-learn renamed AgglomerativeClustering's `affinity` argument to
# `metric` (deprecated in 1.2, removed in 1.4), so pass whichever keyword the
# installed version accepts instead of hard-coding the old name.
_init_params = inspect.signature(AgglomerativeClustering.__init__).parameters
distance_kwarg = 'metric' if 'metric' in _init_params else 'affinity'

# Average linkage on cosine distance, cut into 7 clusters.
model = AgglomerativeClustering(n_clusters=7, linkage='average', **{distance_kwarg: 'cosine'})

y_pred_average = model.fit_predict(df_new.values)

print("Purity value of Average Linkage =", purity_score(y_true, y_pred_average))

# Rows are the encoded true classes, columns are the cluster ids.
pd.DataFrame(contingency_matrix(y_true, y_pred_average))

Purity value of Average Linkage = 0.4666666666666667


Unnamed: 0,0,1,2,3,4,5,6
0,0,2,12,0,0,0,0
1,3,4,13,1,1,0,0
2,1,2,46,0,0,0,0
3,4,16,9,0,1,0,1
4,0,0,1,0,0,0,0
5,0,0,6,0,1,0,1
6,0,1,22,0,1,1,0


In [None]:
from scipy.optimize import linear_sum_assignment

table_average = pd.DataFrame(contingency_matrix(y_true, y_pred_average))
# Cluster ids are arbitrary, so cell (i, i) does not mean "class i predicted
# correctly". Pair every class with a distinct cluster so that the total
# number of matched documents is maximal (Hungarian algorithm; the table is
# negated because linear_sum_assignment minimises cost), then read the
# matched counts. This also works when the table is not exactly 7x7.
class_idx, cluster_idx = linear_sum_assignment(-table_average.values)
true = [table_average.iloc[r, c] for r, c in zip(class_idx, cluster_idx)]

print(np.array(true))
print(np.array(true).sum() / table_average.values.sum())

[ 0  4 46  0  0  0  0]
0.3333333333333333
