# 3.1 Clustering: TFIDF + SVD; KNN and DOC2VEC; KNN

In [1]:
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import *
from nltk.corpus import stopwords
import string
import pickle

In [3]:
with open ('../data/raw/train-claims.json') as f:
    train_claims = json.load(f)

In [4]:
with open ('../data/raw/dev-claims.json') as f:
    dev_claims = json.load(f)

In [5]:
definitely_related_evidence = set()
for claim in train_claims:
    for evidence in train_claims[claim]['evidences']:
        definitely_related_evidence.add(evidence)

definitely_related_evidence2 = set()
for claim in dev_claims:
    for evidence in dev_claims[claim]['evidences']:
        definitely_related_evidence2.add(evidence)

In [6]:
cluster_train = definitely_related_evidence.difference(definitely_related_evidence2)

In [7]:
cluster_dev = definitely_related_evidence2

In [8]:
len(definitely_related_evidence)

3121

In [9]:
len(definitely_related_evidence2)

463

In [10]:
len(cluster_train)

2980

In [11]:
len(cluster_dev)

463

In [12]:
cluster_full = definitely_related_evidence.union(definitely_related_evidence2)

In [13]:
len(cluster_full)

3443

In [14]:
with open ('../data/raw/evidence.json') as f:
    evidence = json.load(f)

In [15]:
evidence = list(evidence.items())
evidence.sort()

With punctuation removal, stopword removal, stemming/lemmatization, 

In [16]:
# remove punctuation

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tg.chenny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
ALL_PUNCT = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
stemmer = PorterStemmer()
STOP_WORDS = set(stopwords.words('english'))

In [19]:
def all_alpha(word):
    for char in word:
        if char not in string.ascii_lowercase:
            return False
    return True

In [20]:
def preprocess(text):
    text = text.lower()

    no_punct_text = str()
    for char in text:
        if char not in ALL_PUNCT:
            no_punct_text += char

    word_list = no_punct_text.split()

    no_stop_words_word_list = [word for word in word_list if word not in STOP_WORDS]
    # has_alpha_words_word_list = [word for word in no_stop_words_word_list if all_alpha(word)]
    
    # stemmed_word_list = [stemmer.stem(word) for word in has_alpha_words_word_list]
    stemmed_word_list = [stemmer.stem(word) for word in no_stop_words_word_list]

    output = ' '.join(stemmed_word_list)
    
    return output

In [21]:
EVIDENCE_ID_MAP = {x[1][0]:x[0] for x in enumerate(evidence)}

In [22]:
preprocessed_evidence = [preprocess(evidence[i][1]) for i in range(len(evidence))]

In [120]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_evidence)

In [24]:
with open('../data/curated/processed_tfidf_nonalpha.pickle', 'wb') as f:
    pickle.dump(X, f)

In [25]:
with open('../data/curated/processed_tfidf_nonalpha.pickle', 'rb') as f:
    X = pickle.load(f)

In [121]:
from sklearn.decomposition import TruncatedSVD

In [122]:
svd = TruncatedSVD(5)

In [123]:
X_svd = svd.fit_transform(X)

In [124]:
X_svd

array([[ 0.03371063, -0.01053843, -0.00277743, -0.00479351, -0.01454203],
       [ 0.10978817, -0.00981249,  0.01203098, -0.00577253, -0.03991667],
       [ 0.01526762, -0.00211903, -0.00316659,  0.00983776,  0.00164049],
       ...,
       [ 0.02228418, -0.00043714,  0.00054293,  0.00719538,  0.00101908],
       [ 0.02621328,  0.0158668 ,  0.02394255,  0.00499125, -0.01267239],
       [ 0.03723151,  0.00129035, -0.01057103,  0.01361405,  0.00491174]])

In [127]:
from sklearn import preprocessing

stdardiser = preprocessing.Normalizer()

X_svd_std = stdardiser.fit_transform(X_svd)

In [134]:
from sklearn.cluster import KMeans
K = 2
cluster = KMeans(n_clusters = K, random_state = 19260817, n_init = 'auto').fit(X_svd)

In [135]:
labels = []
for evid in cluster_full:

    x = [X_svd[EVIDENCE_ID_MAP[evid]]]

    labels.append(cluster.predict(x)[0])


In [136]:
for i in set(cluster.labels_):
    print(i, labels.count(i))

0 253
1 3190


In [137]:
all_labels = cluster.predict(X_svd)

In [138]:
for i in set(cluster.labels_):
    print(i, list(all_labels).count(i))

0 111796
1 1097031


In [145]:
X_svd_new = []

for i in range(len(all_labels)):
    if all_labels[i]:
        X_svd_new.append(X_svd[i])

for i in range(len(labels)):
    if labels[i] == 0:
        X_svd_new.append(X_svd[i])

In [147]:
K = 2
cluster = KMeans(n_clusters = K, random_state = 19260817, n_init = 'auto').fit(X_svd_new)

# Doc2Vec

In [52]:
!pip install gensim


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [61]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [65]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [71]:
evidence_tokenised = [evidence[i][1].split() for i in range(len(evidence))]

In [106]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(evidence_tokenised)]
model = Doc2Vec(documents, vector_size=2, window=2, min_count=1, workers=4)

In [107]:
X = list()

for i in range(len(evidence_tokenised)):
    X.append(model.infer_vector(evidence_tokenised[i]))

X

[array([0.05407193, 0.10634411], dtype=float32),
 array([-0.17542973,  0.29466367], dtype=float32),
 array([0.18979324, 0.03877317], dtype=float32),
 array([0.24666642, 0.07983188], dtype=float32),
 array([-0.255418  ,  0.07962706], dtype=float32),
 array([-0.36089084, -0.11943104], dtype=float32),
 array([-0.24730128,  0.22671086], dtype=float32),
 array([-0.23802646,  0.07177278], dtype=float32),
 array([0.14242095, 0.19038762], dtype=float32),
 array([ 0.00719903, -0.01282269], dtype=float32),
 array([ 0.08877952, -0.00926668], dtype=float32),
 array([0.0318253 , 0.31380787], dtype=float32),
 array([-0.08545659,  0.20410061], dtype=float32),
 array([-0.04202587,  0.2667783 ], dtype=float32),
 array([-0.37363526,  0.15474255], dtype=float32),
 array([-0.13503583,  0.14899532], dtype=float32),
 array([0.02178347, 0.3485028 ], dtype=float32),
 array([ 0.07828029, -0.02611786], dtype=float32),
 array([-0.16017516, -0.16866736], dtype=float32),
 array([0.25205112, 0.00077117], dtype=floa

In [110]:
from sklearn.cluster import KMeans
K = 2
cluster = KMeans(n_clusters = K, random_state = 19260817, n_init = 'auto').fit(X)

In [111]:
labels = []
for evid in cluster_full:

    x = [X[EVIDENCE_ID_MAP[evid]]]

    labels.append(cluster.predict(x)[0])


In [112]:
for i in set(cluster.labels_):
    print(i, labels.count(i))

0 517
1 2926


In [114]:
all_labels = cluster.predict(X)

In [115]:
for i in set(cluster.labels_):
    print(i, list(all_labels).count(i))

0 653419
1 555408
