In [36]:
import pickle
import sys

import flair
import matplotlib
import nltk
import pyclustering
import pyclustering.cluster.optics
import scipy
import scipy.cluster
import sklearn
import sklearn.cluster

nltk.download('punkt')

# Identifier of the text corpus being analysed; every artefact file name is derived from it
# so that switching corpus only requires changing this one value.
CORPUS_ID = '1056'

TEXT_FILENAME = f'{CORPUS_ID}.txt'
EMBEDDINGS_FILENAME = f'{CORPUS_ID}.model'

def clustering_filename(radius, neighbors):
  """Return the file name used to store the clustering for one parameter pair.

  Args:
    radius: value interpolated after ``radius`` in the file name.
    neighbors: value interpolated after ``neighbors`` in the file name.

  Returns:
    A string of the form ``<CORPUS_ID>_radius<radius>_neighbors<neighbors>.optics``.
  """
  return f'{CORPUS_ID}_radius{radius}_neighbors{neighbors}.optics'

def save_clustering(radius, neighbors, clustering):
  """Pickle the clusters, noise and ordering reported by `clustering`.

  The three values are stored as a single tuple in the file named by
  clustering_filename(radius, neighbors).
  """
  target_path = clustering_filename(radius, neighbors)
  with open(target_path, 'wb') as sink:
    snapshot = (
      clustering.get_clusters(),
      clustering.get_noise(),
      clustering.get_ordering(),
    )
    pickle.dump(snapshot, sink)
  
def load_clustering(radius, neighbors):
  """Read back the (clusters, noise, ordering) triple stored for a parameter pair.

  The file named by clustering_filename(radius, neighbors) is unpickled, so this
  must only be pointed at files this notebook produced itself.
  """
  source_path = clustering_filename(radius, neighbors)
  with open(source_path, 'rb') as source:
    stored = pickle.load(source)

  clusters, noise, ordering = stored[0], stored[1], stored[2]
  return clusters, noise, ordering

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Read the raw text and split it into sentences with NLTK.
with open(TEXT_FILENAME, 'r') as text_file:
  corpus = text_file.read()
sentences = nltk.tokenize.sent_tokenize(corpus)

# One pooled document embedding per sentence, built from GloVe word vectors plus the
# forward and backward Flair embeddings.
glove_embedding = flair.embeddings.WordEmbeddings('glove')
flair_embedding_forward = flair.embeddings.FlairEmbeddings('news-forward')
flair_embedding_backward = flair.embeddings.FlairEmbeddings('news-backward')
document_embeddings = flair.embeddings.DocumentPoolEmbeddings(
  [glove_embedding, flair_embedding_backward, flair_embedding_forward]
)

count = len(sentences)
embeddings = []
for sentence in sentences:
  flair_sentence = flair.data.Sentence(sentence)
  document_embeddings.embed(flair_sentence)
  # NOTE(review): whiten() is called on one sentence vector at a time, so the scaling is
  # computed from that single vector rather than across all sentences - confirm this is intended.
  sentence_vector = flair_sentence.get_embedding()
  embeddings.append(scipy.cluster.vq.whiten(sentence_vector))

# Cache the embeddings so later cells can reload them instead of re-embedding the text.
with open(EMBEDDINGS_FILENAME, 'wb') as file:
  pickle.dump(embeddings, file)

2019-05-15 07:20:21,960 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpik2bq0um


100%|██████████| 160000128/160000128 [00:17<00:00, 9057088.16B/s]

2019-05-15 07:20:40,808 copying /tmp/tmpik2bq0um to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2019-05-15 07:20:41,086 removing temp file /tmp/tmpik2bq0um
2019-05-15 07:20:42,196 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmpw1pz9wn0


100%|██████████| 21494764/21494764 [00:03<00:00, 5478402.01B/s]

2019-05-15 07:20:47,263 copying /tmp/tmpw1pz9wn0 to cache at /root/.flair/embeddings/glove.gensim
2019-05-15 07:20:47,295 removing temp file /tmp/tmpw1pz9wn0
2019-05-15 07:20:47,297 this function is deprecated, use smart_open.open instead





2019-05-15 07:20:49,884 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmpz5w4lbcj


100%|██████████| 73034624/73034624 [00:09<00:00, 7576548.83B/s]

2019-05-15 07:21:00,694 copying /tmp/tmpz5w4lbcj to cache at /root/.flair/embeddings/news-forward-0.4.1.pt
2019-05-15 07:21:00,789 removing temp file /tmp/tmpz5w4lbcj





2019-05-15 07:21:09,010 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt not found in cache, downloading to /tmp/tmp_th6_a0g


100%|██████████| 73034575/73034575 [00:07<00:00, 9132416.69B/s] 

2019-05-15 07:21:18,235 copying /tmp/tmp_th6_a0g to cache at /root/.flair/embeddings/news-backward-0.4.1.pt





2019-05-15 07:21:18,338 removing temp file /tmp/tmp_th6_a0g


In [0]:
# Reload the cached sentence embeddings (pickle: only load files produced by this notebook).
with open(EMBEDDINGS_FILENAME, 'rb') as embeddings_file:
  embeddings = pickle.load(embeddings_file)

# Convert every stored vector to a plain Python list for the clustering input.
pyclustering_formatted_data = [vector.tolist() for vector in embeddings]

In [0]:
# OPTICS parameters for this run; they also select the file the result is saved to.
radius, neighbors = 0.5, 3

# Build the OPTICS instance over the sentence vectors and run the cluster analysis.
optics_instance = pyclustering.cluster.optics.optics(
  pyclustering_formatted_data, radius, neighbors
)
optics_instance.process()

# Persist clusters, noise and ordering so later cells can reload them.
save_clustering(radius, neighbors, optics_instance)

In [55]:
# Reload the clustering saved for the current (radius, neighbors) pair.
clusters, noise, ordering = load_clustering(radius, neighbors)

# First line: number of clusters; second line: number of points reported as noise.
print(len(clusters), len(noise), sep='\n')

8
9176


In [56]:
# One line per cluster: how many sentence indices it contains.
for cluster_size in map(len, clusters):
  print(cluster_size)

9
5
5
4
11
5
4
4


In [53]:
# Re-read and re-tokenize the text so this cell does not depend on the embedding cell having run.
with open(TEXT_FILENAME, 'r') as text_file:
  corpus = text_file.read()
sentences = nltk.tokenize.sent_tokenize(corpus)

# Cluster members are indices into `sentences`; print the text of every member, cluster by cluster.
for cluster_number, member_indices in enumerate(clusters):
  print(f'cluster {cluster_number}')
  for member_index in member_indices:
    print(sentences[member_index])
  print()

cluster 0
.
.
.
.
.
.
.
.
.

cluster 1
he demanded.
he demanded.
he demanded.
he demanded.
he demanded.

cluster 2
Martin shook his head.
Martin shook his head.
Martin shook his head.
Martin shook his head.
Martin shook his head.

cluster 3
she queried.
she queried.
she queried.
she queried.

cluster 4
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.

cluster 5
He shook his head.
He shook his head.
He shook his head.
He shook his head.
He shook his head.

cluster 6
Martin asked.
Martin asked.
Martin asked.
Martin asked.

cluster 7
She shook her head.
She shook her head.
She shook her head.
She shook her head.



In [54]:
# For every cluster member, print the sentence that immediately follows it in the text.
last_sentence_index = len(sentences) - 1
for i, cluster in enumerate(clusters):
  print(f'cluster {i}')
  for sentence_index in cluster:
    # A member that is the final sentence of the text has no successor;
    # skip it instead of raising IndexError.
    if sentence_index < last_sentence_index:
      print(sentences[sentence_index + 1])
  print()

cluster 0
.
"There is no use trying to see me," she said toward the last.
.
It was
soft because she had never used it to work with.
.
. "
I guess that's all."
.
yes, he had written other
books; well, he would go to the free library the first thing in the
morning and try to get hold of some of Swinburne's stuff.

cluster 1
Mrs. Higginbotham made no reply.
"It is."
"It came to me suddenly."
Or were they afraid of life, these writers and
editors and readers?
"You never laid eyes on me before."

cluster 2
He was oppressed by the utter squalidness of it
all.
"That night was the one night for
me.
"Why, the papers were full of it.
"Never mind.
"She's my lady friend," Jim explained, "and she's a peach.

cluster 3
"It ain't Bill at all," the other broke in.
"No, for less than a week's work.
"All of them--the whole kit and crew."
"That sounds wrong," he said slowly.

cluster 4
"How'd yeh know?"
"Is it a chill?
Martin shook his head.
She shook her head.
"I do!
"Yes.
"I had hoped and planned other

In [0]:
# Second run: same neighbor count, larger connectivity radius.
radius, neighbors = 10, 3

optics_instance = pyclustering.cluster.optics.optics(
  pyclustering_formatted_data,
  radius,
  neighbors,
)
# Run the cluster analysis.
optics_instance.process()

# Store the result under the file name derived from (radius, neighbors).
save_clustering(radius, neighbors, optics_instance)

In [60]:
# Load the stored result for the current parameters.
clusters, noise, ordering = load_clustering(radius, neighbors)

# Print the cluster count followed by the noise count, one per line.
for group in (clusters, noise):
  print(len(group))

10
9164


In [61]:
# Rebuild the sentence list from the text file; cluster members index into it.
with open(TEXT_FILENAME, 'r') as text_file:
  corpus = text_file.read()
sentences = nltk.tokenize.sent_tokenize(corpus)

# Header line, then each member sentence, then a blank separator line per cluster.
for position, members in enumerate(clusters):
  print(f'cluster {position}')
  for idx in members:
    member_text = sentences[idx]
    print(member_text)
  print()

cluster 0
.
.
.
.
.
.
.
.
.

cluster 1
he demanded.
he demanded.
he demanded.
he demanded.
he demanded.

cluster 2
Martin shook his head.
Martin shook his head.
Martin shook his head.
Martin shook his head.
Martin shook his head.

cluster 3
she queried.
she queried.
she queried.
she queried.

cluster 4
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.
he asked.

cluster 5
He shook his head.
He shook his head.
He shook his head.
He shook his head.
He shook his head.

cluster 6
Martin asked.
Martin asked.
Martin asked.
Martin asked.

cluster 7
She shook her head.
She shook her head.
She shook her head.
She shook her head.

cluster 8
1.E.3.
1.E.4.
1.E.6.
1.E.7.
1.E.8.
1.E.5.
1.E.9.

cluster 9
1.F.2.
1.F.3.
1.F.4.
1.F.6.
1.F.5.



In [0]:
# Third run: widen the connectivity radius again, neighbor count unchanged.
neighbors = 3
radius = 25

# Create the OPTICS instance for these parameters, then perform the cluster analysis.
optics_instance = pyclustering.cluster.optics.optics(
  pyclustering_formatted_data, radius, neighbors
)
optics_instance.process()

# Save clusters, noise and ordering for this parameter pair.
save_clustering(radius, neighbors, optics_instance)

In [63]:
# Fetch the saved clustering for the parameters set in the previous cell.
clusters, noise, ordering = load_clustering(radius, neighbors)

cluster_total = len(clusters)
noise_total = len(noise)
# Cluster count first, noise count second.
print(cluster_total)
print(noise_total)

24
5695


In [64]:
# Size of every cluster, one per line, in cluster order.
cluster_sizes = [len(members) for members in clusters]
for size in cluster_sizes:
  print(size)

3385
9
13
5
7
12
9
12
6
7
5
8
5
5
3
4
3
4
4
4
4
1
7
6


In [66]:
# Re-read and re-tokenize the text so sentence indices can be turned back into sentences.
with open(TEXT_FILENAME, 'r') as text_file:
  corpus = text_file.read()
sentences = nltk.tokenize.sent_tokenize(corpus)

# Show at most the first ten members of each cluster.
for cluster_number, members in enumerate(clusters):
  print(f'cluster {cluster_number}')
  preview = members[:10]
  for member_index in preview:
    print(sentences[member_index])
  print()

cluster 0
He wore rough clothes that smacked
of the sea, and he was manifestly out of place in the spacious hall in
which he found himself.
It was the
second time he had been out with her alone, and as they rode along
through the balmy warmth, just chilled by she sea-breeze to refreshing
coolness, he was profoundly impressed by the fact that it was a very
beautiful and well-ordered world and that it was good to be alive and to
love.
He had no second-best
suit that was presentable, and though he could go to the butcher and the
baker, and even on occasion to his sister's, it was beyond all daring to
dream of entering the Morse home so disreputably apparelled.
He was stirred profoundly by the passing glimpse at the
secret, and he was again caught up in the vision of sunlit spaces and
starry voids--until it came to him that it was very quiet, and he saw
Ruth regarding him with an amused expression and a smile in her eyes.
It seemed to him that he had intruded
upon the holy of holies, and s