In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import os
import pickle

from sqlalchemy import column

from gensim.models import Word2Vec
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import matplotlib.pyplot as plt

In [3]:
import boto3
from sagemaker import get_execution_role

# Role obtained from the SageMaker SDK; it is handed to the KMeans estimator further down.
role = get_execution_role()
# Bucket that holds the input dataset and receives the KMeans output artefacts.
bucket_name = 'deploy-sagemaker-conversation'


# Full S3 URL of the dataset. NOTE(review): not referenced again in the visible cells.
s3_url = 's3://deploy-sagemaker-conversation/floop_data_15k.json'
# Low-level S3 client, used below to download the dataset object.
conn = boto3.client('s3')
# Listing of the bucket's objects. NOTE(review): indexing ['Contents'] presumably raises
# KeyError when the bucket is empty, and `contents` is not used in the visible cells — confirm.
contents = conn.list_objects(Bucket = bucket_name)['Contents']

# Resource-level S3 handle, used at the end of the notebook to write the output object.
s3 = boto3.resource('s3')

In [4]:
import json

In [5]:
import io

# Download the raw JSON object from S3 into memory as bytes.
dataset = conn.get_object(Bucket = bucket_name, Key = 'floop_data_15k.json')['Body'].read()
# Hand pandas a file-like buffer rather than the raw payload: passing literal JSON
# content directly to read_json is deprecated in newer pandas releases, while a
# binary buffer is an accepted input across versions.
data = pd.read_json(io.BytesIO(dataset))

In [6]:
# Rename the frame's single column in place so later cells can refer to it as "Field1".
# NOTE(review): assumes the loaded JSON yields exactly one column — a one-element
# column list will not fit a frame with any other width.
data.columns= ["Field1"]

# Displayed as the cell output: (rows, columns) of the loaded dataset.
data.shape


(15617, 1)

In [7]:


#To remove special characters and punctuation from our dataset
from string import punctuation

punctuations = punctuation

# Deletion table built once: every character of `punctuations` maps to None,
# so str.translate drops them all in a single pass over the sentence.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations)

def solution(sentence):
    """Return ``sentence`` with every character of ``string.punctuation`` removed.

    Characters are deleted outright (not replaced by a space), so ``"don't"``
    becomes ``"dont"``. All other characters, including whitespace, are kept.

    Parameters
    ----------
    sentence : str
        Text to strip.

    Returns
    -------
    str
        The input without punctuation characters.
    """
    return sentence.translate(_PUNCTUATION_TABLE)

from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed by the tokenizer and the lemmatizer before use.
for resource in ('punkt', 'omw-1.4', 'wordnet'):
    nltk.download(resource)

# 1) Strip punctuation from every entry of the text column.
x = data["Field1"].apply(solution)

# 2) Replace anything that is not an ASCII letter or digit with a space.
pattern = "[^a-zA-Z0-9]"
x_cleaned = [re.sub(pattern, " ", doc) for doc in x]

# 3) Lower-case so that tokens differing only by case are merged.
x_lowered = [doc.lower() for doc in x_cleaned]

# 4) Split each document into word tokens.
x_tokenized = [nltk.word_tokenize(doc) for doc in x_lowered]

# 5) Lemmatize every token; the result is a list of token lists, one per document.
lemma = WordNetLemmatizer()
x_lemmatized = [[lemma.lemmatize(token) for token in tokens] for tokens in x_tokenized]

# Show the first processed document as a quick sanity check.
print(x_lemmatized[0])





# Train a Word2Vec embedding on the lemmatized corpus. The resulting word vectors
# are clustered into two groups further down (the "good or bad" classification).

w2v_model = Word2Vec(min_count=20,window=2,sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20,workers= 1 )

# Vocabulary pass: count the words and keep those that meet min_count.
t = time()
w2v_model.build_vocab(x_lemmatized, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# Training pass over the whole corpus for 30 epochs.
t = time()
w2v_model.train(x_lemmatized, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# L2-normalise the vectors in place. This replaces the deprecated
# `w2v_model.init_sims(replace=True)`, which in gensim 4.x only forwards to the
# KeyedVectors and emits deprecation warnings before doing the same normalisation.
w2v_model.wv.unit_normalize_all()

# Round-trip through disk so that `word_vectors` is exactly what the saved model holds.
w2v_model.save("word2vec11.model")

word_vectors = Word2Vec.load("word2vec11.model").wv

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO - 18:52:12: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2022-02-27T18:52:12.809042', 'gensim': '4.1.2', 'python': '3.6.13 | packaged by conda-forge | (default, Feb 19 2021, 05:36:01) \n[GCC 9.3.0]', 'platform': 'Linux-4.14.252-131.483.amzn1.x86_64-x86_64-with-glibc2.9', 'event': 'created'}
INFO - 18:52:12: collecting all words and their counts
INFO - 18:52:12: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:52:12: PROGRESS: at sentence #10000, processed 80287 words, keeping 8478 word types
INFO - 18:52:12: collected 11409 word types from 

['what', 'is', 'this', 'word']
Time to build vocab: 0.0 mins


INFO - 18:52:13: worker thread finished; awaiting finish of 0 more threads
INFO - 18:52:13: EPOCH - 1 : training on 130228 raw words (20640 effective words) took 0.1s, 191995 effective words/s
INFO - 18:52:13: worker thread finished; awaiting finish of 0 more threads
INFO - 18:52:13: EPOCH - 2 : training on 130228 raw words (20915 effective words) took 0.1s, 191762 effective words/s
INFO - 18:52:13: worker thread finished; awaiting finish of 0 more threads
INFO - 18:52:13: EPOCH - 3 : training on 130228 raw words (20958 effective words) took 0.1s, 197459 effective words/s
INFO - 18:52:13: worker thread finished; awaiting finish of 0 more threads
INFO - 18:52:13: EPOCH - 4 : training on 130228 raw words (20847 effective words) took 0.1s, 192512 effective words/s
INFO - 18:52:13: worker thread finished; awaiting finish of 0 more threads
INFO - 18:52:13: EPOCH - 5 : training on 130228 raw words (20782 effective words) took 0.1s, 193009 effective words/s
INFO - 18:52:13: worker thread fini

Time to train the model: 0.06 mins


In [8]:
from sagemaker import KMeans

In [9]:

num_clusters = 2

# Estimator settings gathered in one mapping so they can be read at a glance;
# training artefacts are written under the "kmeans_cluster" prefix of the bucket.
kmeans_settings = dict(
    role=role,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    output_path=f"s3://{bucket_name}/kmeans_cluster",
    k=num_clusters,
)
kmeans = KMeans(**kmeans_settings)

In [10]:
# Package the embedding matrix (cast to float32) as the record set the estimator trains on.
training_records = kmeans.record_set(word_vectors.vectors.astype('float32'))

# Launch the training job.
# NOTE(review): the SageMaker SDK documents Estimator.fit() as returning None, so
# `model` is probably not a fitted-model object — confirm before using it downstream.
model = kmeans.fit(training_records)

INFO - 18:52:17: Same images used for training and inference. Defaulting to image scope: inference.
INFO - 18:52:17: Ignoring unnecessary instance type: None.
INFO - 18:52:17: Defaulting to the only supported framework/algorithm version: latest.
INFO - 18:52:17: Ignoring unnecessary instance type: None.
INFO - 18:52:17: Same images used for training and inference. Defaulting to image scope: inference.
INFO - 18:52:17: Ignoring unnecessary instance type: None.
INFO - 18:52:17: Creating training-job with name: kmeans-2022-02-27-18-52-17-581


2022-02-27 18:52:17 Starting - Starting the training job...
2022-02-27 18:52:41 Starting - Launching requested ML instancesProfilerReport-1645987937: InProgress
......
2022-02-27 18:53:46 Starting - Preparing the instances for training.........
2022-02-27 18:55:17 Downloading - Downloading input data...
2022-02-27 18:55:48 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/27/2022 18:55:53 INFO 139773837301568] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true', '_disable_wait_to_

In [11]:
# Stand up a real-time endpoint for the trained estimator; `kmeans_deployed` is the
# predictor used below to assign clusters.
# NOTE(review): no cell in view deletes this endpoint — confirm it is cleaned up elsewhere.
kmeans_deployed = kmeans.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
)

INFO - 18:56:31: Same images used for training and inference. Defaulting to image scope: inference.
INFO - 18:56:31: Ignoring unnecessary instance type: None.
INFO - 18:56:31: Creating model with name: kmeans-2022-02-27-18-56-30-955
INFO - 18:56:31: Creating endpoint-config with name kmeans-2022-02-27-18-56-30-955
INFO - 18:56:31: Creating endpoint with name kmeans-2022-02-27-18-56-30-955


-----------!

## model.cluster_centers_

def cast_vector(row):
    """Return a NumPy array built from ``row`` with each element cast to double precision."""
    doubles = [value.astype('double') for value in row]
    return np.array(doubles)

def _predict_clusters(predictor, vectors, batch_size=500):
    """Query the KMeans endpoint in batches.

    Each record in the endpoint response carries two labels, ``closest_cluster``
    and ``distance_to_cluster``; both are unpacked here.

    Parameters
    ----------
    predictor : deployed KMeans predictor (``kmeans_deployed``).
    vectors : 2-D float32 array, one row per word.
    batch_size : rows sent per request, kept small to stay below the endpoint
        payload limit.

    Returns
    -------
    (clusters, distances) : two 1-D NumPy arrays aligned with ``vectors``.
    """
    clusters, distances = [], []
    for start in range(0, len(vectors), batch_size):
        for record in predictor.predict(vectors[start:start + batch_size]):
            clusters.append(int(record.label['closest_cluster'].float32_tensor.values[0]))
            distances.append(record.label['distance_to_cluster'].float32_tensor.values[0])
    return np.array(clusters, dtype=int), np.array(distances, dtype=float)


# One row per vocabulary word, with its embedding and the mean of its components.
words = pd.DataFrame(word_vectors.index_to_key)
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['vectorsmean'] = words.vectors.apply(lambda x: x.mean())
words['vectors_typed'] = words.vectors.apply(cast_vector)

# Cluster assignment and distance come from the endpoint in a few batched calls.
# `word_vectors.vectors` rows are in the same order as `index_to_key`, i.e. as `words`.
# (The previous code compared the raw response record with 0 and called
# `model.transform`, but `model` is the return value of `kmeans.fit`, not a fitted model.)
closest_cluster, distance_to_cluster = _predict_clusters(
    kmeans_deployed, word_vectors.vectors.astype('float32')
)
words['cluster'] = closest_cluster
# Cluster 0 is treated as the positive group (+1), every other cluster as negative (-1).
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]
# Closeness is the inverse distance to the nearest centroid: nearer words score higher.
words['closeness_score'] = 1 / distance_to_cluster
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

u_labels = np.unique(words['cluster'])

# For plotting: signed closeness score against mean embedding value, coloured by cluster sign.
colors = {1: 'black', -1: 'Red'}
plt.scatter(words['sentiment_coeff'] , words['vectorsmean'] , c=words['cluster_value'].map(colors))
plt.xlabel('sentiment_coeff')
plt.ylabel('vectorsmean')
plt.title('Word clusters')

plt.show()

# Remove the temporary model file; guarded so that re-running the cell does not fail.
if os.path.exists("word2vec11.model"):
    os.remove("word2vec11.model")

In [12]:
# Handle to the S3 object that will receive the clustering summary.
s3_output = s3.Object(bucket_name=bucket_name, key='kmeans_output.json')

In [13]:
# Serialise the run summary (currently only the cluster count) to a JSON string.
summary = dict(numberOfClusters=num_clusters)
json_output = json.dumps(summary)

In [14]:
# `json_output` is already a JSON document (a str produced by json.dumps), so it is
# encoded exactly once. Serialising it a second time would upload a quoted JSON
# *string literal* instead of an object, forcing consumers to decode twice.
s3_output.put(Body=json_output.encode('UTF-8'))

{'ResponseMetadata': {'RequestId': '01JB0MQXXASAJD17',
  'HostId': '2vzd5CudO3l+SOcm1BIJvtVx0+5abKOqD5LUg3In6DFA6F7xq0Ghny2SvlVUXgbdHLDvS14V5AA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '2vzd5CudO3l+SOcm1BIJvtVx0+5abKOqD5LUg3In6DFA6F7xq0Ghny2SvlVUXgbdHLDvS14V5AA=',
   'x-amz-request-id': '01JB0MQXXASAJD17',
   'date': 'Sun, 27 Feb 2022 19:02:03 GMT',
   'etag': '"8d5913eb78d427852588f4d5a6744e27"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"8d5913eb78d427852588f4d5a6744e27"'}