# Model 1

Source: 
1. https://github.com/aws/amazon-sagemaker-examples/blob/master/scientific_details_of_algorithms/ntm_topic_modeling/ntm_wikitext.ipynb
2. https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/ntm_synthetic/ntm_synthetic.html#Extensions

In [1]:
import os
import random
import string
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from datetime import datetime
import boto3
import dask.dataframe as dd
import tempfile



%matplotlib inline
# %matplotlib notebook


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.float_format', str)
matplotlib.rcParams['figure.figsize'] = (10, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

working_dir = '/home/ec2-user/SageMaker'
base_dir = '/home/ec2-user/SageMaker/topic_modelling/'
# s3_data_path = 's3://bucket-sushant/bangla-character-recognition/'

## For reproducible results
seed_value = 18
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
# https://stackoverflow.com/questions/5836335/consistently-create-same-random-numpy-array/5837352#5837352
random_state = np.random.RandomState(seed=seed_value)
s3 = boto3.client('s3')

def read_from_s3(file_path):
    bucket_name = file_path.split('/')[2]
    key = '/'.join(file_path.split('/')[3:])
    response = s3.get_object(Bucket=bucket_name, Key=key)
    body = response['Body'].read()
    return body

def read_pickle_from_s3(file_path):
    data = pickle.loads(read_from_s3(file_path))
    return data

# def read_csv_from_s3(file_path):
#     data = pd.read_csv(file_path, low_memory=False)
#     return data

def store_object_to_s3_as_pickle(data, file_path):
    bucket_name = file_path.split('/')[2]
    key = '/'.join(file_path.split('/')[3:])
#     # uses lot of memory
#     pickle_obj = pickle.dumps(data)
#     return s3.put_object(Key=key, Bucket=bucket_name, Body=pickle_obj)
    # using tmp file
    fd, path = tempfile.mkstemp()
    try:
        with open(path, 'wb') as pointer:
            pickle.dump(data, pointer)
        with open(path, "rb") as pointer:
            s3.upload_fileobj(pointer, bucket_name, key)
    finally:
        os.remove(path)

### Fetching Data Set

In [2]:
import os
import shutil


def check_create_dir(dir):
    if os.path.exists(dir):  # cleanup existing data folder
        shutil.rmtree(dir)
    os.mkdir(dir)
    
    
data_dir = f'{base_dir}/Data'
current_dir = os.getcwd()
print("Current directory: ", current_dir)

Current directory:  /home/ec2-user/SageMaker/topic_modelling/NTM/tam_data


In [None]:
data = pd.read_pickle(f'{data_dir}/topic_modeling_data.pkl')
print(data.shape)
data.head()

In [None]:
train = data.sample(frac=0.9, random_state=seed_value)
test = data.loc[~data.index.isin(train.index)]

train.shape, test.shape

In [None]:
test.to_pickle(f'{data_dir}/topic_modeling_data_test.pkl')
train.to_pickle(f'{data_dir}/topic_modeling_data_train.pkl')

In [3]:
test = pd.read_pickle(f'{data_dir}/topic_modeling_data_test.pkl')
train = pd.read_pickle(f'{data_dir}/topic_modeling_data_train.pkl')

### Preprocessing

In [4]:
train_doc_list = list(train['4_stop_words_removed'].values)
test_doc_list = list(test['4_stop_words_removed'].values)

In [5]:
type(train_doc_list), len(train_doc_list), type(test_doc_list), len(test_doc_list)

(list, 10889, list, 1210)

In [6]:
train_doc_list[1], test_doc_list[1], 

('stack arn aws cloudformation region wa trying update launch configuration created stack use volume ami amazon linux stack update failed error invalid valid volume type standard try launch instance using ami via ec2 console able missing',
 'hope great instance configured aws cli written script download file bucket run script manually terminal work working crontab')

In [7]:
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

print("Lemmatizing and counting, this may take a few minutes...")
start_time = time.time()
vectorizer = CountVectorizer(
    input="content",
    analyzer="word",
    stop_words="english",
)

train_vectors = vectorizer.fit_transform(train_doc_list)
test_vectors = vectorizer.transform(test_doc_list)

vocab_list = vectorizer.get_feature_names()
vocab_size = len(vocab_list)
print("vocab size:", vocab_size)
print("Done. Time elapsed: {:.2f}s".format(time.time() - start_time))

Lemmatizing and counting, this may take a few minutes...
vocab size: 10355
Done. Time elapsed: 0.52s


In [8]:
import scipy.sparse as sparse


def setting_sparse_dtypes(vectors):
    vectors = sparse.csr_matrix(vectors, dtype=np.float32)
    print(type(vectors), vectors.dtype)
    return vectors


train_vectors_sparse = setting_sparse_dtypes(train_vectors)
test_vectors_sparse = setting_sparse_dtypes(test_vectors)

<class 'scipy.sparse.csr.csr_matrix'> float32
<class 'scipy.sparse.csr.csr_matrix'> float32


In [None]:
def split_convert(sparray, prefix, fname_template="data_part{}.pbr", n_parts=2):
    import io
    import sagemaker.amazon.common as smac

    chunk_size = sparray.shape[0] // n_parts
    for i in range(n_parts):

        # Calculate start and end indices
        start = i * chunk_size
        end = (i + 1) * chunk_size
        if i + 1 == n_parts:
            end = sparray.shape[0]

        # Convert to record protobuf
        buf = io.BytesIO()
        smac.write_spmatrix_to_sparse_tensor(array=sparray[start:end], file=buf, labels=None)
        buf.seek(0)

        fname = os.path.join(prefix, fname_template.format(i))
        with open(fname, "wb") as f:
            f.write(buf.getvalue())
        print("Saved data to {}".format(fname))


train_data_dir = os.path.join(data_dir, "train")
test_data_dir = os.path.join(data_dir, "test")

check_create_dir(train_data_dir)
check_create_dir(test_data_dir)

split_convert(train_vectors_sparse, prefix=train_data_dir, fname_template="train_part{}.pbr", n_parts=4)
split_convert(test_vectors_sparse, prefix=test_data_dir, fname_template="test_part{}.pbr", n_parts=1)

### Save the vocabulary file
To make use of the auxiliary channel for vocabulary file, we first save the text file with the name `vocab.txt` in the auxiliary directory.

In [None]:
aux_data_dir = os.path.join(data_dir, "auxiliary")
check_create_dir(aux_data_dir)
with open(os.path.join(aux_data_dir, "vocab.txt"), "w", encoding="utf-8") as f:
    for item in vocab_list:
        f.write(item + "\n")

### Store Data on S3

In [9]:
import os
import sagemaker

role = sagemaker.get_execution_role()

bucket = 'bucket-sushant'  # <or insert your own bucket name>#
prefix = "topic_modelling/ntm/"

train_prefix = os.path.join(prefix, "train")
aux_prefix = os.path.join(prefix, "auxiliary")
test_prefix = os.path.join(prefix, "test")
output_prefix = os.path.join(prefix, "output")

s3_train_data = os.path.join("s3://", bucket, train_prefix)
s3_aux_data = os.path.join("s3://", bucket, aux_prefix)
s3_test_data = os.path.join("s3://", bucket, test_prefix)
output_path = os.path.join("s3://", bucket, output_prefix)
print("Training set location", s3_train_data)
print("Auxiliary data location", s3_aux_data)
print("Test data location", s3_test_data)
print("Trained model will be saved at", output_path)

Training set location s3://bucket-sushant/topic_modelling/ntm/train
Auxiliary data location s3://bucket-sushant/topic_modelling/ntm/auxiliary
Test data location s3://bucket-sushant/topic_modelling/ntm/test
Trained model will be saved at s3://bucket-sushant/topic_modelling/ntm/output


#### Upload the input directories to s3
We use the `aws` command line interface (CLI) to upload the various input channels. 

In [None]:
import subprocess

cmd_train = "aws s3 cp " + train_data_dir + " " + s3_train_data + " --recursive"
p = subprocess.Popen(cmd_train, shell=True, stdout=subprocess.PIPE)
p.communicate()

In [None]:
cmd_test = "aws s3 cp " + test_data_dir + " " + s3_test_data + " --recursive"
p = subprocess.Popen(cmd_test, shell=True, stdout=subprocess.PIPE)
p.communicate()

In [None]:
cmd_aux = "aws s3 cp " + aux_data_dir + " " + s3_aux_data + " --recursive"
p = subprocess.Popen(cmd_aux, shell=True, stdout=subprocess.PIPE)
p.communicate()

### Model Training

In [10]:
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker

sess = sagemaker.Session()

container = get_image_uri(boto3.Session().region_name, "ntm")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [None]:

ntm = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
    output_path=output_path,
    sagemaker_session=sess,
)

In [None]:
num_topics = 15
ntm.set_hyperparameters(
    num_topics=num_topics, feature_dim=vocab_size, mini_batch_size=60, epochs=50, sub_sample=0.8
)

In [None]:
from sagemaker.session import s3_input

s3_train = s3_input(
    s3_train_data, distribution="ShardedByS3Key", content_type="application/x-recordio-protobuf"
)
s3_test = s3_input(
    s3_test_data, distribution="FullyReplicated", content_type="application/x-recordio-protobuf"
)

s3_aux = s3_input(s3_aux_data, distribution="FullyReplicated", content_type="text/plain")

In [None]:
ntm.fit({"train": s3_train, "auxiliary": s3_aux, "test": s3_test})

In [None]:
print("Training job name: {}".format(ntm.latest_training_job.job_name))

## Inference

In [None]:
ntm_predictor = ntm.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer


ntm_predictor.serializer = CSVSerializer()
ntm_predictor.deserializer = JSONDeserializer()

In [None]:
inference_vectors = vectorizer.transform(test_doc_list).toarray()


type(inference_vectors), inference_vectors.shape

In [None]:
results = ntm_predictor.predict(inference_vectors[:10], initial_args={"ContentType": "text/csv"})
print(results)

In [None]:
def predict_batches(data, rows=10):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = []
    for array in split_array:
        results = ntm_predictor.predict(array, initial_args={"ContentType": "text/csv"})
        predictions += [r["topic_weights"] for r in results["predictions"]]
    return np.array(predictions)

In [None]:
results = predict_batches(inference_vectors)

In [None]:
results.shape

In [34]:
topics = {
    0: "reconnecting initiating connect sip subham chat update chime shilpa useable mouse apeksha christophe chirs imthian cloudwath neshon prompted chrish particularly".split(),
    1: "timestamp password login showing unable launch window type shammi created tried server getting rdp panchal launched launching capacity role access".split(),
    2: "initiating reconnecting reinitiating lost disconnected connect chat looping inititaing join dueto narrowing phone arajun vnet hardik vaishnavi fialed srishti bernardo".split(),
    3: "phone amazon initiating machince inititaing fluctuation balraj thrugh shashank looping tsill shinde urgenty waited renitiating ghole scheuled monetization loo bos".split(),
    4: "reconnecting reinitiating disconnected lost connect pfa resent bindushree mailing imthiyaaz imparied mate allen balaji hibernated cake saneesh conferencing manar monetization".split(),
    5: "filename sudo module apt init yum rpm mnt nitro ascii ena line detached contain repository systemctl directory content nvme mount".split(),
    6: "training toll international incoming respond documentation managed accept management follow pin sent cost return corner enquiry star center rated simultaneously".split(),
    7: "reconnecting disconnected chat initiating connect update reconnect reponse nexus umair elk subhankar sublimits got domian loo kenedy mubulay neelansh unidentified".split(),
    8: "underlying hardware inconvenience iop healthy burst failure graph health spike occur credit passing balance apologize experienced performance throughput cpuutilization metric".split(),
    9: "reconnecting initiating connect chat disconnected lost subham connection struck anybody vinod retrying monthend postpone comminucation effecting pandey cofirm scheuled aravind".split(),
    10: "helped click yes resolve let know issue heard continued wish mark close hour regarding open action want time note url".split(),
    11: "reachability failed status check reconnecting failing joshi passed dear himanshu initiating disconnected rdp asap showing chat reachable got health failure".split(),
    12: "initiating connect chat disconnected reconnecting verma janak inititaing tsill tring phone ssh update rayrao struck join suddnely looping fluctuation prodind".split(),
    13: "chat disconnected initiating connect chime connecting join ssh urgent connection equipment resilient troubleshoot lost log host network mbaosxy commercially session".split(),
    14: "resolved assistance contact ha heard continued wish case mark close regarding need hour action open web required note url want".split(),
}

In [None]:
test_doc_list[11]

In [None]:
np.argmax(results, axis=1)

In [None]:
results[0]

In [None]:
ntm_predictor.delete_model()
ntm_predictor.delete_endpoint()

# Inference using S3 stored model

In [11]:
role, sess, container

('arn:aws:iam::874163252636:role/service-role/AmazonSageMaker-ExecutionRole-20201201T202376',
 <sagemaker.session.Session at 0x7f54e2078978>,
 '382416733822.dkr.ecr.us-east-1.amazonaws.com/ntm:1')

In [21]:
from sagemaker.model import Model
from sagemaker.predictor import Predictor

ntm_model = Model(
    image_uri=container, 
    model_data="s3://bucket-sushant/topic_modelling/ntm/output/ntm-2021-09-01-16-29-04-809/output/model.tar.gz", 
    role=role,
    predictor_cls=Predictor,
    sagemaker_session=sess,
    name='01-NTM'
)

In [22]:
ntm_predictor = ntm_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

-------------!

In [23]:
ntm_predictor

<sagemaker.predictor.Predictor at 0x7f54df120320>

In [24]:
type(ntm_predictor)

sagemaker.predictor.Predictor

In [25]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer


ntm_predictor.serializer = CSVSerializer()
ntm_predictor.deserializer = JSONDeserializer()

In [26]:
inference_vectors = vectorizer.transform(test_doc_list).toarray()
type(inference_vectors), inference_vectors.shape

(numpy.ndarray, (1210, 10355))

In [27]:
def predict_batches(data, rows=10):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = []
    for array in split_array:
        results = ntm_predictor.predict(array, initial_args={"ContentType": "text/csv"})
        predictions += [r["topic_weights"] for r in results["predictions"]]
    return np.array(predictions)

In [28]:
results = ntm_predictor.predict(inference_vectors[:10], initial_args={"ContentType": "text/csv"})
print(results)

{'predictions': [{'topic_weights': [0.0508086979, 0.0571757555, 0.0497882068, 0.0476639606, 0.0494563468, 0.0513480119, 0.0459508561, 0.0514453873, 0.0502470955, 0.0506516621, 0.3130582571, 0.0478635244, 0.0512152128, 0.053954497, 0.0293724649]}, {'topic_weights': [0.0229765028, 0.0613116771, 0.0217093546, 0.0203811899, 0.0221266989, 0.5712631941, 0.1023188531, 0.0221275054, 0.0296478104, 0.0232598335, 0.017914284, 0.0204995833, 0.0236266125, 0.0252140276, 0.0156228626]}, {'topic_weights': [0.0368073247, 0.0264481902, 0.039422892, 0.0222580899, 0.0365636237, 0.0398879126, 0.122978501, 0.0389436148, 0.0406348184, 0.0390544683, 0.1432041377, 0.0350537673, 0.0420784578, 0.0417799428, 0.2948842347]}, {'topic_weights': [0.0348242261, 0.0275568552, 0.0307695512, 0.0225278474, 0.0342815742, 0.1440174431, 0.3426134288, 0.031165652, 0.1460194439, 0.0343183093, 0.0248820912, 0.0330546945, 0.0338792093, 0.0397889614, 0.0203007311]}, {'topic_weights': [0.0368073247, 0.0264481902, 0.039422892, 0.02

In [29]:
batch_results = predict_batches(inference_vectors)

In [30]:
batch_results

array([[0.0508087 , 0.05717576, 0.04978821, ..., 0.05121521, 0.0539545 ,
        0.02937246],
       [0.0229765 , 0.06131168, 0.02170935, ..., 0.02362661, 0.02521403,
        0.01562286],
       [0.03680732, 0.02644819, 0.03942289, ..., 0.04207846, 0.04177994,
        0.29488423],
       ...,
       [0.0410105 , 0.18014887, 0.03824858, ..., 0.04187911, 0.05683108,
        0.05252954],
       [0.04833374, 0.11762124, 0.04327942, ..., 0.04722911, 0.05552241,
        0.02699875],
       [0.06866349, 0.10166455, 0.06707709, ..., 0.06681987, 0.06991782,
        0.0562783 ]])

In [33]:
arg_max_results = np.argmax(batch_results, axis=1)
print(batch_results.shape, arg_max_results.shape)

(1210, 15) (1210,)


In [35]:
test.head()

Unnamed: 0,case_id,customer_name,service,case_billing_region,customer_billing_country_name,comm_owner_agent_login,comm_body,case_creation_cal_date,comm_date_utc,comm_subject,case_severity,urls,4_stop_words_removed
29,7621010241,Mylan,Elastic Compute Cloud (EC2 - Windows),APAC,INDIA,arizona,Please let us know if we helped resolve your i...,11/16/2020 0:00,11/17/2020 0:00,ASG Failed - IP limit,4,https://console.aws.amazon.com/support/feedbac...,please let know helped resolve issue yes click...
35,7414621131,Tata Communications Ltd.,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,tchemvur,"Hi, Hope you are doing great. Instance ID: i-...",9/23/2020 0:00,9/23/2020 0:00,AWS CLI not working from Crontab,2,,hope great instance configured aws cli written...
41,7682250211,Hotstar (Star TV India),Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,"Hello, We haven't heard back from you regardi...",12/2/2020 0:00,12/9/2020 0:00,About Amazon Corretto a OpenJDK distribution.,4,https://console.aws.amazon.com/support/home?#/...,hello heard back regarding case continued supp...
44,8302246421,ALL_DEPRECATED,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,anearin,Luke - Thank you for taking a few minutes to ...,5/5/2021 0:00,5/18/2021 0:00,"Additional Elastic IP Blocks for us-east-1, us...",4,,luke thank taking minute chat today discussed ...
46,8302246421,ALL_DEPRECATED,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,"Hello, We haven't heard back from you regardi...",5/5/2021 0:00,5/13/2021 0:00,"Additional Elastic IP Blocks for us-east-1, us...",4,https://console.aws.amazon.com/support/home?#/...,hello heard back regarding case continued supp...


In [36]:
test['predicted_topic_pos'] = arg_max_results
test['predicted_topics'] = test.predicted_topic_pos.apply(lambda x: topics[x])

In [37]:
test['predicted_topic_pos'] = arg_max_results
test['predicted_topics'] = test.predicted_topic_pos.apply(lambda x: topics[x])

In [38]:
test.head()

Unnamed: 0,case_id,customer_name,service,case_billing_region,customer_billing_country_name,comm_owner_agent_login,comm_body,case_creation_cal_date,comm_date_utc,comm_subject,case_severity,urls,4_stop_words_removed,predicted_topic_pos,predicted_topics
29,7621010241,Mylan,Elastic Compute Cloud (EC2 - Windows),APAC,INDIA,arizona,Please let us know if we helped resolve your i...,11/16/2020 0:00,11/17/2020 0:00,ASG Failed - IP limit,4,https://console.aws.amazon.com/support/feedbac...,please let know helped resolve issue yes click...,10,"[helped, click, yes, resolve, let, know, issue..."
35,7414621131,Tata Communications Ltd.,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,tchemvur,"Hi, Hope you are doing great. Instance ID: i-...",9/23/2020 0:00,9/23/2020 0:00,AWS CLI not working from Crontab,2,,hope great instance configured aws cli written...,5,"[filename, sudo, module, apt, init, yum, rpm, ..."
41,7682250211,Hotstar (Star TV India),Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,"Hello, We haven't heard back from you regardi...",12/2/2020 0:00,12/9/2020 0:00,About Amazon Corretto a OpenJDK distribution.,4,https://console.aws.amazon.com/support/home?#/...,hello heard back regarding case continued supp...,14,"[resolved, assistance, contact, ha, heard, con..."
44,8302246421,ALL_DEPRECATED,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,anearin,Luke - Thank you for taking a few minutes to ...,5/5/2021 0:00,5/18/2021 0:00,"Additional Elastic IP Blocks for us-east-1, us...",4,,luke thank taking minute chat today discussed ...,6,"[training, toll, international, incoming, resp..."
46,8302246421,ALL_DEPRECATED,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,"Hello, We haven't heard back from you regardi...",5/5/2021 0:00,5/13/2021 0:00,"Additional Elastic IP Blocks for us-east-1, us...",4,https://console.aws.amazon.com/support/home?#/...,hello heard back regarding case continued supp...,14,"[resolved, assistance, contact, ha, heard, con..."


In [39]:
test.to_csv(f'{data_dir}/test_predictions_1.csv')

In [40]:
ntm_predictor.delete_model()
ntm_predictor.delete_endpoint()