## Inspecting and processing data manually

In [None]:
%%sh
pip -q install gensim nltk

In [None]:
import pandas as pd

In [None]:
# Number of headlines to read from the CSV; the same value is reused further
# down as the LDA `mini_batch_size` hyperparameter.
num_lines = 100000

# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is the replacement: malformed rows are dropped and a
# warning is emitted for each of them.
data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                   on_bad_lines='warn', dtype='str', nrows=num_lines)

# Shuffle the rows (sample 100% of them, in random order).
data = data.sample(frac=1)

In [None]:
data.head()

In [None]:
# Discard the publication date; the following cells only use 'headline_text'.
data = data.drop(columns=['publish_date'])

In [None]:
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
    
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

# Built once so process_text does not redo this work for every headline:
# a set gives constant-time stop-word membership tests, and a translation
# table strips every punctuation character in a single pass.
_stop_word_set = frozenset(stop_words)
_punctuation_table = str.maketrans('', '', string.punctuation)


def process_text(text):
    """Turn a raw headline into a list of normalised tokens.

    Punctuation and digit characters are removed, the text is lower-cased and
    split on whitespace, English stop words are dropped, and each remaining
    word is lemmatised with the WordNet lemmatizer.

    Args:
        text: the headline, as a string.

    Returns:
        List of lower-case, lemmatised tokens (empty if nothing survives).
    """
    text = text.translate(_punctuation_table)
    # str.isdigit() also matches non-ASCII digit characters, so this filter is
    # kept separate from the ASCII-only punctuation table.
    text = ''.join(c for c in text if not c.isdigit())
    tokens = text.lower().split()
    # Stop words are filtered before lemmatisation, on the raw token.
    return [wnl.lemmatize(w) for w in tokens if w not in _stop_word_set]

In [None]:
%%time
# Replace every raw headline with its list of cleaned tokens.
cleaned_headlines = data['headline_text'].apply(process_text)
data['headline_text'] = cleaned_headlines

In [None]:
data.head()

In [None]:
%%time

from gensim import corpora
# Build the gensim dictionary (token <-> integer id) from the tokenised
# headlines; it is used below for the vocabulary file and for doc2bow.
dictionary = corpora.Dictionary(data['headline_text'])

In [None]:
print(dictionary)

In [None]:
# Prune the vocabulary in place, keeping at most 512 tokens.
# NOTE(review): filter_extremes presumably also applies its own default
# frequency thresholds (no_below / no_above) — confirm against the gensim docs.
dictionary.filter_extremes(keep_n=512)
print(dictionary)

In [None]:
# Write the vocabulary to disk, one token per line, ordered by token id.
with open('vocab.txt', 'w') as vocab_file:
    vocab_file.writelines(
        dictionary.get(token_id) + '\n' for token_id in range(len(dictionary))
    )

In [None]:
%%time

# Convert each tokenised headline into a bag of words: a list of
# (token_id, count) pairs. Only one column is needed, so map over that column
# instead of building a row object per headline with DataFrame.apply(axis=1).
data['tokens'] = data['headline_text'].apply(dictionary.doc2bow)

In [None]:
# Only the bag-of-words 'tokens' column is used from here on.
data = data.drop(columns=['headline_text'])
data.head()

In [None]:
import io, boto3
import sagemaker
import sagemaker.amazon.common as smac
from scipy.sparse import lil_matrix

print(sagemaker.__version__)

# SageMaker session and its default bucket, used by the uploads below.
session = sagemaker.Session()
bucket = session.default_bucket()
# Key prefix under which this notebook stores its S3 objects.
prefix = 'headlines-lda-ntm'

In [None]:
def build_protobuf_dataset(data, dictionary):
    """Serialise the bag-of-words column of `data` into an in-memory buffer.

    Args:
        data: DataFrame with a 'tokens' column holding, for each row, an
            iterable of (token_id, token_count) pairs.
        dictionary: the vocabulary; its length gives the number of columns of
            the matrix.

    Returns:
        io.BytesIO holding whatever smac.write_spmatrix_to_sparse_tensor
        writes for the (rows x vocabulary size) float32 sparse matrix. The
        buffer is not rewound here; callers that read it must seek(0) first.
    """
    num_lines = data.shape[0]
    num_columns = len(dictionary)
    # Allocate as float32 directly instead of converting after the fact.
    token_matrix = lil_matrix((num_lines, num_columns), dtype='float32')
    # Row position (not the DataFrame index, which may be shuffled) selects
    # the matrix row.
    for line, tokens in enumerate(data['tokens']):
        for token_id, token_count in tokens:
            token_matrix[line, token_id] = token_count

    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
    return buf

In [None]:
def upload_protbuf_dataset(buf, bucket, prefix, key):
    """Upload an in-memory buffer to S3 and return its s3:// URI.

    Args:
        buf: seekable binary file-like object to upload.
        bucket: name of the destination S3 bucket.
        prefix: key prefix inside the bucket.
        key: object key, appended to `prefix` with a '/'.

    Returns:
        The 's3://<bucket>/<prefix>/<key>' URI of the uploaded object.
    """
    obj = '{}/{}'.format(prefix, key)
    # Rewind so the upload starts from the first byte of the buffer.
    buf.seek(0)
    # Upload the buffer passed by the caller, not a notebook-level global.
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)

In [None]:
%%time
# Serialise the training set in memory, upload it under
# <prefix>/training/training.protobuf and keep the resulting S3 URI.
training_buf = build_protobuf_dataset(data, dictionary)
s3_training_path = upload_protbuf_dataset(training_buf, bucket, prefix, 'training/training.protobuf')
print(s3_training_path)

In [None]:
# Upload the vocabulary file under <prefix>/input/auxiliary.
auxiliary_prefix = prefix + '/input/auxiliary'
s3_auxiliary_path = session.upload_data(path='vocab.txt', key_prefix=auxiliary_prefix)
print(s3_auxiliary_path)

## Training

In [None]:
# S3 location handed to the estimator below as its output_path.
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_output)

In [None]:
from sagemaker.image_uris import retrieve

# Look up the 'lda' algorithm container image for the session's region.
region = session.boto_session.region_name    
container = retrieve('lda', region)
print(container)

In [None]:
# Generic estimator wrapping the LDA container: one ml.c5.2xlarge instance,
# running under the notebook's execution role, writing output to s3_output.
lda = sagemaker.estimator.Estimator(
    container,
    sagemaker.get_execution_role(),
    instance_count=1, 
    instance_type='ml.c5.2xlarge',
    output_path=s3_output)

In [None]:
# feature_dim is the vocabulary size, i.e. the number of columns of the
# training matrix built by build_protobuf_dataset.
# mini_batch_size is the number of rows requested when loading the CSV.
# NOTE(review): if fewer rows were actually loaded (short file, skipped bad
# lines) this no longer equals the real document count — confirm whether the
# algorithm requires an exact match.
lda.set_hyperparameters(
    num_topics=10, 
    feature_dim=len(dictionary), 
    mini_batch_size=num_lines,
    alpha0=0.1)

In [None]:
# Fit the estimator, using the uploaded protobuf file as the 'train' input.
lda.fit(inputs={'train': s3_training_path})

In [None]:
# Deploy the fitted estimator on one ml.t2.large instance; the returned
# predictor is used for the predictions below.
lda_predictor = lda.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.large')

In [None]:
import numpy as np

def process_samples(samples, dictionary):
    """Convert raw headlines into a dense bag-of-words matrix.

    Each sample is cleaned with process_text and turned into
    (token_id, count) pairs with dictionary.doc2bow; the counts are written
    into the row for that sample.

    Args:
        samples: sequence of headline strings.
        dictionary: the vocabulary, providing doc2bow and the column count.

    Returns:
        float32 numpy array of shape (len(samples), len(dictionary)); cells
        for tokens absent from a sample stay at 0.
    """
    sample_matrix = np.zeros((len(samples), len(dictionary)), dtype='float32')
    for line, sample in enumerate(samples):
        for token_id, token_count in dictionary.doc2bow(process_text(sample)):
            sample_matrix[line, token_id] = token_count
    return sample_matrix

In [None]:
# Run this cell to try your own samples

# Raw headline strings; process_samples cleans and vectorises them before
# they are sent to the endpoint.
samples = [
    "Major tariffs expected to end Australian barley trade to China",
    "US woman wanted over fatal crash asks for release after coronavirus halts extradition",
    "Fifty trains out of service as fault forces Adelaide passengers to 'pack like sardines",
    "Germany's Bundesliga plans its return from lockdown as football world watches",
    "All AFL players to face COVID-19 testing before training resumes"
]

In [None]:
# Run this cell to load 5 random samples from the dataset
import numpy as np

# NOTE: this rebinds `data` to the raw, unprocessed dataset (all rows).
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is the replacement: malformed rows are dropped and a
# warning is emitted for each of them.
data = pd.read_csv('abcnews-date-text.csv.gz', compression='gzip',
                   on_bad_lines='warn', dtype='str')
# Draw 5 random headlines directly instead of shuffling every row and slicing.
samples = data.sample(n=5)
samples = np.array(samples.headline_text)
print(samples)

In [None]:
# Serialise requests as CSV, then send the bag-of-words matrix built from the
# samples to the endpoint.
lda_predictor.serializer = sagemaker.serializers.CSVSerializer()
response = lda_predictor.predict(process_samples(samples, dictionary))
print(response)

In [None]:
import json

# Decode the JSON response and collect one topic-mixture vector per sample.
response = json.loads(response)
vectors = [
    prediction['topic_mixture'] for prediction in response['predictions']
]

In [None]:
# Report the highest-weighted topic of each sample together with its weight.
for mixture in vectors:
    best = np.argmax(mixture)
    print("topic %s, %2.2f" % (best, mixture[best]))

In [None]:
# Tear down the endpoint created by lda.deploy above.
lda_predictor.delete_endpoint()