In [48]:
# Import libraries.
import os
!pip install boto3
import pickle
from tqdm import tqdm

import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')
import joblib
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score

import json
import string
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

from helpers import read_data, process_text, prepare_data, extract_features

# import sagemaker
# from sagemaker import get_execution_role
# from sagemaker.amazon.amazon_estimator import get_image_uri
# from sagemaker.predictor import csv_serializer

# Set global variables
RANDOM_STATE = 5
DIR = 'data/'

You should consider upgrading via the '/Users/aleemullahkhan/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleemullahkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleemullahkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
# Read the dataset through the project helper, passing the data directory and the seed.
# The recorded output of this cell is a FileNotFoundError for 'data/True.csv', so that
# file must be present under DIR before the notebook can run.
df = read_data(DIR, RANDOM_STATE)

# Show first rows.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/True.csv'

In [None]:
!pygmentize helpers.py

In [None]:
# Join title and text into one column, then drop the columns that are no longer needed.
# The guard keeps this cell idempotent: once 'title' has been dropped, re-running the
# cell would otherwise fail on the missing column.
if 'title' in df.columns:
    df['text'] = df['title'] + " " + df['text']
    # `columns=` already names the axis, so no `axis=` argument is required; reassigning
    # instead of using inplace=True keeps the lineage of `df` explicit.
    df = df.drop(columns=['subject', 'date', 'title'])

# Show the first rows.
display(df.head())

# Show an example of text.
df.text[3]

In [None]:
# Split data to train and test datasets.
# 20% of the rows are held out for testing; the fixed RANDOM_STATE makes the shuffle
# reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(df.text, df.label, test_size=0.2,
                                                    random_state=RANDOM_STATE)

# Report the size of each partition.
print("Fake and True News (combined): train = {}, test = {}".format(len(train_X), len(test_X)))

In [None]:
# Show one training example and its label.
# Positional access is required here: after the shuffled split the Series keep their
# original index labels, so the label 3 is not guaranteed to be in the training
# partition and `train_X[3]` can raise KeyError.
print(train_X.iloc[3])
print(train_y.iloc[3])

In [None]:
# Apply process_text to an example.
# Use positional access: the index label 3 may have ended up in the test partition
# after the shuffled split, which would make `train_X[3]` raise KeyError.
process_text(train_X.iloc[3])

In [None]:
# Create a new 'tqdm' instance to time and estimate the progress of functions.
# Registering tqdm with pandas is what makes `Series.progress_apply` available to the
# cells that call it later in this notebook.
tqdm.pandas()

# Ensure directory exists (no error if it is already there).
os.makedirs(DIR, exist_ok=True)

# Preprocess data with the project helper; the four splits are rebound to its results.
# NOTE(review): `cache_dir=DIR` suggests the helper caches its output under DIR —
# confirm in helpers.py.
train_X, test_X, train_y, test_y = prepare_data(train_X, test_X, train_y, test_y, cache_dir=DIR)

In [None]:
# Use this cell to examine one of the processed news articles to make sure everything is
# working as intended: print the item at key/position 5 of train_X and its length.
# NOTE(review): the container type returned by prepare_data is not visible here, so it is
# unclear whether `[5]` is a positional or a label lookup — confirm.
print(train_X[5])
print(len(train_X[5]))

In [None]:
# Print the 20 entries of `vocabulary` with the largest values, in descending order.
# NOTE(review): `vocabulary` is not defined in any visible cell of this notebook, so this
# cell raises NameError on a fresh kernel. It presumably came from a removed cell that
# called the imported `extract_features` helper — confirm and restore that cell.
for key in sorted(vocabulary, key=vocabulary.get, reverse=True)[:20]:
    print(key, ':', vocabulary[key])

In [None]:
# Build and fit the model: a multinomial Naive Bayes classifier with default settings.
# NOTE(review): MultinomialNB needs numeric feature matrices; no visible cell turns the
# text in train_X into features (the imported `extract_features` is never called) —
# confirm what train_X holds at this point.
model = MultinomialNB()
model.fit(train_X, train_y)

In [None]:
# Predict on the held-out set and keep the result in `predictions` (nothing is written
# to disk here).
predictions = model.predict(test_X)

# Text summaries first, then a plotted confusion matrix.
print(confusion_matrix(test_y, predictions))
print(classification_report(test_y, predictions))
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2 (ConfusionMatrixDisplay.from_estimator replaces it) — confirm the pinned version.
plot_confusion_matrix(model, test_X, test_y, cmap='Blues')

In [None]:
def _to_blazingtext_frame(frame):
    """Return a new frame laid out for BlazingText supervised training.

    The result has exactly two columns, in this order:
      * ``label`` - the original label cast to ``str`` and prefixed with ``__label__``.
      * ``text``  - the original text passed through ``process_text``.

    Building the result with ``assign`` on a fresh column selection avoids writing into
    a slice of another frame (the source of pandas' SettingWithCopyWarning) and replaces
    three copy-pasted transformations with a single definition.

    Requires ``tqdm.pandas()`` to have been called so that ``progress_apply`` exists.
    """
    return frame[['label', 'text']].assign(
        label='__label__' + frame.label.astype('str'),
        text=frame.text.progress_apply(process_text),
    )


# Split data to train, validation, and test datasets.
# The validation set is carved out of the training portion (20% of the remaining 80%).
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=RANDOM_STATE)

# Put label in first column, add the __label__ prefix, and clean/normalize the text.
df_train = _to_blazingtext_frame(df_train)
df_valid = _to_blazingtext_frame(df_valid)
df_test = _to_blazingtext_frame(df_test)

# Show dfs.
display(df_train.head())
display(df_valid.head())
display(df_test.head())

In [None]:
# Write each split into DIR as a space-separated file with neither header nor index.
for file_name, frame in (('news.train', df_train),
                         ('news.valid', df_valid),
                         ('news.test', df_test)):
    frame.to_csv(os.path.join(DIR, file_name), sep=' ', header=False, index=False)

In [None]:
# `Session`, `estimator` and `s3_input` used from here on belong to the SageMaker Python
# SDK; `transformers.sagemaker` is an unrelated module that does not provide them.
import sagemaker

# Store the current SageMaker session.
session = sagemaker.Session()

# Store the bucket (the session's default S3 bucket).
bucket = session.default_bucket()

# S3 prefix (which folder will we use).
prefix = 'fake-news-bt'

# Upload the processed test, train and validation files,
# which are contained in data directory to S3 using session.upload_data().
# Each call returns the S3 location of the uploaded object.
test_location = session.upload_data(os.path.join(DIR, 'news.test'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(DIR, 'news.valid'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(DIR, 'news.train'), key_prefix=prefix)

In [None]:
# The import is done here because the top-of-notebook import of this name is commented
# out, which left `get_execution_role` undefined.
from sagemaker import get_execution_role

# Our current execution role is required when creating the model as the training
# and inference code will need to access the model artifacts.
role = get_execution_role()

In [None]:
# The import is done here because the top-of-notebook import of this name is commented
# out, which left `get_image_uri` undefined.
from sagemaker.amazon.amazon_estimator import get_image_uri

# We need to retrieve the location of the container, which is provided by Amazon for using
# BlazingText in the session's region.
# As a matter of convenience, the training and inference code both use the same container.
container = get_image_uri(session.boto_region_name, 'blazingtext', 'latest')

In [None]:
# Settings of the training job itself: hardware, storage, time limit, input mode and the
# S3 location that receives the output.
estimator_settings = dict(
    train_instance_count=1,                # number of compute instances
    train_instance_type='ml.c4.4xlarge',   # kind of compute instance
    train_volume_size=30,
    train_max_run=360000,
    input_mode='File',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=session,
)

# Create the SageMaker estimator from the container image and the IAM role.
bt_model = sagemaker.estimator.Estimator(container, role, **estimator_settings)

# Algorithm-specific parameters for BlazingText.
blazingtext_hyperparameters = dict(
    mode="supervised",
    epochs=10,
    min_count=2,
    learning_rate=0.05,
    vector_dim=10,
    early_stopping=True,
    patience=4,
    min_epochs=5,
    word_ngrams=3,
)
bt_model.set_hyperparameters(**blazingtext_hyperparameters)

In [None]:
# Describe the uploaded train and validation files as input channels; both channels
# share the same distribution and content type.
channel_options = dict(distribution='FullyReplicated', content_type='text/plain')

s3_input_train = sagemaker.s3_input(s3_data=train_location, **channel_options)
s3_input_validation = sagemaker.s3_input(s3_data=val_location, **channel_options)

In [None]:
# Start training, feeding the estimator the 'train' and 'validation' input channels.
bt_model.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [None]:
# Deploy the trained model on one ml.m4.xlarge instance; the returned predictor is the
# handle the following cells use for inference and for deleting the endpoint.
predictor = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')

In [None]:
# Reload the test split from the 'news.test' file in DIR, without a header row.
# NOTE(review): the file was written with sep=' ' but is read here with the default
# separator, so each line presumably lands in one column (label prefix plus quoted text)
# that the prediction cell then slices apart — confirm.
# NOTE(review): this rebinds `test_X`, which earlier held the Naive Bayes test split.
test_X = pd.read_csv(os.path.join(DIR, 'news.test'), header=None)

In [None]:
# Create a function to define the batches.
# From: https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
def batch(iterable, n=1):
    """Yield consecutive slices of `iterable`, each holding at most `n` items.

    `iterable` must support `len()` and slicing; every chunk is produced by slicing, so
    it has the same type as the input. The final chunk is shorter when the length is not
    a multiple of `n`.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = min(start + n, total)
        yield iterable[start:stop]

In [None]:
# Send the test sentences to the endpoint in batches of 512 and collect one list of
# integer labels per batch.
# The [12:-1] slice drops the first 12 characters and the last character of each line,
# leaving the sentence itself.
sentences = test_X.iloc[:, 0].str[12:-1].tolist()

predictions = []
for chunk in batch(sentences, 512):
    request_body = json.dumps({"instances": chunk})
    raw_response = predictor.predict(request_body)
    # Each response item carries its labels as '__label__<n>'; keep <n> of the first one.
    predictions.append(
        [int(item.get("label")[0][9:]) for item in json.loads(raw_response)]
    )

In [None]:
# Flatten the list of per-batch lists into one flat list of labels.
# A single comprehension is linear in the number of predictions, whereas
# `sum(list_of_lists, [])` copies the accumulated list on every addition.
predictions = [label for label_batch in predictions for label in label_batch]

In [None]:
# Score the endpoint predictions against the labels of `df_test`, the frame that was
# written to 'news.test' and therefore shares its row order with `predictions`.
# `test_y` belongs to the Naive Bayes pipeline and only lines up with these predictions
# as long as both splits keep the same seed and test size.
true_labels = df_test.label.str[len('__label__'):].astype(int)
accuracy_score(true_labels, predictions)

In [None]:
# Clean only if you don't want to use later the Lambda function and API
# to make predictions through a simple web app.
# NOTE(review): a later cell calls predictor.delete_endpoint() again — confirm whether
# both calls are meant to run in a single top-to-bottom execution.
predictor.delete_endpoint()

```python
# We need to use the low-level library to interact with SageMaker since the SageMaker API
# is not available natively through Lambda.
import boto3

# And we need the following libraries to do some of the data processing.
import re
import string
import json

# Create a function to process text.
def process_text(text):
    """Normalize raw text into a lower-case, single-spaced string of words.

    Steps, applied in this order:
      1. Every token starting with 'http' is replaced by the word 'link'.
      2. Every run of digits is replaced by the word 'number'.
      3. Every whitespace-free token containing '@' is replaced by the word 'email'.
      4. All characters in ``string.punctuation`` are removed.
      5. The text is lower-cased and all whitespace runs collapse to single spaces.

    Args:
        text (str): Raw input text.

    Returns:
        str: The normalized text; an empty string when no words remain.
    """
    # Raw string literals keep '\S' and '\d' as regex escapes; in ordinary literals they
    # are invalid string escape sequences, which newer Python versions warn about.
    text = re.sub(r'http\S+', 'link', text)
    text = re.sub(r'\d+', 'number', text)
    # No flags are needed: the pattern has no '^' or '$' anchors for MULTILINE to affect.
    text = re.sub(r'\S+@\S+', 'email', text)

    # Remove punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # split() without arguments discards leading/trailing whitespace and collapses inner
    # runs, so a separate strip() is unnecessary.
    return ' '.join(text.lower().split())

runtime= boto3.client('runtime.sagemaker')

def lambda_handler(event, context):
    """Classify the text in ``event['body']`` with the deployed BlazingText endpoint.

    The endpoint name is read from the ``ENDPOINT_NAME`` environment variable and falls
    back to the name this function was originally deployed with.

    Args:
        event (dict): Invocation event; the raw text is expected under the 'body' key.
        context: Lambda context object (unused).

    Returns:
        dict: On success, status 200 with the first predicted label (without its
        '__label__' prefix) as a plain-text body. On any failure, including a missing
        or non-text body, status 400 with a JSON error message. Both responses carry
        the CORS header so a browser client can read them.
    """
    import os  # Local import: the surrounding module does not import os.

    endpoint_name = os.environ.get('ENDPOINT_NAME', 'blazingtext-2020-06-15-07-32-18-142')
    cors = {'Access-Control-Allow-Origin': '*'}

    try:
        # Extraction and preprocessing are inside the try block so that a missing or
        # malformed body produces the error response instead of an unhandled exception.
        data = event['body']
        sentence = process_text(data)

        # BlazingText expects "instances" to be a list of sentences.
        payload = {"instances": [sentence]}

        response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                           ContentType='application/json',
                                           Body=json.dumps(payload))

        result = json.loads(response['Body'].read().decode())
        # Labels arrive as '__label__<n>'; drop the 9-character prefix.
        labels = [label[9:] for label in result[0]['label']]
        print("DATA", data)
        print("SENTENCE", sentence)
        return {
            'statusCode': 200,
            'headers': dict(cors, **{'Content-Type': 'text/plain'}),
            'body': str(labels[0])
        }
    except Exception as e:
        print(e)
        return {'statusCode': 400,
                'headers': dict(cors, **{'Content-Type': 'application/json'}),
                'body': json.dumps({'error_message': 'Unable to generate tag.'})}
```

In [None]:
predictor.endpoint

In [None]:
# print(str(vocabulary))

In [None]:
# Delete the deployed endpoint.
# NOTE(review): an earlier cell already calls predictor.delete_endpoint(); confirm
# whether running both in one pass is intended.
predictor.delete_endpoint()

In [None]:
import shutil

# Remove the data directory together with everything it contains.
# The shell version referenced `$data_dir`, a name this notebook never defines (the
# constant is DIR); left unexpanded, the shell would have turned `rm $data_dir/*` into
# `rm /*`. Using DIR from Python avoids that and also works across platforms.
# ignore_errors=True keeps the cell from failing when the directory is already gone.
shutil.rmtree(DIR, ignore_errors=True)