# Develop, Train, Optimize, and Deploy Scikit-Learn LinearSVC
## Twitter Sentiment

In [None]:
!git pull

In [1]:
import boto3
import json
import pandas as pd
import numpy as np

import sagemaker
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

# Execution role of this notebook; passed to the SageMaker jobs created below
role = get_execution_role()

# SDK session plus a low-level SageMaker client
sess = sagemaker.Session()
sm_boto3 = boto3.client('sagemaker')
region = sess.boto_session.region_name

# Default bucket of the session; the datasets are uploaded here
bucket = sess.default_bucket()
print('Using bucket {}'.format(bucket))

Using bucket sagemaker-us-east-1-159307201141


### Prepare data

In [None]:
# Column names and text encoding used when reading the raw CSV
DATASET_ENCODING = 'ISO-8859-1'
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'tweet']
TRAIN_SIZE = 0.8

# Load the raw tweets from the local working directory
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

# Hold out (1 - TRAIN_SIZE) of the rows for testing; the seed is fixed
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['target'],
    test_size=1 - TRAIN_SIZE,
    random_state=817,
)

# Re-assemble each split as a two-column frame: tweet text plus its target
trainX = pd.DataFrame(X_train, columns=['tweet']).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=['tweet']).assign(target=y_test)

# Persist both splits next to the notebook
trainX.to_csv('twitter_train.csv', index=False)
testX.to_csv('twitter_test.csv', index=False)

# Push the CSVs to S3, where the SageMaker training jobs read them from
s3_prefix = 'data/twitter'
trainpath = sess.upload_data(
    path='twitter_train.csv',
    bucket=bucket,
    key_prefix=s3_prefix,
)
testpath = sess.upload_data(
    path='twitter_test.csv',
    bucket=bucket,
    key_prefix=s3_prefix,
)


### Writing a Script Mode Script

In [None]:
%%writefile svc.py

import argparse
import json
import os
import re
from io import StringIO
from distutils.util import strtobool

import numpy as np
import pandas as pd
    
from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sagemaker_containers.beta.framework import worker


import nltk
# nltk.download('stopwords')

# nltk.download(info_or_id="stopwords", download_dir="/home/capstone", quiet=True)
# nltk.data.path.append("/home/capstone")

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# English stop words; preprocess() drops every token found in this collection.
# NOTE(review): the nltk.download calls above are commented out, so this
# presumably relies on the corpus already being installed - confirm for the
# training and serving images.
stop_words = stopwords.words("english")
# Stemmer applied by preprocess() only when it is called with stem=True
stemmer = SnowballStemmer("english")

# Every match is replaced by a single space in preprocess(): @mentions,
# http/https URLs, and runs of characters other than ASCII letters and digits.
# NOTE(review): the `http?:\S` alternative looks redundant - `https?:\S+`
# already covers "http:" and is tried first - confirm before removing it.
TWEET_CLEANING_RE = r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+'

# Raw dataset target value -> sentiment name
decode_map = {0: 'NEGATIVE', 4: 'POSITIVE'}


def decode_sentiment(label):
    """Translate a raw target value into its sentiment name.

    ``label`` is coerced with ``int()`` before the lookup, so ``0``, ``'0'``
    and ``0.0`` all resolve to ``'NEGATIVE'``. A value that is not a key of
    ``decode_map`` raises ``KeyError``.
    """
    code = int(label)
    return decode_map[code]


# TODO Adjust preprocessor to be better
def preprocess(tweet, stem=False):
    """Clean a single tweet and return it as a space-joined token string.

    The text is lower-cased, every match of ``TWEET_CLEANING_RE`` is replaced
    by a space, and tokens found in ``stop_words`` are dropped. When ``stem``
    is truthy each remaining token is passed through the module-level
    ``stemmer``.
    """
    cleaned = re.sub(TWEET_CLEANING_RE, ' ', str(tweet).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)


def model_fn(model_dir):
    """Load and return the object stored as ``model.joblib`` in ``model_dir``."""
    model_path = os.path.join(model_dir, 'model.joblib')
    return joblib.load(model_path)


def input_fn(request_body, request_content_type):
    """Turn a JSON request body into an array of preprocessed tweets.

    Only ``application/json`` is accepted. The payload is parsed with pandas
    and its ``tweet`` column is run through ``preprocess`` (without stemming).
    Any other content type raises ``ValueError``.
    """
    if request_content_type != "application/json":
        raise ValueError(
            '{} is not supported by script.'.format(request_content_type))

    # TODO We don't really need Pandas here anymore
    frame = pd.read_json(StringIO(request_body))
    # TODO Add way to provide stem parameter
    cleaned = frame['tweet'].apply(preprocess)
    return cleaned.to_numpy()


def predict_fn(input_data, model):
    """Score ``input_data`` and pair each predicted label with its score.

    Returns ``{'results': [...]}`` holding one dict per input row. Note that
    the ``'probability'`` entry carries whatever ``model.decision_function``
    returns for that row.
    """
    labels = model.predict(input_data)
    scores = model.decision_function(input_data)
    return {
        'results': [
            {'prediction': label, 'probability': score}
            for label, score in zip(labels, scores)
        ]
    }


def output_fn(prediction, content_type):
    """Wrap ``prediction`` in a JSON ``worker.Response``.

    Only the ``application/json`` accept type is handled; anything else
    raises ``ValueError``.
    """
    if content_type != 'application/json':
        raise ValueError(
            '{} accept type is not supported by this script.'.format(content_type))

    body = json.dumps(prediction)
    return worker.Response(body, content_type, mimetype=content_type)


def _str2bool(value):
    """Convert a command-line flag value to a real ``bool``.

    Accepts y/yes/t/true/on/1 and n/no/f/false/off/0 (case-insensitive).
    ``type=bool`` must not be used with argparse: ``bool('False')`` is
    ``True`` because every non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if text in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


def parse_args(argv=None):
    """Parse the training hyperparameters and data/model locations.

    Args:
        argv: list of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``. Unknown arguments are ignored.
        ``stem``, ``smooth_idf`` and ``sublinear_tf`` are real booleans.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters from the client
    parser.add_argument('--stem', type=_str2bool, default=False)
    parser.add_argument('--ngram_range', type=int, default=1)
    # NOTE(review): max_df / min_df / max_features are parsed but not yet
    # passed to the vectorizer below.
    parser.add_argument('--max_df', type=float, default=1.0)
    parser.add_argument('--min_df', type=float, default=1.0)
    parser.add_argument('--max_features', type=int, default=None)
    # Both spellings are accepted. The notebook's estimator and tuner send
    # these hyperparameters as `smooth_idf` / `sublinear_tf`; with hyphen-only
    # flags they were unknown arguments, silently dropped by
    # parse_known_args, so the defaults were always used.
    parser.add_argument('--smooth-idf', '--smooth_idf', dest='smooth_idf',
                        type=_str2bool, default=True)
    parser.add_argument('--sublinear-tf', '--sublinear_tf', dest='sublinear_tf',
                        type=_str2bool, default=True)
    parser.add_argument('--C', type=float, default=1.0)
    parser.add_argument('--penalty', type=str, default='l2')
    parser.add_argument('--loss', type=str, default='squared_hinge')

    # Data, model, and output directories
    # TODO Remove this argument?
    # parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='twitter_train.csv')
    parser.add_argument('--test-file', type=str, default='twitter_test.csv')

    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == '__main__':
    print('Extracting arguments')
    args = parse_args()

    print('Reading Tweets')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print('Length of train_df: {}'.format(str(len(train_df.index))))
    print('Length of test_df: {}'.format(str(len(test_df.index))))

    print('Preprocessing the Tweets')
    # Decode Sentiment/Target
    train_df.target = train_df.target.apply(decode_sentiment)
    test_df.target = test_df.target.apply(decode_sentiment)

    # Preprocess Tweet
    train_df.tweet = train_df.tweet.apply(lambda x: preprocess(x, args.stem))
    test_df.tweet = test_df.tweet.apply(lambda x: preprocess(x, args.stem))

    print('Building training and testing datasets')
    X_train = train_df['tweet']
    y_train = train_df['target']
    X_test = test_df['tweet']
    y_test = test_df['target']

    # TODO TfIdf preprocessor?

    print('Training the model')
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, args.ngram_range),
                # max_df=args.max_df,
                # min_df=args.min_df,
                # max_features=args.max_features,
                smooth_idf=args.smooth_idf,
                sublinear_tf=args.sublinear_tf)),
        # We can't use LinearSVC with a soft VotingClassifier
        # A hard VotingClassifier doesn't fit my needs. We need probabilities, so that we can rank.
        ('svc', LinearSVC(C=args.C,
                penalty=args.penalty,
                loss=args.loss))
    ])

    clf = pipe.fit(X_train, y_train)

    print('Print validation statistics')
    pred = clf.predict(X_test)
    # LinearSVC has no predict_proba; the signed decision value is used instead
    decision = clf.decision_function(X_test)

    print(pred)
    print(decision)

    # The 'Accuracy: ' line is what the notebook's metric regex scrapes.
    print('Accuracy: {}'.format(accuracy_score(y_test, pred)))
    print('Precision: {}'.format(precision_score(y_test, pred, average='macro')))
    # Recall is macro-averaged like precision. With average='micro' it is
    # identical to accuracy for single-label data, which is why the two
    # numbers used to match exactly.
    print('Recall: {}'.format(recall_score(y_test, pred, average='macro')))
    # TODO Add more validation statistics (confusion matrix, classification report)

    print('Save the model')
    joblib.dump(clf, os.path.join(args.model_dir, 'model.joblib'))

### SageMaker Training

In [None]:
# We use the Estimator from the SageMaker Python SDK

# Log line scraped from the training output to report the 'accuracy' metric
accuracy_metric = {
    'Name': 'accuracy',
    'Regex': 'Accuracy: ([0-9.]+).*$',
}

# Hyperparameters handed to svc.py for this single training run
svc_hyperparameters = {
    'ngram_range': 2,
    'C': 0.45749319654611337,
    'smooth_idf': 'false',
    'sublinear_tf': 'false',
}

sklearn_estimator = SKLearn(
    entry_point='svc.py',
    source_dir='.',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    framework_version='0.20.0',
    base_job_name='twitter-svc',
    metric_definitions=[accuracy_metric],
    hyperparameters=svc_hyperparameters,
)

In [None]:
# Build the channel URIs from the session's default bucket instead of a
# hard-coded, account-specific bucket name. The keys match the
# 'data/twitter' prefix and file names used when the CSVs were uploaded.
train_input = 's3://{}/data/twitter/twitter_train.csv'.format(bucket)
test_input = 's3://{}/data/twitter/twitter_test.csv'.format(bucket)

# wait=False returns immediately; the job keeps running in SageMaker
sklearn_estimator.fit({'train': train_input, 'test': test_input}, wait=False)

### Launching a tuning job with the Python SDK

In [None]:
# We use the Hyperparameter Tuner 

from sagemaker.tuner import ContinuousParameter, CategoricalParameter, IntegerParameter

# Search space explored by the tuner; commented entries are switched off
hyperparameter_ranges = {
    'C': ContinuousParameter(0.0001, 100, 'Logarithmic'),
    'ngram_range': IntegerParameter(1, 2),
    'smooth_idf': CategoricalParameter(['true', 'false']),
    'sublinear_tf': CategoricalParameter(['true', 'false']),
    # 'max_df': ContinuousParameter(0.0001, 1, 'Logarithmic'),
    # 'min_df': ContinuousParameter(0.0001, 1, 'Logarithmic'),
    # 'max_features': IntegerParameter(1000, 100000),
    # 'use_idf': CategoricalParameter(['true', 'false']),
    # 'penalty': CategoricalParameter(['l1', 'l2']),
    # 'loss': CategoricalParameter(['hinge', 'squared_hinge'])
}

# Objective metric: the accuracy value scraped from the training log
tuning_metric = {
    'Name': 'accuracy',
    'Regex': 'Accuracy: ([0-9.]+).*$',
}

Optimizer = sagemaker.tuner.HyperparameterTuner(
    estimator=sklearn_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    base_tuning_job_name='twitter-svc-tuner',
    objective_metric_name='accuracy',
    objective_type='Maximize',
    metric_definitions=[tuning_metric],
    max_jobs=40,
    max_parallel_jobs=3,
)

In [None]:
# Start the tuning job; the train/test channels point at the S3 locations
# returned by sess.upload_data in the data-preparation cell.
Optimizer.fit({'train':trainpath, 'test':testpath})

In [None]:

# Pull the tuning job's analytics into a DataFrame and preview the first rows
results = Optimizer.analytics().dataframe()
results.head()

### Deploy to a real-time endpoint

In [2]:
#TODO Get best model from a tuning job OR do it by name

# Re-attach to an existing training job by name and host its model
best_training_job = "twitter-svc-tuner-200324-1449-015-0d842270"
sklearn_estimator = SKLearn.attach(best_training_job)

predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.large',
)

2020-03-24 20:21:54 Starting - Preparing the instances for training
2020-03-24 20:21:54 Downloading - Downloading input data
2020-03-24 20:21:54 Training - Training image download completed. Training in progress.
2020-03-24 20:21:54 Uploading - Uploading generated training model
2020-03-24 20:21:54 Completed - Training job completed
2020-03-24 20:18:22,268 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2020-03-24 20:18:22,268 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value accuracy to Json.
Returning the value itself
2020-03-24 20:18:22,271 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
2020-03-24 20:18:22,280 sagemaker_sklearn_container.training INFO     Invoking user training script.
2020-03-24 20:19:26,376 sagemaker-containers INFO     Module script does not provide a setup.py.
Generating setup.py
2020-03

Using already existing model: twitter-svc-tuner-200324-1449-015-0d842270


---------------!

### Invoke with boto3

In [3]:
# SECURITY: never embed AWS access keys in a notebook. The client below
# relies on boto3's default credential resolution (environment variables,
# shared config/credentials files, or the role attached to the instance).
# Any key pair that has ever been committed to source control must be
# treated as compromised and rotated.
runtime = boto3.client('sagemaker-runtime')

# Sample tweets; the endpoint expects a JSON object with a 'tweet' list
tweets = {
    'tweet': [
        '''Here to give you dinner inspo, so you don't keep eating chips for dinner.''',
        '''totally okay to not love Biden but if you’re threatening to just not vote in the election if he’s the candidate, you suck''',
        '''Spent 2 days in New Orleans. Gained 75 lbs. Totally worth it.''',
        '''Fresh pastry from Beetbox in the house!''',
        '''These people made me laugh so hard that I briefly thought I gave myself a hernia.''',
        '''I hate everything.'''
    ]
}

response = runtime.invoke_endpoint(
    EndpointName='twitter-svc-tuner-200324-1449-015-0d842270',
    Body=json.dumps(tweets),
    ContentType='application/json')

# The response body is a stream; read it once and decode the JSON payload
results = json.loads(response['Body'].read())

for pred in results['results']:
    print(pred)


{'prediction': 'POSITIVE', 'probability': 0.6611542933909837}
{'prediction': 'POSITIVE', 'probability': 0.15763187223286762}
{'prediction': 'POSITIVE', 'probability': 0.013655673248487599}
{'prediction': 'POSITIVE', 'probability': 0.7376460738221517}
{'prediction': 'POSITIVE', 'probability': 0.2431036530170101}
{'prediction': 'NEGATIVE', 'probability': -0.8907029657259222}


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score the same sample tweets with VADER for a side-by-side comparison
analyzer = SentimentIntensityAnalyzer()
for sentence in tweets['tweet']:
    compound = analyzer.polarity_scores(sentence)['compound']

    # Bucket the compound score: >= 0.05 positive, <= -0.05 negative,
    # strictly in between neutral
    if compound >= 0.05:
        score = 'positive'
    elif -0.05 < compound < 0.05:
        score = 'neutral'
    elif compound <= -0.05:
        score = 'negative'

    print('{} : {}'.format(score, compound))

In [None]:
# Delete the endpoint created by the deploy cell once it is no longer needed
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint)

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: text to clean.
        pattern: regular expression whose matches are deleted.

    Returns:
        The cleaned string; unchanged when nothing matches.
    """
    # Imported locally: the notebook's own import cell does not import ``re``.
    import re

    # Substitute on the pattern itself. Re-using each matched string as a new
    # regex breaks on matches containing metacharacters, and removing a short
    # match first can corrupt a longer one ("@a" inside "@ab" left "b" behind).
    return re.sub(pattern, '', input_txt)


# NOTE(review): `combi` is not defined in the visible part of this notebook -
# presumably a DataFrame with a 'tweet' column; confirm where it is loaded.
from nltk.stem.porter import PorterStemmer

# Strip @handles. The raw string avoids the invalid "\w" escape sequence.
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")
# Replace everything except letters and '#' with a space. regex=True is
# explicit because newer pandas releases treat the pattern literally by default.
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

# remove hashtag

# lower everything

tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split())  # tokenizing
# combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])  # stemming

# Join each token list back into one string. Assigning through
# tokenized_tweet[i] for i in range(len(...)) is label-based and only works
# when the index is the default 0..n-1 range.
tokenized_tweet = tokenized_tweet.str.join(' ')

combi['tidy_tweet'] = tokenized_tweet

In [None]:
# Column names and text encoding used when reading the raw CSV
DATASET_ENCODING = 'ISO-8859-1'
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'tweet']

# Reload the raw tweets from the local working directory
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

df

In [None]:
import preprocessor as p

# Configure the cleaner with the HASHTAG option only, then apply it to every tweet
# NOTE(review): presumably this strips hashtags and nothing else - confirm
# against the preprocessor package's documentation.
p.set_options(p.OPT.HASHTAG)
cleaned_tweets = df['tweet'].apply(p.clean)
df['tweet'] = cleaned_tweets

df