## MultiClassifier

The bulk of the discussion can be found in the previous notebook, so won't be repeated here.

In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt


from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier

from statsmodels.stats.contingency_tables import mcnemar

import pickle

import os

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import torch
import torch.utils.data
import torch.optim as optim

from train.model import LSTMClassifier

import sagemaker
from sagemaker.pytorch import PyTorch

In [2]:
nltk.download("stopwords", quiet=True)

True

In [3]:
df = pd.read_csv(r"alldata.csv")

In [4]:
df.iloc[0].description

"Development Director\nALS Therapy Development Institute has an immediate opening for Development Directors. Reporting directly to the Senior Development Director, the Development Director at ALS TDI is a senior fundraising position working to identifying potential prospects and cultivating solicitation strategies and in closing asks with donors including individuals and corporations by building networks via events, generating awareness of ALS TDI; outreach including attending and speaking at events as well as personally cultivates relationships with patients, prospects and donors. This position will be responsible for generating and managing a portfolio of at least two million to five million dollars per year. This position will be located in Atlanta, GA.\n\nRequirements:\nBachelor's Degree requiredMinimum 6-8 years' experience in fundraising and business developmentSuccessful track recording in fundraising with major donors or scientific sales preferredDemonstrated ability to work in

In [5]:
df.head(10)

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"
5,Manager of Data Engineering,McKinsey & Company,Qualifications\nBachelor’s degree in Computer ...,385.0,"Atlanta, GA 30318"
6,"Product Specialist - Periscope, New Ventures",McKinsey & Company,Qualifications\nBachelor’s degree\n5-7 years o...,385.0,"Atlanta, GA 30318"
7,"Junior to Mid-level Engineer, Geologist or Env...",Wood,Overview / Responsibilities\nWood Environment ...,899.0,"Atlanta, GA"
8,Analyst - CIB Credit Research,SunTrust,Works closely with senior CIB professionals. P...,3343.0,"Atlanta, GA"
9,Senior Associate - Cognitive Data Scientist Na...,KPMG,Known for being a great place to work and buil...,4494.0,"Atlanta, GA 30338"


### Preprocessing

In [7]:
df.position = df.apply(lambda x: str(x.position).lower(), axis=1)

In [8]:
df[~df.position.str.contains("machine|learning|data|scientist|engineer|developer")].position.count()

2381

In [9]:
df = df[df.position.str.contains("machine|learning|data|scientist||engineer|develop")]

In [10]:
roles = {0:"Machine Learning Engineer", 1:"Data Scientist", 2:"Data Analyst", 3:"Data Engineer", 4:"Software Engineer"}

In [11]:
reg = ["(machine|deep).+learning", "data.+scientist", "data.+analyst", "data.+engineer", "software.+(engineer|developer)"]
n_classes = len(reg)
category = [x for x in range(0,n_classes)]
condition = [df.position.str.contains(x) for x in reg]
df["category"] = np.select(condition, category, default=-1)



In [12]:
data = df[df.category>=0]

In [13]:
data.head(10)

Unnamed: 0,position,company,description,reviews,location,category
2,data scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA",1
3,data analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303",2
4,assistant professor -tt - signal processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA",0
5,manager of data engineering,McKinsey & Company,Qualifications\nBachelor’s degree in Computer ...,385.0,"Atlanta, GA 30318",3
9,senior associate - cognitive data scientist na...,KPMG,Known for being a great place to work and buil...,4494.0,"Atlanta, GA 30338",1
12,"senior associate, data scientist",KPMG,Innovate. Collaborate. Shine. Lighthouse — KPM...,4494.0,"Atlanta, GA 30338",1
14,business intelligence data science analyst - s...,Newell Brands,Data Science Analyst– Business Intelligence\nL...,912.0,"Atlanta, GA",2
15,data scientist,Cotiviti,Cotiviti is looking for an industry leading Da...,104.0,"Atlanta, GA",1
18,data scientist,Relus Cloud,DATA SCIENTIST\n\nSUMMARY:\nAs an Amazon Web S...,,"Atlanta, GA",1
19,data scientist,Inspire Brands,This position is critical to understanding dri...,,"Atlanta, GA",1


In [14]:
li = []
for category in data.category.unique():
    li.append(data[data.category==category].sample(n=300, replace=True))
data = pd.concat(li)

In [15]:
data = data.sample(frac=1).reset_index(drop=True)

In [16]:
data.description.nunique()/len(data)

0.6393333333333333

In [17]:
train_X, test_X, train_y, test_y = train_test_split(
    data.description, data.category, test_size=0.2, random_state=1)
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, test_size=0.25, random_state=1)

In [18]:
def bow(text):
    text = str(text)
    stemmer = PorterStemmer()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [19]:
cache_dir = os.path.join(r"cache", "preprocessed")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_test, data_val, labels_train, labels_test, labels_val,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        #words_train = list(map(review_to_words, data_train))
        #words_test = list(map(review_to_words, data_test))
        words_train = [bow(text) for text in data_train]
        words_test = [bow(text) for text in data_test]
        words_val = [bow(text) for text in data_val]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test, words_val=words_val,
                              labels_train=labels_train, labels_test=labels_test, labels_val=labels_val)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, words_val, labels_train, labels_test, labels_val = (cache_data['words_train'],
                cache_data['words_test'], cache_data['words_val'], cache_data['labels_train'], cache_data['labels_test'], cache_data['labels_val'])
    
    return words_train, words_test, words_val, labels_train, labels_test, labels_val

In [20]:
%%time

# Preprocess data
train_X, test_X, val_X, train_y, test_y, val_y = preprocess_data(train_X, test_X, val_X, train_y, test_y, val_y)

Read preprocessed data from cache file: preprocessed_data.pkl
CPU times: user 82.7 ms, sys: 38.4 ms, total: 121 ms
Wall time: 91.8 ms


In [21]:
def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    
    # TODO: Determine how often each word appears in `data`. Note that `data` is a list of sentences and that a
    #       sentence is a list of words.
    
    word_count = {} 
    
    for review in data:
        for word in review:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    
    # TODO: Sort the words found in `data` so that sorted_words[0] is the most frequently appearing word and
    #       sorted_words[-1] is the least frequently appearing word.
    
    sorted_words = [word for word, count in sorted(word_count.items(), key=lambda x:x[1], reverse=True)]

    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

In [22]:
word_dict = build_dict(train_X)

### Save the data

In [23]:
data_dir = r'data/' # The folder we will use for storing data
if not os.path.exists(data_dir): # Make sure that the folder exists
    os.makedirs(data_dir)

In [24]:
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

In [25]:
def convert_and_pad(word_dict, sentence, pad=1000):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=1000):
    result = []
    lengths = []
    
    for sentence in data:
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [26]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
test_X, test_X_len = convert_and_pad_data(word_dict, test_X)
val_X, val_X_len = convert_and_pad_data(word_dict, val_X)

In [27]:
pd.concat([train_y.reset_index(drop=True), pd.DataFrame(train_X_len, columns=['Length']), pd.DataFrame(train_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [28]:
pd.concat([test_y.reset_index(drop=True), pd.DataFrame(test_X_len, columns=['Length']), pd.DataFrame(test_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [29]:
pd.concat([val_y.reset_index(drop=True), pd.DataFrame(val_X_len, columns=['Length']), pd.DataFrame(val_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'val.csv'), header=False, index=False)

In [30]:
!pygmentize train/model.py

[34mimport[39;49;00m [04m[36mtorch.nn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m

[34mclass[39;49;00m [04m[32mLSTMClassifier[39;49;00m(nn.Module):
    [33m"""[39;49;00m
[33m    This is the simple RNN model we will be using to perform Sentiment Analysis.[39;49;00m
[33m    """[39;49;00m

    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, embedding_dim, hidden_dim, vocab_size, out_size=[34m5[39;49;00m):
        [33m"""[39;49;00m
[33m        Initialize the model by settingg up the various layers.[39;49;00m
[33m        """[39;49;00m
        [36msuper[39;49;00m(LSTMClassifier, [36mself[39;49;00m).[32m__init__[39;49;00m()

        [36mself[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=[34m0[39;49;00m)
        [36mself[39;49;00m.lstm = nn.LSTM(embedding_dim, hidden_dim)
        [36mself[39;49;00m.dense = nn.Linear(in_features=hidden_dim, out_features

### Train LSTM and Dummy

In [31]:
# Read in only the first 250 rows
train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None, nrows=250)

# Turn the input pandas dataframe into tensors
train_sample_y = torch.from_numpy(train_sample[[0]].values).squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
# Build the dataloader
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

In [32]:
# Read in only the first 250 rows
val_sample = pd.read_csv(os.path.join(data_dir, 'val.csv'), header=None, names=None, nrows=250)

# Turn the input pandas dataframe into tensors
val_sample_y = torch.from_numpy(val_sample[[0]].values).squeeze()
val_sample_X = torch.from_numpy(val_sample.drop([0], axis=1).values).long()

# Build the dataset
val_sample_ds = torch.utils.data.TensorDataset(val_sample_X, val_sample_y)
# Build the dataloader
val_sample_dl = torch.utils.data.DataLoader(val_sample_ds, batch_size=50)

Note that we use CrossEntropyLoss for the multiclass problem. The model itself has a slightly different structure as well.

In [33]:
def train(model, train_loader, epochs, optimizer, loss_fn, device, valid_loader=None):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        print("Epoch: {}".format(epoch))
        for batch in train_loader:         
            batch_X, batch_y = batch
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            optimizer.zero_grad()
            # forward propagate
            out = model(batch_X)

            loss = loss_fn(out, batch_y)

            # backwards propagate
            loss.backward()
            optimizer.step()
            
            total_loss += loss.data.item()
        print("Loss: {}".format(total_loss / len(train_loader)))
        if valid_loader:
            model.eval()
            val_loss = 0.0
            #val_preds = np.zeros((len(x_cv),len(le.classes_)))

            for i, (x_batch, y_batch) in enumerate(valid_loader):
                y_pred = model(x_batch).detach()
                val_loss += loss_fn(y_pred, y_batch).item()
                # keep/store predictions
                #val_preds[i * batch_size:(i+1) * batch_size] = F.softmax(y_pred).cpu().numpy()

            # Check Accuracy
            #val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
            print("ValLoss: {}".format(val_loss/len(valid_loader)))

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(32, 100, 5000).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()



In [35]:
dum = DummyClassifier(strategy="stratified")
dum.fit(train_sample.iloc[:,1:], train_sample.iloc[:,0])
print("Dummy Classifier Accuracy: {}".format(dum.score(test_X, test_y)))

Dummy Classifier Accuracy: 0.18666666666666668


In [36]:
print("Training Accuracy:", accuracy_score(np.array(torch.argmax(model(train_sample_X), dim=1)),np.array(train_sample_y)))
test = torch.from_numpy(pd.concat([pd.DataFrame(test_X_len, columns=['Length']), pd.DataFrame(test_X)], axis=1).values).long()
print("Test Accuracy:", accuracy_score(np.array(torch.argmax(model(test), dim=1)),np.array(test_y)))
confusion_matrix(np.array(torch.argmax(model(test), dim=1)),np.array(test_y))

Training Accuracy: 0.208
Test Accuracy: 0.21333333333333335


array([[13, 10, 11, 22, 17],
       [ 2,  2,  3,  3,  4],
       [41, 39, 46, 34, 43],
       [ 1,  0,  0,  0,  0],
       [ 0,  1,  3,  2,  3]])

In [37]:
train(model, train_sample_dl, 6, optimizer, loss_fn, device, val_sample_dl)

Epoch: 1
BCELoss: 1.6039209604263305
ValLoss: 1.5786990165710448
Epoch: 2
BCELoss: 1.3875929832458496
ValLoss: 1.552106785774231
Epoch: 3
BCELoss: 1.0557105541229248
ValLoss: 1.6039378643035889
Epoch: 4
BCELoss: 0.6790406227111816
ValLoss: 1.7504199981689452
Epoch: 5
BCELoss: 0.45246206521987914
ValLoss: 1.8674320459365845
Epoch: 6
BCELoss: 0.3595859229564667
ValLoss: 2.058439111709595


In [38]:
print("Training Accuracy:", accuracy_score(np.array(torch.argmax(model(train_sample_X), dim=1)),np.array(train_sample_y)))
test = torch.from_numpy(pd.concat([pd.DataFrame(test_X_len, columns=['Length']), pd.DataFrame(test_X)], axis=1).values).long()
print("Test Accuracy:", accuracy_score(np.array(torch.argmax(model(test), dim=1)),np.array(test_y)))


Training Accuracy: 0.912
Test Accuracy: 0.3933333333333333


### McNemar

In [39]:
test_outcomes = pd.DataFrame({"Dummy":dum.predict(test_X)==test_y, "LSTM":np.array(torch.argmax(model(test), dim=1))==test_y})

In [40]:
contingency = pd.crosstab(test_outcomes.Dummy, test_outcomes.LSTM)

In [41]:
stats = mcnemar(contingency, exact=False, correction=True)

In [42]:
print('statistic=%.3f, p-value=%.3f' % (stats.statistic, stats.pvalue))

statistic=34.299, p-value=0.000


In [43]:
alpha = 0.05
if stats.pvalue > alpha:
    print('Fail to reject Null Hypothesis')
else:
    print('Statistically significant difference, reject Null Hypothesis')

Statistically significant difference, reject Null Hypothesis


While the  overall performance is actually worse than the binary case, the result is actually more significant in the multiclass case. The model is much better than random chance.

### Deploy Endpoint

In [44]:
sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/JobMulticlassLSTM'

role = sagemaker.get_execution_role()

In [45]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [46]:
estimator = PyTorch(entry_point="train.py",
                    source_dir="train",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.m4.xlarge',
                    hyperparameters={
                        'epochs': 10,
                        'hidden_dim': 100,
                        'learning_rate': 0.1
                    })

In [47]:
estimator.fit({'training': input_data})

2020-06-16 22:07:10 Starting - Starting the training job...
2020-06-16 22:07:12 Starting - Launching requested ML instances......
2020-06-16 22:08:14 Starting - Preparing the instances for training...
2020-06-16 22:09:00 Downloading - Downloading input data...
2020-06-16 22:09:24 Training - Downloading the training image..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-06-16 22:09:45,259 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-06-16 22:09:45,261 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-06-16 22:09:45,275 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-06-16 22:09:45,495 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-06-16 22:09:45,718 sagemaker-containers INFO     Module train does not provi

In [48]:
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

---------------!

In [49]:
test = pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1)

In [50]:
def predict(data, rows=512):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = np.array([])
    for array in split_array:
        predictions = np.append(predictions, np.argmax(predictor.predict(array), axis=1))
    
    return predictions

In [51]:
predictions = predict(test.values)
predictions = [round(num) for num in predictions]

In [52]:
accuracy_score(test_y, predictions)

0.55

In [53]:
confusion_matrix(test_y, predictions)

array([[33,  8,  6,  3,  7],
       [12, 18,  4, 13,  5],
       [ 3,  6, 43,  5,  6],
       [11,  5,  9, 33,  3],
       [11,  7,  5,  6, 38]])

In [54]:
predictor.delete_endpoint()