## Load Data

In [1]:
# Don't do in production. Doing now to keep output clean for understanding
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from datasets import Dataset

# IMDB review dump hosted on GitHub; pulled straight into a DataFrame.
IMDB_CSV_URL = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv'

data = pd.read_csv(IMDB_CSV_URL)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})

In [4]:
# Rebuild from the DataFrame so this cell can be re-run on its own: the name
# `dataset` is rebound to the split result right below.
dataset = Dataset.from_pandas(data)
# The split shuffles rows; pin the seed so train/test membership (and every
# metric computed later) is the same on each Restart & Run All.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [5]:
# Hugging Face models need numeric inputs:
# input_ids, attention_mask, label

In [6]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
# Sentiment string -> integer class id, and the inverse map derived from it.
label2id = dict(negative=0, positive=1)
id2label = {class_id: name for name, class_id in label2id.items()}


def _add_label(example):
    """Return the integer `label` for one example, looked up from its `sentiment`."""
    return {'label': label2id[example['sentiment']]}


dataset = dataset.map(_add_label)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [8]:
dataset['train'][0]

{'review': 'National Treasure is about as over-rated and over-hyped as they come. Nicholas Cage is in no way a believable action hero, and this film is no "Indiana Jones". People who have compared this movie to the Indian Jones classic trilogy have seriously fallen off their rocker.<br /><br />I can\'t really figure out what kind of target audience this film was shooting for. Maybe the pre-teen audience will like it, but I found it to be absolutely ludicrous. I also can\'t imagine adults or young adults to find this to be that great of a film. Simply put: it\'s just OK at best.<br /><br />National Treasure is unimagined and uninspired, borrowing what it does have from "The Da Vinci Code". I would recommend waiting for that movie to be released in 2006, and passing on this nonsense.<br /><br />The whole idea of being able to so easily steal the Declaration of Independence and run around all over Washington DC and Philadelphia with it (while never damaging it once), while fighting the "b

## Data Tokenization

In [9]:
from transformers import AutoTokenizer
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'  #from huggingface
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [10]:
tokenizer(dataset['train'][0]['review'])

def tokenize(batch):
    """Run the notebook-level `tokenizer` over the `review` entries of `batch`.

    Padding and truncation are enabled, with a maximum length of 300.
    Returns whatever the tokenizer returns.
    """
    return tokenizer(
        batch['review'],
        max_length=300,
        truncation=True,
        padding=True,
    )

dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [11]:
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

### Building Model Evaluation Functions
https://huggingface.co/docs/transformers/v4.42.0/en/tasks/sequence_classification#evaluate

In [12]:
# !pip install evaluate

import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score an evaluation pair with the notebook-level `accuracy` metric.

    `eval_pred` unpacks into (predictions, labels). The predicted class is the
    argmax of the predictions along axis 1; the result of `accuracy.compute`
    is returned unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Model Building

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(label2id), label2id=label2id, id2label=id2label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
train_data = dataset['train'].select(range(200))
train_data


Dataset({
    features: ['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [22]:
test_data=dataset['test'].select(range(100))
test_data

Dataset({
    features: ['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [23]:
# Training configuration: one epoch, batches of 32, evaluation at the end of
# each epoch.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=1,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # `evaluation_strategy` is the deprecated spelling of this option;
    # `eval_strategy` is the current name (transformers >= 4.41).
    eval_strategy='epoch'
)

# Bundle the model, the training arguments, the train/eval subsets, the metric
# callback and the tokenizer into a single Trainer object.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,  # called with (predictions, labels)
    tokenizer=tokenizer
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.599181,0.68


TrainOutput(global_step=7, training_loss=0.5199833938053676, metrics={'train_runtime': 89.1364, 'train_samples_per_second': 2.244, 'train_steps_per_second': 0.079, 'total_flos': 1680351120000.0, 'train_loss': 0.5199833938053676, 'epoch': 1.0})

In [25]:
trainer.evaluate()

{'eval_loss': 0.5991806983947754,
 'eval_accuracy': 0.68,
 'eval_runtime': 7.395,
 'eval_samples_per_second': 13.523,
 'eval_steps_per_second': 0.541,
 'epoch': 1.0}

## Model Save and Load for Inference

In [26]:
trainer.save_model('tinybert-sentiment-analysis')

In [27]:
data = ['this movie was horrible, the plot was really boring. acting was okay',
        'the movie is really sucked. there is not plot and acting was bad',
        'what a beautiful movie. great plot. acting was good. will see it again']

In [28]:
from transformers import pipeline
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

classifier = pipeline('text-classification', model='tinybert-sentiment-analysis', device=device)

classifier(data)

Device set to use cpu


[{'label': 'negative', 'score': 0.7161254286766052},
 {'label': 'negative', 'score': 0.7219476699829102},
 {'label': 'positive', 'score': 0.7426159977912903}]

## Push Model to AWS S3

In [29]:
import boto3

s3 = boto3.client('s3')

bucket_name = 'mlops-sheenu'

def create_bucket(bucket_name, s3_client=None):
    """Create the S3 bucket `bucket_name` unless the account already lists it.

    Args:
        bucket_name: Name of the bucket to create.
        s3_client: Optional boto3 S3 client to use. When omitted, the
            notebook-level `s3` client is used.

    Returns:
        None. A message is printed saying whether the bucket was created or
        was already present.
    """
    client = s3 if s3_client is None else s3_client

    existing = {bucket['Name'] for bucket in client.list_buckets()['Buckets']}
    if bucket_name in existing:
        print("Bucket already exists in your account!!! Feel free to use it.")
        return

    # S3 only accepts a bare create request in us-east-1; for any other
    # region the request must name that region as its LocationConstraint,
    # otherwise the call is rejected.
    request = {'Bucket': bucket_name}
    region = client.meta.region_name
    if region and region != 'us-east-1':
        request['CreateBucketConfiguration'] = {'LocationConstraint': region}

    client.create_bucket(**request)
    print("Bucket is created")

create_bucket(bucket_name)

Bucket is created


In [30]:
# Upload the saved model folder to the S3 bucket under the key prefix ml-models/tinybert-sentiment-analysis
import os
import boto3

s3 = boto3.client('s3')
bucket_name = 'mlops-sheenu'

def upload_directory(directory_path, s3_prefix, bucket=None, s3_client=None):
    """Upload every file under `directory_path` to S3, keeping the folder layout.

    Each file is stored under the key `<s3_prefix>/<path relative to
    directory_path>`, always with forward slashes.

    Args:
        directory_path: Local directory to walk recursively.
        s3_prefix: Key prefix prepended to each file's relative path.
        bucket: Optional destination bucket. When omitted, the notebook-level
            `bucket_name` is used.
        s3_client: Optional boto3 S3 client. When omitted, the notebook-level
            `s3` client is used.

    Raises:
        NotADirectoryError: If `directory_path` is not an existing directory.
    """
    # os.walk yields nothing for a missing path, which would make a typo in
    # the folder name look like a successful (empty) upload.
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(f"Not a directory: {directory_path!r}")

    target_bucket = bucket_name if bucket is None else bucket
    client = s3 if s3_client is None else s3_client

    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            relpath = os.path.relpath(file_path, directory_path)
            # Translate only the platform separator: on POSIX a backslash is a
            # legal filename character and must not be rewritten.
            s3_key = os.path.join(s3_prefix, relpath).replace(os.sep, "/")

            client.upload_file(file_path, target_bucket, s3_key)


upload_directory('tinybert-sentiment-analysis', 'ml-models/tinybert-sentiment-analysis')
