# Feature Transformation with Scikit-Learn In This Notebook

In this notebook, we convert raw text into feature embeddings.  This will allow us to perform natural language processing tasks.


![](img/prepare_dataset_bert.png)

# Understand Embeddings

* For more details on Transformers Architecture, see [Attention Is All You Need](https://arxiv.org/abs/1706.03762).

<img src="img/bert_input_features.png" width="80%" align="left">

* **input_ids**: 
The id from the pre-trained vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than max_seq_length)

* **attention_mask**: 
Specifies which tokens should pay attention to (0 or 1). Padded input_ids will have 0 in each of these vector elements.

In [4]:
import psutil

notebook_memory = psutil.virtual_memory()

if notebook_memory.total < 32 * 1024 * 1024:
    print('*******************************************')    
    print('YOU ARE NOT USING THE CORRECT INSTANCE TYPE')
    print('PLEASE CHANGE INSTANCE TYPE TO  m5.2xlarge ')
    print('*******************************************')
else:
    correct_instance_type=True
    print(notebook_memory)

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path
import csv

def _transform_to_dataset(file, output_data, train_split_percentage, validation_split_percentage, test_split_percentage):
    print("file {}".format(file))

    # Read the file
#    df = pd.read_parquet(file)
    df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")

    df.isna().values.any()
    df = df.dropna()
    df = df.reset_index(drop=True)    

    
    # Split data
    
    print("Shape of dataframe before splitting {}".format(df.shape))

    print("train split percentage {}".format(train_split_percentage))
    print("validation split percentage {}".format(validation_split_percentage))
    print("test split percentage {}".format(test_split_percentage))

    holdout_percentage = 1.00 - train_split_percentage
    print("validation holdout percentage {}".format(holdout_percentage))
    
    df_train, df_holdout = train_test_split(df, test_size=holdout_percentage)

    test_holdout_percentage = test_split_percentage / holdout_percentage
    
    print("test holdout percentage {}".format(test_holdout_percentage))
    
    df_validation, df_test = train_test_split(
        df_holdout, test_size=test_holdout_percentage)

    df_train = df_train.reset_index(drop=True)
    df_validation = df_validation.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    print("Shape of train dataframe {}".format(df_train.shape))
    print("Shape of validation dataframe {}".format(df_validation.shape))
    print("Shape of test dataframe {}".format(df_test.shape))

    
    # Convert Pandas dataframes into Datasets
    import datasets
    from datasets import Dataset

    dataset_train = Dataset.from_pandas(df_train)
    dataset_validation = Dataset.from_pandas(df_validation)
    dataset_test = Dataset.from_pandas(df_test)    

    
    # Tokenize
    
    from transformers import AutoTokenizer

    model_checkpoint = "bigscience/bloom-560m"

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    text_column_name = "review_body"

    def tokenize_function(examples):
        tokenized = tokenizer(examples[text_column_name])
        return tokenized

    import multiprocessing

    num_cpus = multiprocessing.cpu_count()
    print("num_cpus {}".format(num_cpus))

    # if using .tsv, the data will have `product_category`, but not `year`
    # if using .parquet, the data will have also have `year`
    tokenized_dataset_train = dataset_train.map(tokenize_function, batched=True, num_proc=num_cpus, remove_columns=[
        'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category',
        'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_date', text_column_name]) # 'year'

    tokenized_dataset_validation = dataset_validation.map(tokenize_function, batched=True, num_proc=num_cpus, remove_columns=[
        'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category',
        'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_date', text_column_name]) # 'year'

    tokenized_dataset_test = dataset_validation.map(tokenize_function, batched=True, num_proc=num_cpus, remove_columns=[
        'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category',
        'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_date', text_column_name]) # 'year'
        
    
    # Group into blocks and save to S3/disk

    block_size = 128

    def group_texts(examples):    
        # Concatenate all texts.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    lm_dataset_train = tokenized_dataset_train.map(
        group_texts,
        batched=True,
        batch_size=10,
        num_proc=num_cpus,
    )

    lm_dataset_validation = tokenized_dataset_validation.map(
       group_texts,
       batched=True,
       batch_size=10,
       num_proc=num_cpus,
    )
    
    lm_dataset_test = tokenized_dataset_test.map(
       group_texts,
       batched=True,
       batch_size=10,
       num_proc=num_cpus,
    )

    print(tokenizer.decode(lm_dataset_train[1]["input_ids"]))
    print(tokenizer.decode(lm_dataset_validation[1]["input_ids"]))
    print(tokenizer.decode(lm_dataset_test[1]["input_ids"]))
        
    filename_without_extension = Path(Path(file).stem).stem

    os.makedirs('{}/gpt3-train/'.format(output_data), exist_ok=True)
    os.makedirs('{}/gpt3-validation/'.format(output_data), exist_ok=True)
    os.makedirs('{}/gpt3-test/'.format(output_data), exist_ok=True)
    
    lm_dataset_train.to_parquet('{}/gpt3-train/{}.parquet'.format(output_data, filename_without_extension))    
    lm_dataset_validation.to_parquet('{}/gpt3-validation/{}.parquet'.format(output_data, filename_without_extension))
    lm_dataset_test.to_parquet('{}/gpt3-test/{}.parquet'.format(output_data, filename_without_extension))

In [6]:
import functools
import multiprocessing
import glob
import os

def process(args):

#    input_files = glob.glob("{}/*.parquet".format(args.input_data))
    input_files = glob.glob("{}/*.tsv.gz".format(args.input_data))
    print(input_files)

    print("Listing contents of {}".format(args.input_data))
    dirs_input = os.listdir(args.input_data)
    for file in dirs_input:
        print(file)

    train_data = "{}/gpt3-train".format(args.output_data)
    validation_data = "{}/gpt3-validation".format(args.output_data)
    test_data = "{}/gpt3-test".format(args.output_data)

    transform_to_dataset = functools.partial(
        _transform_to_dataset,
        output_data=args.output_data,
        train_split_percentage=args.train_split_percentage, 
        validation_split_percentage=args.validation_split_percentage, 
        test_split_percentage=args.test_split_percentage,
        # prefix=args.feature_store_offline_prefix,
        # feature_group_name=args.feature_group_name,
    )

    num_cpus = multiprocessing.cpu_count()
    print("num_cpus {}".format(num_cpus))

    p = multiprocessing.Pool(num_cpus)
    p.map(transform_to_dataset, input_files)

    print("Listing contents of {}".format(args.output_data))
    dirs_output = os.listdir(args.output_data)
    for file in dirs_output:
        print(file)

    print("Listing contents of {}".format(train_data))
    dirs_output = os.listdir(train_data)
    for file in dirs_output:
        print(file)

    print("Listing contents of {}".format(validation_data))
    dirs_output = os.listdir(validation_data)
    for file in dirs_output:
        print(file)

    print("Listing contents of {}".format(test_data))
    dirs_output = os.listdir(test_data)
    for file in dirs_output:
        print(file)


In [7]:
class Args:
    input_data: str
    output_data: str
    train_split_percentage: float
    validation_split_percentage: float
    test_split_percentage: float

args = Args()    
    
#args.input_data = '../../data-science-on-aws.gpt3/00_quickstart/data-parquet/Digital_Software'
args.input_data = './data-tsv'
args.output_data = './data-gpt3'
args.train_split_percentage = .80
args.validation_split_percentage = .10
args.test_split_percentage = .10

process(args)


['./data-tsv/amazon_reviews_us_Gift_Card_v1_00.tsv.gz', './data-tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', './data-tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz']
Listing contents of ./data-tsv
amazon_reviews_us_Gift_Card_v1_00.tsv.gz
.ipynb_checkpoints
amazon_reviews_us_Digital_Software_v1_00.tsv.gz
amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz
num_cpus 2
file ./data-tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz
file ./data-tsv/amazon_reviews_us_Gift_Card_v1_00.tsv.gz
Shape of dataframe before splitting (149081, 15)
train split percentage 0.8
validation split percentage 0.1
test split percentage 0.1
validation holdout percentage 0.19999999999999996
test holdout percentage 0.5000000000000001
Shape of train dataframe (119264, 15)
Shape of validation dataframe (14908, 15)
Shape of test dataframe (14909, 15)
Shape of dataframe before splitting (102084, 15)
train split percentage 0.8
validation split percentage 0.1
test split percentage 0.1
validation holdout

#0:   0%|          | 0/60 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/41 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/60 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/41 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/8 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/6 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/5964 [00:00<?, ?ba/s]

#1:   0%|          | 0/5964 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/6 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/4084 [00:00<?, ?ba/s]

#1:   0%|          | 0/4084 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/746 [00:00<?, ?ba/s]

#1:   0%|          | 0/746 [00:00<?, ?ba/s]

    

#1:   0%|          | 0/746 [00:00<?, ?ba/s]

#0:   0%|          | 0/746 [00:00<?, ?ba/s]

    

#1:   0%|          | 0/511 [00:00<?, ?ba/s]

#0:   0%|          | 0/511 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/511 [00:00<?, ?ba/s]

#1:   0%|          | 0/511 [00:00<?, ?ba/s]

It was perfectGot a gift card for my sister at Christmas!  Perfect for the person who has everything, right?  She likes to read and if she doesn't want a book, she can purchase a large variety of other things-dvd's, fish line, electronics, etc.Excellent...I got this for my husband for Christmas so he could purchase some movies or music for himself. It was easy to purchase and print out last-minute on Christmas morning. It turned out he did the same for me, haha.Fast and perfect!!Good gift for my distant family, they enjoy to purchase thousands gifts in Amazon, the only bad is
love e-giftcards.. easy to send and received immediately.  I use them a lot.. for myself and others... good to know I can choose a gift quickly and have them received immediately.excellent!ThanksNext day delivery... what more can you ask for!  The gift card was purchased as a Christmas gift for someone who has a Kindle Fire HD.  It arrived as promised and the gift wrapped box was perfect!With my Son and his wife a

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

file ./data-tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz
 sort of digital Ex-lax to get his thing moving? Also I have to keep shutting it down and restarting it because like the Greek Civil Service it decides to take a coffee break and go away whenever it feels like it. The upside is that I am really enjoying watching this big spider spin its beautiful web across my keyboard because the time between my keystrokes can be measured in mosquito outbreaks. The positive aspects of this program? I hear the new features are great...I can't wait to use them...maybe one day before LR5 comes out. Until then I hope that I have many new knit sweaters
 to go to a spreadsheet I found on line for free.  Spreadsheet works fine and is user friendly. Deleted Quicken from my computer.TurboTax did the job this year as it has in the past.<br />I was easily able to import last year's data as well as download my tax documents from my payroll company and my brokerage.<br />This helped save me a lot o

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Shape of dataframe before splitting (145427, 15)
train split percentage 0.8
validation split percentage 0.1
test split percentage 0.1
validation holdout percentage 0.19999999999999996
test holdout percentage 0.5000000000000001
Shape of train dataframe (116341, 15)
Shape of validation dataframe (14542, 15)
Shape of test dataframe (14544, 15)
num_cpus 2


Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/59 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/59 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/8 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/8 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/5818 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/5817 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/728 [00:00<?, ?ba/s]

#1:   0%|          | 0/728 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/728 [00:00<?, ?ba/s]

#1:   0%|          | 0/728 [00:00<?, ?ba/s]

 easy and convenient.UPDATE: The game is great. Simcity probably had one of the worst game launches of all time, but they have bounded back and have made me a very happy customer. They are constantly releasing updates, fixes, and new content to make it enjoyable. And to make up for the terrible launch, us \\"early adopters\\" got a free game (an older one, but still nice) and access to an exclusive \\"theme park\\" in-game. If you've watched any videos about the game, then that is truly the experience you will have with this game now. The only issue is the
 up I could sleep.I am a huge fan of games such as Fable, Elder Scrolls, Diablo, Mass Effect, Dungeon Siege, Fallout, etc. This lesser-known game deserves a spot with the rest of those games.<br />You can find all kinds of reviews online for this game, so I won't go into too much detail - I just want to give this game 5 stars and tell you that if you enjoy rpg's, this is a must-have.<br /><br />The game has been criticized for being 

Creating parquet from Arrow format:   0%|          | 0/69 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Listing contents of ./data-gpt3
.ipynb_checkpoints
gpt3-validation
gpt3-train
gpt3-test
Listing contents of ./data-gpt3/gpt3-train
amazon_reviews_us_Gift_Card_v1_00.parquet
amazon_reviews_us_Digital_Video_Games_v1_00.parquet
amazon_reviews_us_Digital_Software_v1_00.parquet
Listing contents of ./data-gpt3/gpt3-validation
amazon_reviews_us_Gift_Card_v1_00.parquet
amazon_reviews_us_Digital_Video_Games_v1_00.parquet
amazon_reviews_us_Digital_Software_v1_00.parquet
Listing contents of ./data-gpt3/gpt3-test
amazon_reviews_us_Gift_Card_v1_00.parquet
amazon_reviews_us_Digital_Video_Games_v1_00.parquet
amazon_reviews_us_Digital_Software_v1_00.parquet


Then we apply it to all the splits in our `datasets` object, using `batched=True` and 4 processes to speed up the preprocessing. We won't need the `text` column afterward, so we discard it.

In [8]:
from datasets import Dataset

reloaded_dataset_train = Dataset.from_parquet('./data-gpt3/gpt3-train/*.parquet')
reloaded_dataset_validation = Dataset.from_parquet('./data-gpt3/gpt3-validation/*.parquet')
reloaded_dataset_test = Dataset.from_parquet('./data-gpt3/gpt3-test/*.parquet')

Using custom data configuration default-778b367b1c7dc7dd


Downloading and preparing dataset parquet/default to /root/.cache/huggingface/datasets/parquet/default-778b367b1c7dc7dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/default-778b367b1c7dc7dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Using custom data configuration default-0ddde35a3328bef9


Downloading and preparing dataset parquet/default to /root/.cache/huggingface/datasets/parquet/default-0ddde35a3328bef9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/default-0ddde35a3328bef9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Using custom data configuration default-be74b8e8585efa07


Downloading and preparing dataset parquet/default to /root/.cache/huggingface/datasets/parquet/default-be74b8e8585efa07/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/default-be74b8e8585efa07/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


In [9]:
reloaded_dataset_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 138117
})

In [10]:
reloaded_dataset_validation

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 17171
})

In [11]:
reloaded_dataset_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 17171
})

# Release Resources

In [12]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>