In [1]:
x = 10

In [2]:
x = 15

In [3]:
y = x + 5

In [4]:
y

20

In [5]:
# "!" does shell commands
# !pip3 install pandas
# !pip3 install numpy
import numpy as np
import pandas as pd

In [6]:
df = pd.read_csv("movie.csv")
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [7]:
df["text"]

0        I grew up (b. 1965) watching and loving the Th...
1        When I put this movie in my DVD player, and sa...
2        Why do people who do not know what a particula...
3        Even though I have great interest in Biblical ...
4        Im a die hard Dads Army fan and nothing will e...
                               ...                        
39995    "Western Union" is something of a forgotten cl...
39996    This movie is an incredible piece of work. It ...
39997    My wife and I watched this movie because we pl...
39998    When I first watched Flatliners, I was amazed....
39999    Why would this film be so good, but only gross...
Name: text, Length: 40000, dtype: object

In [8]:
df.loc[0]

text     I grew up (b. 1965) watching and loving the Th...
label                                                    0
Name: 0, dtype: object

In [11]:
df.dropna(inplace = True)

In [12]:
df.fillna(1, inplace = True)

In [13]:
df.drop_duplicates(inplace = True)

In [18]:
df["label"].value_counts()

1    19908
0    19815
Name: label, dtype: int64

In [None]:
!pip3 install transformers
from transformers import pipeline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Basic Pipelining
generator = pipeline(task = "text-generation")

generator ("You're walking alone in the woods. There is no one around and your phone is dead. You stumble upon")

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "You're walking alone in the woods. There is no one around and your phone is dead. You stumble upon a man who looks as terrified as you. Then he turns in his bed and puts his arm around you. You walk to the door."}]

In [None]:
#Specify Model and Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
better_generator = pipeline(task = "text-generation", model = model, tokenizer = tokenizer)

In [None]:
better_generator ("You're walking alone in the woods. There is no one around and your phone is dead. You stumble upon")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'You\'re walking alone in the woods. There is no one around and your phone is dead. You stumble upon your couch, thinking: You\'re in a room on the couch. You scream and shout "It\'ll bring you back for this, it'}]

In [None]:
## Example of Hugging Face Datasets
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
## Fine Tuning
from sklearn.model_selection import train_test_split
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer, DistilBertTokenizerFast

In [None]:
# Load Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

#Select text and label columns
labels = df["label"]
texts = df["text"]

#Split into train and test datasets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=.2)

#Tokenize
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
import torch

#Store the data in a way that HuggingFace Recognizes
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


#Create the Datasets
train_dataset = IMDbDataset(train_encodings, list(train_labels))
test_dataset = IMDbDataset(test_encodings, list(test_labels))

In [None]:
!pip3 install evaluate
from evaluate import metric

In [None]:
#Load Model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

#Specify training metric
metric = evaluate.load("accuracy")

#Example hyperparameters for tuning
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

#Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification

35570    0
4844     1
28103    1
36226    0
9967     0
27039    0
38151    0
39989    1
14545    0
17413    0
Name: label, dtype: int64


In [None]:
trainer.train()

***** Running training *****
  Num examples = 31778
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5961


Step,Training Loss
10,0.6981
20,0.6951
30,0.6984
40,0.6916
50,0.6931
60,0.6887
70,0.6781
