# This notebook was based on the [Getting Started with HuggingFace video](https://youtu.be/QEaBAZQCtwE?si=oWwmLrXaRD-NZwUG&t=272) and the official [HuggingFace NLP course](https://huggingface.co/learn/nlp-course/chapter2/2?fw=pt).

In [None]:
!pip install transformers datasets accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
import transformers
from transformers import pipeline

import pprint
# Shared pretty-printer (2-space indent) that later cells use to display pipeline results.
pp = pprint.PrettyPrinter(indent=2)


# Using it the easiest way possible: the Hugging Face `pipeline` function.

### [Pipeline Documentation](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.pipeline.task)

List of all the tasks it can do: https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.pipeline.task


In [None]:
# Candidate checkpoints; only the small GPT-2 model is enabled.
# model_name = 'meta-llama/Llama-2-7b-chat'
# model_name = 'NousResearch/Llama-2-7b-chat-hf'
# model_name = 'bigscience/bloom'
model_name = 'gpt2'

# Seed the random number generators so the sampled generations in the following
# cells can be reproduced when the notebook is re-run on the same setup.
transformers.set_seed(42)

# Text-generation pipeline placed on the device selected earlier.
generator = pipeline('text-generation', model=model_name, device=device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Sample a single continuation of the course prompt, capped at max_length=69.
result = generator("In CUNY Tech Prep's Data Science you will learn how to", max_length=69, num_return_sequences=1, do_sample=True)

pp.pprint(result)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[ { 'generated_text': "In CUNY Tech Prep's Data Science you will learn how to "
                      'create applications that utilize data analytics to make '
                      'the most of your data.\n'
                      '\n'
                      'The core of Data Science is the use of structured data, '
                      'a method that creates algorithms based on knowledge '
                      'about how your data is structured.\n'
                      '\n'
                      'Data Science Explores Machine Learning\n'
                      '\n'
                      'Learn from'}]


In [None]:
# Same settings as the previous generation cell, with a different prompt.
result = generator('Describe Rick from the TV show Rick and Morty.', max_length=69, num_return_sequences=1, do_sample=True)

pp.pprint(result)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[ { 'generated_text': 'Describe Rick from the TV show Rick and Morty. "Hello, '
                      'how are they?" (Rick: "I\'m trying").\n'
                      '\n'
                      '"Hello, how are they?" (Rick: "I\'m trying"). Give Rick '
                      'the nickname "Dad". (John: "Aunt Dad-bait").\n'
                      '\n'
                      '"Aunt Dad-b'}]


In [None]:
# Plain-sentence prompt: one generation, capped at max_length=100.
prompt = 'You are an expert surfer, describe what a good swell looks like to you.'
result = generator(prompt, max_length=100, num_return_sequences=1)
pp.pprint(result)

In [None]:
# Same request rewritten with "### Instructions" / "### Input" section markers.
prompt = '### Instructions:  You are an expert surfer with years of experience riding big waves.  ### Input: Describe what a good swell looks like to you?'
result = generator(prompt, max_length=300, num_return_sequences=1)
pp.pprint(result)

In [None]:
# Sectioned prompt again, now ending with a "### Response" slot for the model to fill.
prompt = '### Instructions:  You are an expert surfer with years of experience riding big waves.  ### Input: Describe what a good swell looks like to you.  ### Response: (your response here).'
result = generator(prompt, max_length=300, num_return_sequences=1)
pp.pprint(result)

In [None]:
# Optional swap to a smaller checkpoint (left disabled):
#   model_name = 'distilgpt2'
#   generator = pipeline('text-generation', model=model_name, device=device)

# Request two alternative continuations of the course prompt.
result = generator("In CUNY Tech Prep's Data Science you will learn how to", max_length=69, num_return_sequences=2)
pp.pprint(result)

# Request three alternative answers to the surfer prompt.
prompt = 'You are an expert surfer, describe what a good swell looks like to you.'
result = generator(prompt, max_length=69, num_return_sequences=3)
pp.pprint(result)

# WARNING: the Llama-2 7B checkpoints below crashed the author's computer, so they stay disabled.
#   model_name = 'meta-llama/Llama-2-7b-chat'
#   model_name = 'NousResearch/Llama-2-7b-chat-hf'
#   generator = pipeline('text-generation', model=model_name, device=device)



In [None]:
# Zero-shot classification pipeline.
# - The checkpoint is named explicitly (it is the task's default model) so the run does not
#   depend on whichever default a given transformers release ships.
# - `cache_dir` is a `from_pretrained` option, so it has to travel through `model_kwargs`;
#   passed directly to `pipeline(...)` it is not treated as a download option.
# - `device` matches the other pipelines in this notebook.
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=device,
    model_kwargs={'cache_dir': '/content/zero-shot-classifier/'},
)


In [None]:
# Score the sentence against four candidate topic labels.
result = classifier("This is a course about data science", candidate_labels=['education', 'sports', 'celebrity', 'politics'])
result

In [None]:
# Build a sentiment-analysis pipeline from an explicitly named checkpoint.
model_path = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment_classifier = pipeline(task='sentiment-analysis', model=model_path)

# Classify one sentence and pretty-print the returned result.
input_text = 'I love CUNY Tech Prep!'
result = sentiment_classifier(input_text)
pp.pprint(result)

In [None]:
# Reuse the zero-shot classifier with two sentiment-style candidate labels.
result = classifier("This is a course about data science", candidate_labels=['Positive emotion', 'Negative emotion'])
result

# Stepping out of the pipeline.  

## Step 1:  Tokenization
(what we used to call vectorization back in the old days).

[Let's look here first.](https://platform.openai.com/tokenizer)

Here are the docs for [tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer)

In [None]:
from transformers import AutoTokenizer

## FIRST LOAD YOUR TOKENIZER.  WE USED TO CALL THIS OUR 'VECTORIZER' IN THE OLD DAYS OF TF-IDF
# Note: this rebinds `model_name`, which an earlier cell set to the text-generation checkpoint.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [None]:
# Call the tokenizer directly on a raw string and display what it returns.
document = 'I love CUNY Tech Prep!'
res = tokenizer(document)
res

In [None]:
# Split the document into the tokenizer's string tokens.
tokens = tokenizer.tokenize(document)
tokens

In [None]:
# Map each token string to its integer id.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

In [None]:
# Tokenize a slangy, misspelled sentence and show both the tokens and their ids.
input_text = 'We be loving evvverything yo!'
tokens = tokenizer.tokenize(input_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(tokens)
print(token_ids)

In [None]:
# Tokenize a longer sentence and show both the tokens and their ids.
input_text = "I'm leaving the United States for island living life."
tokens = tokenizer.tokenize(input_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(tokens)
print(token_ids)

In [None]:
# Turn words into numbers. The ids returned here are not the values the model does
# its math with; they are a map to the numbers it will do the math with.
input_text = 'This class is dope'
inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
pp.pprint(inputs)

# Same encoding call on a multi-line lyric.
input_text = """
She say, "Do you love me?" I tell her, "Only partly
I only love my bed and my mama, I'm sorry"
"""
inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
pp.pprint(inputs)


In [None]:
# Encode several sentences at once; padding=True lets them share one tensor.
raw_inputs = ["CUNY Tech Prep's data science course is the best college course I've ever taken.", 'I think the instructor sucks!', "The TA's are great!"]

inputs = tokenizer(raw_inputs, padding=True, return_tensors="pt")
pp.pprint(inputs)

# Step 2: Put documents into model.
Basically, the pipeline is a combination of functions.
These are those functions.  Now you can customize them.

In [None]:
from transformers import AutoModelForSequenceClassification

# Two example documents to classify.
input_texts = ['I love CUNY Tech Prep!', 'I hate dogs and cats. They are terrible.']

# Tokenize both documents as one padded batch of PyTorch tensors.
inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

# Remember: the model has to come from the same checkpoint you tokenized with.
print(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name)

# This is inference only, so skip autograd bookkeeping: no graph is built for the
# forward pass, which saves memory and time.
with torch.no_grad():
    outputs = model(**inputs)
outputs

In [None]:
# Pretty-print the whole model output object.
pp.pprint(outputs)

In [None]:
# Pretty-print only the logits attribute of the model output.
pp.pprint(outputs.logits)

In [None]:
import torch

# Normalise the logits with a softmax over the last axis.
predictions = outputs.logits.softmax(dim=-1)
print(predictions)

In [None]:
import numpy as np  # import left in place; the vectorised version below does not need it

# Index of the largest value in each row of `predictions`, taken in a single tensor
# operation and returned as a plain list of Python ints.
pred_label = predictions.argmax(dim=-1).tolist()

pred_label

# Datasets

In [None]:
from datasets import load_dataset, Dataset

In [None]:
# Load the dataset registered under the name 'emotion'.
hf_dataset = load_dataset('emotion')

In [None]:
# Display the loaded dataset object.
hf_dataset

In [None]:
# Display only the 'test' entry of the dataset.
hf_dataset['test']

In [None]:
# Text of the first test example. Index the row first and then the column, so only
# that one row is read instead of loading the whole 'text' column.
hf_dataset['test'][0]['text']

In [None]:
# Convert the 'test' entry to a pandas DataFrame and preview its first rows.
df_test = hf_dataset['test'].to_pandas()
df_test.head()

In [None]:
# Display the dataset object again.
hf_dataset

In [None]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer and the classification model.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer, then the model, from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "text" entry of `batch` with padding and truncation enabled."""
    encoded = tokenizer(batch["text"], padding=True, truncation=True)
    return encoded


In [None]:
# TOKENIZE YOUR ENTIRE DATASET IN ONE STEP
tokenized_dataset = hf_dataset.map(tokenize)
tokenized_dataset

In [None]:
import pandas as pd

# Tweets CSV hosted in the CUNY Tech Prep course repository.
file_path = "https://raw.githubusercontent.com/CUNYTechPrep/2022-fall-data-science/main/Week-07-NLP/data/dem-vs-rep-tweets.csv"

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv(file_path)
df.head()

In [None]:
# Wrap the pandas DataFrame in a `Dataset` object and display it.
my_dataset = Dataset.from_pandas(df)
my_dataset

In [None]:
# Display the dataset object again.
my_dataset

In [None]:
## WHY DOES THIS BREAK???
# NOTE(review): presumably because this DataFrame-backed dataset has no "text" column
# for `tokenize` to read; confirm against the columns shown for `my_dataset`.

def tokenize(batch):
    """Tokenize the "text" entry of `batch` with padding and truncation enabled."""
    encoded = tokenizer(batch["text"], padding=True, truncation=True)
    return encoded

my_tokenized_dataset = my_dataset.map(tokenize)

In [None]:
# Split with the default settings; the result is only displayed here, not stored in a variable.
my_dataset.train_test_split()

In [None]:
# Split the rows by the 'Party' column; .copy() makes each subset an independent frame.
dems = df.loc[df['Party'].eq('Democrat')].copy()
reps = df.loc[df['Party'].eq('Republican')].copy()
reps.head(1)

# Using different models via the Hugging Face [Model Hub](https://huggingface.co/models)
[@11:36 mark in video](https://youtu.be/QEaBAZQCtwE?si=oquQTAVK-tR4KAmk&t=695)

Let's use a Facebook model to summarize text.
* https://huggingface.co/facebook/bart-large-cnn

In [None]:
# Checkpoint id for summarization. Note that `model` now holds a string, whereas
# earlier cells bound the same name to a loaded model object.
model = 'facebook/bart-large-cnn'
summarizer = pipeline(task='summarization', model=model, device=device)

In [None]:
# via https://abcnews.go.com/Technology/wireStory/oppenheimer-fanfare-fuel-record-attendance-new-mexicos-trinity-104191894
document = """
WHITE SANDS MISSILE RANGE, N.M. -- Thousands of visitors are expected to descend Saturday on the southern New Mexico site where the world's first atomic bomb was detonated, with officials preparing for a record turnout amid ongoing fanfare surrounding Christopher Nolan's blockbuster film, “ Oppenheimer.”
Trinity Site, a designated National Historic Landmark, is usually closed to the public because of its proximity to the impact zone for missiles fired at White Sands Missile Range. But twice a year, in April and October, the site opens to spectators.
This may be the first time gaining entry will be like getting a golden ticket to Willy Wonka's chocolate factory.
White Sands officials warned online that the wait to enter the gates could be as long as two hours. No more than 5,000 visitors are expected to make it within the window between 8 a.m. and 2 p.m.
Visitors also are being warned to come prepared as Trinity Site is in a remote area with limited Wi-Fi and no cell service or restrooms.
“Oppenheimer,” the retelling of the work of J. Robert Oppenheimer and the top-secret Manhattan Project during World War II, was a summer box office smash. Scientists and military officials established a secret city in Los Alamos during the 1940s and tested their work at the Trinity Site some 200 miles (322 kilometers) away.
Part of the film's success was due to the “Barbenheimer” phenomenon in which filmgoers made a double feature outing of the “Barbie” movie and “Oppenheimer."
While the lore surrounding the atomic bomb has become pop culture fodder, it was part of a painful reality for residents who lived downwind of Trinity Site. The Tularosa Basin Downwinders plan to protest outside the gates to remind visitors about a side of history they say the movie failed to acknowledge.
The group says the U.S. government never warned residents about the testing. Radioactive ash contaminated soil and water. Rates of infant mortality, cancer and other illnesses increased. There are younger generations dealing with health issues now, advocates say.
The Tularosa Basin Downwinders Consortium has worked with the Union of Concerned Scientists and others for years to bring attention to the Manhattan Project's impact. A new documentary by filmmaker Lois Lipman, “First We Bombed New Mexico,” made its world premiere Friday at the Santa Fe International Film Festival.
The notoriety from “Oppenheimer” has been embraced in Los Alamos, more than 200 miles (321 kilometers) north of the Tularosa Basin. About 200 locals, many of them Los Alamos National Laboratory employees, were extras in the film, and the city hosted an Oppenheimer Festival in July.
"""

# Summarize the article with length bounds of 20 to 100 and sampling disabled.
result = summarizer(document, max_length=100, min_length=20, do_sample=False)


In [None]:
# Display the summarizer's output.
result

In [None]:
# Rebuild the summarizer without the `device` argument that the earlier summarizer cell passed.
summarizer = pipeline('summarization', model=model)

In [None]:
# Summarize the same document again with the rebuilt pipeline and the same settings.
result = summarizer(document, max_length=100, min_length=20, do_sample=False)