In [2]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
os.chdir("/content/drive/MyDrive/Openclassrooms/AI_P7")
os.getcwd()

'/content/drive/MyDrive/Openclassrooms/AI_P7'

In [5]:
import time
import pathlib 

import pandas as pd

from sklearn.model_selection import train_test_split

from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import Dataset, load_dataset

import tensorflow as tf

random_seed = 0

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [7]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment" # 5 labels
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest" # 3 labels mais fait pour
model_name = "distilbert-base-uncased-finetuned-sst-2-english" # 2 labels
# finiteautomata/bertweet-base-sentiment-analysis ??
# "distilbert-base-uncased" --> fine-tuning

Un **transformer** est un modèle qui utilise l'attention pour augmenter la vitesse à laquelle il peut être entrainé.

# 1 - pipeline

In [8]:
sentiment_pipeline = pipeline("sentiment-analysis", top_k=10, model=model_name)

In [9]:
data = ["I really love it", "I really hate it"]

In [10]:
sentiment_pipeline(data)

[[{'label': 'POSITIVE', 'score': 0.9998788833618164},
  {'label': 'NEGATIVE', 'score': 0.00012104465713491663}],
 [{'label': 'NEGATIVE', 'score': 0.9996116757392883},
  {'label': 'POSITIVE', 'score': 0.0003883192257490009}]]

# 2 - AutoModel & AutoTokenizer

In [11]:
tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [12]:
data = tokenizer(["I really hate it", "I really love it"], 
          truncation=True, padding=True, return_tensors="tf")

In [13]:
data

{'input_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 101, 1045, 2428, 5223, 2009,  102],
       [ 101, 1045, 2428, 2293, 2009,  102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [14]:
tf_outputs = tf_model(data)
tf_outputs

TFSequenceClassifierOutput([('logits',
                             <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
                             array([[ 4.3365097, -3.5167837],
                                    [-4.3277135,  4.6915174]], dtype=float32)>)])

In [15]:
tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
tf_predictions

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[9.9961168e-01, 3.8831957e-04],
       [1.2104454e-04, 9.9987888e-01]], dtype=float32)>

# 3 - Dataset

### From CSV 1

In [16]:
dataset1 = load_dataset(
    'data',
    'csv',
    split='train',
    data_files={'data_nlp_1563108.csv'},
    column_names=['text', 'target'],
)

Using custom data configuration data-4c73a3af64a95650
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/data-4c73a3af64a95650/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [17]:
dataset1

Dataset({
    features: ['text', 'target', '__index_level_0__', '__index_level_1__', '__index_level_2__', '__index_level_3__', '__index_level_4__', '__index_level_5__', '__index_level_6__'],
    num_rows: 1452792
})

In [18]:
dsplit1 = dataset1.train_test_split(test_size=0.1)
dsplit1

DatasetDict({
    train: Dataset({
        features: ['text', 'target', '__index_level_0__', '__index_level_1__', '__index_level_2__', '__index_level_3__', '__index_level_4__', '__index_level_5__', '__index_level_6__'],
        num_rows: 1307512
    })
    test: Dataset({
        features: ['text', 'target', '__index_level_0__', '__index_level_1__', '__index_level_2__', '__index_level_3__', '__index_level_4__', '__index_level_5__', '__index_level_6__'],
        num_rows: 145280
    })
})

In [19]:
dataset1[1]

{'__index_level_0__': '0',
 '__index_level_1__': "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
 '__index_level_2__': "$URL$ - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",
 '__index_level_3__': "$ url$ - awww , that 's a bummer . you shoulda got david carr of third day to do it . ; d",
 '__index_level_4__': 'bummer shoulda got day d',
 '__index_level_5__': 'bummer shoulda got day d',
 '__index_level_6__': 'bummer shoulda get day d',
 'target': '$ url$ - awww , that be a bummer . you shoulda get david carr of third day to do it . ; d',
 'text': 'bummer shoulda get day d'}

In [20]:
dsplit1['test'][0]

{'__index_level_0__': '1',
 '__index_level_1__': 'BBQ in the garden tonite ',
 '__index_level_2__': 'BBQ in the garden tonite',
 '__index_level_3__': 'bbq in the garden tonite',
 '__index_level_4__': 'garden tonite',
 '__index_level_5__': 'garden tonite',
 '__index_level_6__': 'garden tonite',
 'target': 'bbq in the garden tonite',
 'text': 'garden tonite'}

### From CSV 2

In [21]:
dataset2 = Dataset.from_csv('data/data_nlp_1563108.csv')

Using custom data configuration default-5dedf3ad74896a58
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-5dedf3ad74896a58/0.0.0)


In [22]:
dataset2

Dataset({
    features: ['target', 'text', 'text_clean', 'tokens', 'tokens_filtered_advanced', 'tokens_filtered_simple', 'lemmas_filtered_advanced', 'lemmas_filtered_simple', 'lemmas_not_filtered'],
    num_rows: 1452791
})

In [23]:
dsplit2 = dataset2.train_test_split(test_size=0.1)
dsplit2

DatasetDict({
    train: Dataset({
        features: ['target', 'text', 'text_clean', 'tokens', 'tokens_filtered_advanced', 'tokens_filtered_simple', 'lemmas_filtered_advanced', 'lemmas_filtered_simple', 'lemmas_not_filtered'],
        num_rows: 1307511
    })
    test: Dataset({
        features: ['target', 'text', 'text_clean', 'tokens', 'tokens_filtered_advanced', 'tokens_filtered_simple', 'lemmas_filtered_advanced', 'lemmas_filtered_simple', 'lemmas_not_filtered'],
        num_rows: 145280
    })
})

In [24]:
dataset2[1]

{'lemmas_filtered_advanced': 'upset update facebook texte cry result school today',
 'lemmas_filtered_simple': 'upset update facebook texte cry result school today',
 'lemmas_not_filtered': 'be upset that he can not update his facebook by texte it ... and might cry as a result school today also . blah !',
 'target': 0,
 'text': "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
 'text_clean': "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!",
 'tokens': "is upset that he ca n't update his facebook by texting it ... and might cry as a result school today also . blah !",
 'tokens_filtered_advanced': 'upset update facebook texting cry result school today',
 'tokens_filtered_simple': 'upset update facebook texting cry result school today'}

In [25]:
dsplit2['test'][0]

{'lemmas_filtered_advanced': 'wake clock say 13:37',
 'lemmas_filtered_simple': 'wake clock say 13:37',
 'lemmas_not_filtered': 'wake up and the clock say 13:37 .',
 'target': 1,
 'text': 'Woke up and the clock said 13:37. ',
 'text_clean': 'Woke up and the clock said 13:37.',
 'tokens': 'woke up and the clock said 13:37 .',
 'tokens_filtered_advanced': 'woke clock said 13:37',
 'tokens_filtered_simple': 'woke clock said 13:37'}

### From Pandas dataframe

In [26]:
data_work = pd.read_csv(
    pathlib.Path(pathlib.Path().absolute(), 'data', 'data_nlp_1563108.csv'), 
    usecols=['target', 'text'],
    encoding='ISO-8859-1',
    #nrows=100000,
)
data_work.rename(columns={'text':'text', 'target':'label'}, inplace=True)
display(data_work.head(2), data_work.shape)

# Select samples
sample_size = 100000
data_work = data_work.groupby('label', group_keys=False).apply(lambda x: x.sample(sample_size//2, random_state=random_seed))
data_work.set_index('label', inplace=True, drop=True)
display(data_work.head(2), data_work.shape)

Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...


(1452791, 2)

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,I dont have a background for my twitter stream...
0,I did many things but missed tweeting


(100000, 1)

In [27]:
dataset3 = Dataset.from_pandas(data_work)

In [28]:
dataset3

Dataset({
    features: ['text', 'label'],
    num_rows: 100000
})

In [29]:
dsplit3 = dataset3.train_test_split(test_size=0.1)
dsplit3

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})

In [30]:
dataset3[1]

{'label': 0, 'text': 'I did many things but missed tweeting '}

In [31]:
dsplit3['test'][0]

{'label': 0,
 'text': '@chantalclaret it doesnt make me laugh either,my friend hacked my twitter im sorry '}

---
---

In [32]:
dataset3[0]

{'label': 0,
 'text': 'I dont have a background for my twitter stream...does anyone ever check those things?? and if they do...what should mine be? help '}

In [33]:
dsplit3['test'][0]

{'label': 0,
 'text': '@chantalclaret it doesnt make me laugh either,my friend hacked my twitter im sorry '}

In [34]:
dsplit3['test'][0]['text']

'@chantalclaret it doesnt make me laugh either,my friend hacked my twitter im sorry '

### Un test avec **quelques textes** 

#### avec le pipeline

In [35]:
test_txt = ["I hate this stuff", "I really love it"]

In [36]:
pp_outputs = sentiment_pipeline(test_txt)
pp_outputs

[[{'label': 'NEGATIVE', 'score': 0.999596893787384},
  {'label': 'POSITIVE', 'score': 0.00040317379171028733}],
 [{'label': 'POSITIVE', 'score': 0.9998788833618164},
  {'label': 'NEGATIVE', 'score': 0.00012104465713491663}]]

In [37]:
y_preds_proba = [[y['score'] for y in x if y['label'] == "POSITIVE"][0] for x in pp_outputs]
y_preds_proba

[0.00040317379171028733, 0.9998788833618164]

#### avec les Auto

In [38]:
test_txt = ["I hate this stuff", "I really love it"]
test_tokens = tokenizer(test_txt, truncation=True, padding="max_length", max_length=65, return_tensors="tf")

In [39]:
tf_outputs = tf_model(test_tokens)
tf_outputs

TFSequenceClassifierOutput([('logits',
                             <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
                             array([[ 4.3156366, -3.500103 ],
                                    [-4.3277135,  4.691517 ]], dtype=float32)>)])

In [40]:
tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
tf_predictions

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[9.9959689e-01, 4.0317379e-04],
       [1.2104466e-04, 9.9987888e-01]], dtype=float32)>

In [41]:
y_preds_proba = tf_predictions.numpy()
y_preds_proba = [x[1] for x in y_preds_proba]
y_preds_proba

[0.0004031738, 0.9998789]

### Un test avec **de nombreux textes**

#### avec le pipeline

In [42]:
test_txt = data_work.text[:100].to_list()
test_txt[:5]

['I dont have a background for my twitter stream...does anyone ever check those things?? and if they do...what should mine be? help ',
 'I did many things but missed tweeting ',
 '@JoAnneJoyM most probably - eww all that pasty white skin on show. ',
 'Ohh, how bad I sleep ',
 '@LornA_AlicE hell yeah u were texting me i remember, i was at my nans n u told me tragic news! boo hoo! n u went to amsterdam without me! ']

In [43]:
pp_outputs = sentiment_pipeline(test_txt)
pp_outputs[:5]

[[{'label': 'NEGATIVE', 'score': 0.9994959831237793},
  {'label': 'POSITIVE', 'score': 0.0005040322430431843}],
 [{'label': 'NEGATIVE', 'score': 0.9947372078895569},
  {'label': 'POSITIVE', 'score': 0.0052627259865403175}],
 [{'label': 'NEGATIVE', 'score': 0.9963040351867676},
  {'label': 'POSITIVE', 'score': 0.0036959876306355}],
 [{'label': 'NEGATIVE', 'score': 0.9995403289794922},
  {'label': 'POSITIVE', 'score': 0.00045966787729412317}],
 [{'label': 'NEGATIVE', 'score': 0.9946225881576538},
  {'label': 'POSITIVE', 'score': 0.0053774346597492695}]]

In [44]:
y_preds_proba = [[y['score'] for y in x if y['label'] == "POSITIVE"][0] for x in pp_outputs]
y_preds_proba[:5]

[0.0005040322430431843,
 0.0052627259865403175,
 0.0036959876306355,
 0.00045966787729412317,
 0.0053774346597492695]

#### avec les Auto

In [45]:
# OK
test_txt = data_work.text[:100].to_list()
test_tokens = tokenizer(test_txt, truncation=True, padding="max_length", max_length=65, return_tensors="tf")
test_tokens[:1]

[Encoding(num_tokens=65, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [46]:
test_tokens

{'input_ids': <tf.Tensor: shape=(100, 65), dtype=int32, numpy=
array([[  101,  1045,  2123, ...,     0,     0,     0],
       [  101,  1045,  2106, ...,     0,     0,     0],
       [  101,  1030, 23459, ...,     0,     0,     0],
       ...,
       [  101,  1045,  8415, ...,     0,     0,     0],
       [  101,  6295,  2035, ...,     0,     0,     0],
       [  101, 19752,  2664, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(100, 65), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [47]:
# OK
sample_size = 10000
def preprocess_function(x):
    return tokenizer(x['text'], truncation=True, padding="max_length", max_length=65, return_tensors="tf")
test_tokens = preprocess_function(dsplit3['test'][:sample_size])
test_tokens

{'input_ids': <tf.Tensor: shape=(10000, 65), dtype=int32, numpy=
array([[  101,  1030, 16883, ...,     0,     0,     0],
       [  101, 12802,  1997, ...,     0,     0,     0],
       [  101,  2012,  2277, ...,     0,     0,     0],
       ...,
       [  101,  1008,  8038, ...,     0,     0,     0],
       [  101,  1030,  4678, ...,     0,     0,     0],
       [  101,  2003,  1030, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(10000, 65), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [120]:
def preprocess_function(x):
    return tokenizer(x['text'], truncation=True, padding="max_length", max_length=65, return_tensors="tf")

def batch_inference(data, model, tokenizer_func, sample_size=None, step_size=1000, verbose=1):

  #out = {'logits':tf.zeros([0, data.shape[1]])}
  predictions_batches = []
  
  max_index = data.shape[0] if sample_size is None else min(sample_size, data.shape[0])
  print("max_index", max_index)
  for i in range(0, max_index, step_size):
    max_step = min(max_index - i, step_size)

    tokens = tokenizer_func(data[i:i+max_step])
    preds = model(tokens)
    predictions_batches.append(preds['logits'])

    #out['logits'] = tf.concat(axis=0, values = [tf.cast(out['logits'], tf.float32), preds['logits']])
    if verbose:
      print(i, i+max_step)

  # out2 = tf.concat(predictions_batches, axis=1)
  out2 = "Blop"
  return {'logits':out2}

In [121]:
# 10000
t0 = time.perf_counter()
tf_outputs = batch_inference(dsplit3["test"], tf_model, preprocess_function, sample_size=20000, step_size=2000, verbose=1)
#tf_outputs
print(f"Inference time: {(time.perf_counter() - t0):.2f}s")

max_index 10000
0 2000
2000 4000
4000 6000
6000 8000
8000 10000
Inference time: 18.66s


In [72]:
tf_outputs

{'logits': <tf.Tensor: shape=(2000, 2), dtype=float32, numpy=
 array([[ 4.04228   , -3.3074424 ],
        [-1.752534  ,  1.9127328 ],
        [ 1.6398104 , -1.4212432 ],
        ...,
        [ 1.8359622 , -1.5229459 ],
        [-0.86904997,  1.0118532 ],
        [-0.34156895,  0.5226345 ]], dtype=float32)>}

In [73]:
xxx

{'logits': <tf.Tensor: shape=(2000, 2), dtype=float32, numpy=
 array([[ 4.04228   , -3.3074424 ],
        [-1.752534  ,  1.9127328 ],
        [ 1.6398104 , -1.4212432 ],
        ...,
        [ 1.8359622 , -1.5229459 ],
        [-0.86904997,  1.0118532 ],
        [-0.34156895,  0.5226345 ]], dtype=float32)>}

In [55]:
# 1000
t0 = time.perf_counter()
tf_outputs = batch_inference(dsplit3["test"], tf_model, preprocess_function, sample_size=1000, step_size=1000, verbose=1)
#tf_outputs
print(f"Inference time: {(time.perf_counter() - t0):.2f}s")

max_index 1000
0 1000
Inference time: 0.17s


In [95]:
# 100
t0 = time.perf_counter()
tf_outputs = tf_model(test_tokens)
#tf_outputs
print(f"Inference time: {(time.perf_counter() - t0):.2f}s")

Inference time: 0.09s


In [79]:
tf_predictions = tf.nn.softmax(tf_outputs['logits'], axis=-1)
tf_predictions[:5]

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[9.9935764e-01, 6.4235757e-04],
       [2.4958465e-02, 9.7504151e-01],
       [9.5525736e-01, 4.4742644e-02],
       [7.0228521e-04, 9.9929774e-01],
       [8.1635207e-02, 9.1836482e-01]], dtype=float32)>

In [80]:
y_preds_proba = tf_predictions.numpy()
y_preds_proba = [x[1] for x in y_preds_proba]
print(y_preds_proba[:5])
print(len(y_preds_proba))


[0.00064235757, 0.9750415, 0.044742644, 0.99929774, 0.9183648]
2000


---
---

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
 
tokenized_test = dataset_df_split['train'].map(preprocess_function, batched=True)
tokenized_test

In [None]:
tokenized_test[0]

In [None]:
tf_outputs = tf_model(tokenized_test)
tf_outputs

In [None]:
def encode(x):
    #return tokenizer(str(x), truncation=True, padding="max_length", return_tensors="tf")
    return tokenizer(str(x), truncation=True, padding=True, return_tensors="tf")

In [None]:
data_test = dataset_df_split['test'].map(encode , batched=True)
data_test

In [None]:
# Create new index
train_idx = [i for i in range(len(X_train.index))]
test_idx = [i for i in range(len(X_test.index))]
val_idx = [i for i in range(len(X_valid.index))]

# Convert to numpy
x_train = X_train.values[train_idx]
x_test = X_test.values[test_idx]
x_val = X_valid.values[val_idx]

#y_train = y_train.values[train_idx]
#y_test = y_test.values[test_idx]
#y_val = y_valid.values[val_idx]

In [None]:
# Tokenize datasets
tr_tok = tokenizer(list(x_train), return_tensors='tf', truncation=True, padding=True, max_length=128)
val_tok = tokenizer(list(x_val), return_tensors='tf', truncation=True, padding=True, max_length=128)
test_tok = tokenizer(list(x_test), return_tensors='tf', truncation=True, padding=True, max_length=128)

In [None]:
y_preds_raw = model(test_tok)