In [1]:
!pip install torch
!pip install pytorch_lightning
!pip install wandb
!pip install pandas

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.2-py3-none-any.whl (776 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.10.0 pytorch_lightning-2.1.2 torchmetrics-1.2.0
Collecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  

In [2]:
import torch
from torch import nn, optim
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import wandb
from torch.utils.data import TensorDataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
import numpy as np

LEARNING_RATE = 1e-3
BATCH_SIZE = 64

In [17]:
tokenizer = get_tokenizer('spacy')
def yield_tokens(data):
    for text in data['text']:
        yield tokenizer(text)

topic = pd.read_pickle("topic.pkl")
topic['text'] = topic['text'].map(str)
topic.reset_index(inplace=True)

vocab = build_vocab_from_iterator(iterator=yield_tokens(topic), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])




In [5]:
len(topic['topic'])

111840

In [6]:
def gen_dataset(dataframe, classes, classname):
    # assign an index to each class
    dataframe['class'] = dataframe[classname].map({classes[idx]: idx for idx in range(len(classes))})
    dataframe['tokens'] = dataframe['text'].map(tokenizer)
    max_len = 280
    # add padding
    dataframe['tokens'] = dataframe['tokens'].map(lambda tokens: tokens + ["<pad>"] * (max_len - len(tokens)))
    dataframe['token_ids'] = dataframe['tokens'].map(vocab)

    x = np.zeros((len(dataframe), max_len))
    for i in range(len(dataframe)):
      x[i] = np.array(dataframe['token_ids'][i])

    return dataframe['class'].to_numpy(), x

def split_dataset(dataset, percent=0.7):
    train_amount = int(len(dataset) * percent) # approx percent% des donnees
    return dataset[:train_amount], dataset[train_amount:]

def yield_batches(x, y):
    for i in range(len(x)):
        yield (x[i], y[i])

In [7]:
class NLPModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dense = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        embedded = self.embedding(x)
        mask = (x != vocab["<pad>"])
        mask = mask.unsqueeze(-1)
        embedded = embedded * mask.float()
        embedded = embedded.mean(dim=2)
        return self.dense(embedded)

class Model(pl.LightningModule):
    def __init__(self, vocab_len, output_dim):
        super().__init__()
        self.model = NLPModel(vocab_len, 1000, 256, output_dim)
        self.loss = nn.CrossEntropyLoss()
        self.tests = 0
        self.correct = 0

    def training_step(self, batch):
        x, y = batch
        y_hat = self.model(x)
        y = y.squeeze(0)
        y = y.squeeze(-1)
        y_hat = y_hat.squeeze(0)
        loss = self.loss(y_hat, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        loss = self.loss(y_hat, y)
        self.log("val_loss", loss, prog_bar=True)

    def test_step(self, batch):
        x, y = batch
        y_hat = self.model(x)
        y = y.squeeze(0)
        y = y.squeeze(-1)
        y_hat = y_hat.squeeze(0)
        self.tests += BATCH_SIZE
        loss = self.loss(y_hat, y)
        self.log("test_loss", loss, prog_bar=True)
        self.correct += (y_hat.argmax(dim=-1) == y).float().sum()
        self.log("accuracy", self.correct/self.tests, prog_bar=True)

    def configure_optimizers(self, lr=LEARNING_RATE):
        return optim.Adam(self.parameters(), lr=lr)

In [18]:
TOPICS = topic['topic'].unique()
print(TOPICS)
print(topic['topic'][0:10])

# ["joy", "sadness", "fear", "anger", "surprise", "neutral", "shame", "disgust"]

data_y, data_x = gen_dataset(topic, TOPICS, "topic")
p = np.random.permutation(len(data_x))
data_x = data_x[p]
data_y = data_y[p]

['emotion' 'financial' 'politics' 'sport' 'health' 'science']
0      emotion
1    financial
2    financial
3    financial
4    financial
5     politics
6     politics
7     politics
8     politics
9        sport
Name: topic, dtype: object


In [19]:
train_x, test_x = split_dataset(data_x)
train_y, test_y = split_dataset(data_y)

train_x = train_x[0:BATCH_SIZE * (len(train_x) // BATCH_SIZE)]
train_y = train_y[0:BATCH_SIZE * (len(train_x) // BATCH_SIZE)]
test_x = test_x[0:BATCH_SIZE * (len(test_x) // BATCH_SIZE)]
test_y = test_y[0:BATCH_SIZE * (len(test_y) // BATCH_SIZE)]

In [20]:
train_x = train_x.reshape(len(train_x) // BATCH_SIZE, BATCH_SIZE, len(train_x[0]))
test_x = test_x.reshape(len(test_x) // BATCH_SIZE, BATCH_SIZE, len(test_x[0]))
train_y = train_y.reshape(len(train_y) // BATCH_SIZE, BATCH_SIZE, 1)
test_y = test_y.reshape(len(test_y) // BATCH_SIZE, BATCH_SIZE, 1)
train_x = torch.Tensor(train_x).long()
train_y = torch.Tensor(train_y).long()
test_x = torch.Tensor(test_x).long()
test_y = torch.Tensor(test_y).long()

train = TensorDataset(train_x, train_y)
test = TensorDataset(test_x, test_y)

#train = torch.Tensor([batch for batch in yield_batches(train_x, train_y)])
#test = torch.Tensor([batch for batch in yield_batches(test_x, test_y)])

In [22]:

wandb.login(key="68fded06a6651270206da4fc4c0f175085cadbd7")

run = wandb.init(
    project="twittos-emotion",
    config={
        "learning_rate": LEARNING_RATE,
        "epochs": 20,
    })
wandb_logger = WandbLogger()


topic_model = Model(len(vocab),len(TOPICS))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manonx3247[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [31]:
EPOCHS = 1
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    min_epochs=1,
    devices=1,
    accelerator="gpu",
    logger=wandb_logger
)

trainer.fit(
    model=topic_model,
    train_dataloaders=DataLoader(dataset=train, shuffle=True),
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type             | Params
-------------------------------------------
0 | model | NLPModel         | 130 M 
1 | loss  | CrossEntropyLoss | 0     
-------------------------------------------
130 M     Trainable params
0         Non-trainable params
130 M     Total params
520.770   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [36]:
trainer.test(dataloaders=DataLoader(dataset=test, shuffle=True), ckpt_path='./lightning_logs/w4a99ey1/checkpoints/epoch=0-step=1223.ckpt')

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at ./lightning_logs/w4a99ey1/checkpoints/epoch=0-step=1223.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at ./lightning_logs/w4a99ey1/checkpoints/epoch=0-step=1223.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.6881061792373657, 'accuracy': 0.8899899125099182}]

In [39]:
from google.colab import files

files.download('./lightning_logs/w4a99ey1/checkpoints/epoch=4-step=6115.ckpt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>