In [1]:
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from transformers import RobertaTokenizer, DataCollatorWithPadding

from dataset import sentenceClsData
from recipes import Roberta
import torch
import numpy as np
import pandas as pd

In [2]:
# Report how many CUDA devices PyTorch can see in this session.
torch.cuda.device_count() 

1

In [3]:
def read_csv(x):
    """Load a CSV that has no header row.

    ``x`` is forwarded unchanged to ``pd.read_csv`` with ``header=None``, so the
    first line is treated as data and the columns get integer labels.
    """
    return pd.read_csv(x, header=None)

In [4]:
def linksNamesFilter(x):
    """Decide whether a whitespace-delimited token should be kept.

    Returns False for tokens that contain '@' anywhere (user mentions) or an
    'http://' / 'https://' scheme (links), and True for everything else.
    """
    # 'https://' must be tested explicitly: the substring 'http://' does not
    # occur inside an https URL, so checking only 'http://' lets them through.
    return not ('@' in x or 'http://' in x or 'https://' in x)


def clean(s: str) -> str:
    """Normalise one post before tokenisation.

    The text is lower-cased and split on whitespace, tokens rejected by
    ``linksNamesFilter`` are dropped, the rest are re-joined with single
    spaces, and every '#' character is removed (the hashtag word itself is
    kept, only the symbol is stripped).
    """
    words = [word for word in s.lower().split() if linksNamesFilter(word)]
    return ' '.join(words).replace('#', '')
    

In [5]:
# Locations of the pre-filtered train / dev / test CSV splits.
train_pth = '/projectnb/cs640g/students/pranchan/covid_instagram_sentiment/data/train_filtered.csv'
val_pth = '/projectnb/cs640g/students/pranchan/covid_instagram_sentiment/data/dev_filtered.csv'
test_pth = '/projectnb/cs640g/students/pranchan/covid_instagram_sentiment/data/test_filtered.csv'

# Read the three splits in order and discard every row that has a missing value.
train, val, test = (
    read_csv(split_pth).dropna() for split_pth in (train_pth, val_pth, test_pth)
)

# Column 1 is run through `clean` and becomes the list of input texts;
# column 0 is kept as-is and used as the label column.
# (A variant that read the test text from column 2 was left disabled by the author.)
X_train = train[1].apply(clean).tolist()
X_val = val[1].apply(clean).tolist()
X_test = test[1].apply(clean).tolist()
y_train, y_val, y_test = train[0], val[0], test[0]


In [6]:
test.head()

Unnamed: 0,0,1
0,anger,This game has pissed me off more than any othe...
1,anger,@moocowward @mrsajhargreaves @Melly77 @GaryBar...
2,anger,@moocowward @mrsajhargreaves @Melly77 @GaryBar...
3,anger,@virginmedia I've been disconnected whilst on ...
4,anger,@eMilsOnWheels I'm furious 😩😩😩


In [7]:
# Label name -> integer class id, numbered in the order the names are listed.
cmap = {label: class_id for class_id, label in enumerate(('anger', 'fear', 'joy', 'sadness'))}

In [8]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)

# Tokenise every split with truncation enabled (no padding argument is passed
# at this stage).
token_train, token_val, token_test = (
    tokenizer(texts, truncation=True) for texts in (X_train, X_val, X_test)
)

# Attach integer class ids by looking each label string up in `cmap`.
# (A variant that filled the test labels with zeros was left disabled by the author.)
token_train['labels'] = [cmap[name] for name in y_train]
token_val['labels'] = [cmap[name] for name in y_val]
token_test['labels'] = [cmap[name] for name in y_test]

# Wrap each tokenised split in the project's dataset class.
train_data = sentenceClsData(token_train)
val_data = sentenceClsData(token_val)
test_data = sentenceClsData(token_test)


In [9]:
cmap

{'anger': 0, 'fear': 1, 'joy': 2, 'sadness': 3}

In [10]:
# Collator built from the tokenizer; every loader uses it as its collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Keyword arguments shared by all three loaders.
_loader_kwargs = dict(collate_fn=data_collator, num_workers=4)

# Training loader: shuffled, 4 examples per batch.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=4, **_loader_kwargs)

# Validation and test loaders: no shuffle argument, 16 examples per batch.
eval_dataloader = DataLoader(val_data, batch_size=16, **_loader_kwargs)
test_dataloader = DataLoader(test_data, batch_size=16, **_loader_kwargs)

In [None]:
# Build the project's Roberta module from the checkpoint name and both loaders.
# NOTE(review): the literal 4 is presumably the number of classes (it equals
# len(cmap)) — confirm against recipes.Roberta.
model = Roberta(checkpoint, 4, train_dataloader, eval_dataloader)

# TensorBoard logger created with "lightning_logs/" and the run name "collated".
tb_logger = pl.loggers.TensorBoardLogger("lightning_logs/", name="collated")

# Train for at most 5 epochs on one GPU, refreshing the progress bar every 10 steps.
trainer = pl.Trainer(
    max_epochs=5,
    gpus=1,
    progress_bar_refresh_rate=10,
    logger=tb_logger,
)
trainer.fit(model)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Flat Python lists of predicted class ids and of the labels carried by each batch.
preds, labels = [], []

# Gradients are not needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in test_dataloader:
        # Move every tensor of the batch onto the selected device.
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        outputs = model(**batch)

        # Highest-scoring class per row of the logits.
        # (The author left a disabled variant that applied softmax and stored
        # the per-batch score arrays instead of the argmax ids.)
        batch_preds = torch.argmax(outputs.logits, dim=1)

        preds.extend(batch_preds.cpu().tolist())
        labels.extend(batch['labels'].tolist())

In [31]:
# Turn `labels` and `preds` into NumPy arrays (labels first, then preds).
labels, preds = np.array(labels), np.array(preds)

In [32]:
from sklearn.metrics import classification_report

In [33]:
# Split cmap into two parallel tuples: `n` holds the class names (keys) and
# `l` the matching integer ids (values), both in cmap's insertion order.
n, l = zip(*cmap.items())

In [34]:
# Per-class precision / recall / F1 report: `l` supplies the label ids to
# include and `n` the display names in the same order.
print(classification_report(labels, preds, labels=l, target_names=n))

              precision    recall  f1-score   support

       anger       0.90      0.85      0.87       380
        fear       0.83      0.88      0.85       504
         joy       0.95      0.98      0.96       377
     sadness       0.85      0.81      0.83       349

    accuracy                           0.88      1610
   macro avg       0.88      0.88      0.88      1610
weighted avg       0.88      0.88      0.88      1610



In [19]:
preds

array([0, 0, 0, ..., 3, 3, 0])

In [20]:
labels

array([0, 0, 0, ..., 0, 0, 0])

In [46]:
# Scratch tensor: 4 x 6 values drawn by torch.rand.
a = torch.rand(4, 6)

In [47]:
# Scratch check: request a CUDA copy of `a`. The result is only displayed,
# `a` itself is not rebound.
a.to("cuda")

tensor([[0.1832, 0.1197, 0.8428, 0.7713, 0.4780, 0.1263],
        [0.0243, 0.9207, 0.0852, 0.2365, 0.4308, 0.2383],
        [0.9016, 0.2516, 0.5143, 0.5447, 0.9926, 0.0947],
        [0.8774, 0.1750, 0.6554, 0.2305, 0.0318, 0.8707]], device='cuda:0')

In [48]:
np.unique(preds)

array([0, 1, 2, 3])

In [14]:
cmap

{'anger': 0, 'fear': 1, 'joy': 2, 'sadness': 3}

In [37]:
test

Unnamed: 0,0,1,2,preds
0,en,#covid #covid2020 #covidvirus #virus #coronava...,#covid #covid2020 #covidvirus #virus #coronava...,1
1,en,Well this is the final mural of my trip in Aus...,Well this is the final mural of my trip in Aus...,3
2,pt,Chegamos !!! Vão seguindo o movimento... Tem m...,We have arrived!!! Keep following the movement...,2
3,en,😻😻😻😻😻,😻😻😻😻😻,2
4,es,EN MI DOMICILIO 🏡 #quedateencasa 📲 0414-464.18...,AT MY HOME 🏡 #quedateencasa 📲 0414-464.18.89. ...,1
...,...,...,...,...
9643,en,#water #foryou #followforfollowback #photograp...,#water #foryou #followforfollowback #photograp...,1
9644,en,#like4likes #20likes #tagforlikes #instalikes ...,#like4likes #20likes #tagforlikes #instalikes ...,1
9645,fr,🙈🥰😍👉🏽 @love_serie_karma #daancorona #daancoron...,🙈🥰😍👉🏽 @love_serie_karma #daancorona #daancoron...,1
9646,en,💥🚨 𝗙𝗔𝗟𝗟 𝗜𝗦 𝗖𝗢𝗠𝗜𝗡𝗚 🚨💥⁣ ⁣ 𝘼𝙧𝙚 𝙮𝙤𝙪 𝙧𝙚𝙖𝙙𝙮 𝙛𝙤𝙧 𝙖 𝙣𝙚...,💥🚨 𝗙𝗔𝗟𝗟 𝗜𝗦 𝗖𝗢𝗠𝗜𝗡𝗚 🚨💥⁣ ⁣ 𝘼𝙧𝙚 𝙮𝙤𝙪 𝙧𝙚𝙖𝙙𝙮 𝙛𝙤𝙧 𝙖 𝙣𝙚...,1


In [38]:
# Add the predicted class ids as a 'preds' column, using test's own index.
# NOTE(review): this mutates `test` in place. The frame displayed around this
# cell (columns 0, 1, 2 with language codes) does not look like the one loaded
# from test_pth — presumably `test` was re-read from another file in a cell
# that is no longer in the notebook; confirm before re-running.
test['preds'] = pd.Series(preds, index=test.index)

In [39]:
# Inverse lookup: class id (stored as a float key) -> label name.
# Integer lookups such as imap[0] still hit, because equal ints and floats
# compare and hash alike.
imap = {float(class_id): label for label, class_id in cmap.items()}

In [40]:
imap

{0.0: 'anger', 1.0: 'fear', 2.0: 'joy', 3.0: 'sadness'}

In [41]:
# Store the predicted label names in the 'preds' column.
# The column is built from the `preds` array instead of from test['preds']
# itself so that re-running this cell is safe: mapping already-mapped label
# strings through `imap` finds no matching keys and would turn the whole
# column into NaN.
test['preds'] = pd.Series(preds, index=test.index).map(imap)

In [42]:
test

Unnamed: 0,0,1,2,preds
0,en,#covid #covid2020 #covidvirus #virus #coronava...,#covid #covid2020 #covidvirus #virus #coronava...,fear
1,en,Well this is the final mural of my trip in Aus...,Well this is the final mural of my trip in Aus...,sadness
2,pt,Chegamos !!! Vão seguindo o movimento... Tem m...,We have arrived!!! Keep following the movement...,joy
3,en,😻😻😻😻😻,😻😻😻😻😻,joy
4,es,EN MI DOMICILIO 🏡 #quedateencasa 📲 0414-464.18...,AT MY HOME 🏡 #quedateencasa 📲 0414-464.18.89. ...,fear
...,...,...,...,...
9643,en,#water #foryou #followforfollowback #photograp...,#water #foryou #followforfollowback #photograp...,fear
9644,en,#like4likes #20likes #tagforlikes #instalikes ...,#like4likes #20likes #tagforlikes #instalikes ...,fear
9645,fr,🙈🥰😍👉🏽 @love_serie_karma #daancorona #daancoron...,🙈🥰😍👉🏽 @love_serie_karma #daancorona #daancoron...,fear
9646,en,💥🚨 𝗙𝗔𝗟𝗟 𝗜𝗦 𝗖𝗢𝗠𝗜𝗡𝗚 🚨💥⁣ ⁣ 𝘼𝙧𝙚 𝙮𝙤𝙪 𝙧𝙚𝙖𝙙𝙮 𝙛𝙤𝙧 𝙖 𝙣𝙚...,💥🚨 𝗙𝗔𝗟𝗟 𝗜𝗦 𝗖𝗢𝗠𝗜𝗡𝗚 🚨💥⁣ ⁣ 𝘼𝙧𝙚 𝙮𝙤𝙪 𝙧𝙚𝙖𝙙𝙮 𝙛𝙤𝙧 𝙖 𝙣𝙚...,fear


In [30]:
# (rows, columns) of the frame. `test.sha` is not a DataFrame attribute and
# would raise AttributeError; the recorded output is what `.shape` returns.
test.shape

(9589, 4)

In [31]:
preds.shape

(9589,)

In [43]:
# Write `test` to a CSV one directory up; index=False leaves the row index out.
test.to_csv('../insta_predictions.csv', index=False)

In [50]:
# Stack the entries of `preds` row-wise into one 2-D array.
# NOTE(review): the recorded shape (9589, 4) implies `preds` held per-class
# scores when this ran (e.g. a list of per-batch softmax outputs, as in the
# disabled lines of the inference loop), not the 1-D argmax ids that loop
# currently produces — TODO confirm and re-create `preds` in that form first.
pred = np.vstack(preds)

In [51]:
pred.shape

(9589, 4)

In [52]:
imap

{0.0: 'anger', 1.0: 'fear', 2.0: 'joy', 3.0: 'sadness'}

In [53]:
imap[0]

'anger'

In [56]:
# Add one column per class to `test`: the column is named after the label
# and filled from the matching column of `pred`.
for class_id in range(4):
    class_name = imap[class_id]
    test[class_name] = pd.Series(pred[:, class_id], index=test.index)

In [58]:
# Write `test` (now carrying the per-class columns added above it in this
# frame) to a second CSV one directory up, again without the row index.
test.to_csv('../insta_predictions_prob.csv', index=False)