In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F
import random
import itertools
import pandas as pd


# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Checkpoint name handed to `from_pretrained` for the causal language model.
model_name_pap = 'flax-community/papuGaPT2'

# Load the tokenizer and the model for that checkpoint, then move the model
# onto the selected device.
tokenizerPap = AutoTokenizer.from_pretrained(model_name_pap)
modelPap = AutoModelForCausalLM.from_pretrained(model_name_pap)
modelPap = modelPap.to(device)



In [2]:

def log_probs_from_logits(logits, labels):
    """Return the log-probability that ``logits`` assign to each label.

    Args:
        logits: Tensor of unnormalised scores; the last dimension indexes
            the classes (for example ``(batch, seq, vocab)``).
        labels: Integer tensor with the shape of ``logits`` minus its last
            dimension; each entry is the class index to look up.

    Returns:
        Tensor with the same shape as ``labels`` holding
        ``log_softmax(logits)[..., label]`` for every position.
    """
    logp = F.log_softmax(logits, dim=-1)
    # Index along the last axis rather than a hard-coded axis 2, so the
    # function works for any rank (identical result for 3-D input).
    logp_label = torch.gather(logp, -1, labels.unsqueeze(-1)).squeeze(-1)
    return logp_label


def sentence_prob(sentence_txt):
    """Score a sentence with the causal LM as a length-scaled log-likelihood.

    The text is tokenized with ``tokenizerPap`` and passed through
    ``modelPap``. The log-probability of every token given the tokens before
    it is summed; the first token has no prediction, so the sum has
    ``length - 1`` terms. That total is then divided by the full token
    count ``length``.

    Args:
        sentence_txt: Text to score.

    Returns:
        The scaled log-likelihood as a NumPy value (moved to the CPU first).
    """
    encoded = tokenizerPap(sentence_txt, return_tensors='pt')
    token_ids = encoded['input_ids'].to(device)
    n_tokens = token_ids.shape[1]

    with torch.no_grad():
        logits = modelPap(input_ids=token_ids).logits
        # The logits at position t score token t+1: drop the last logit row
        # and the first token so predictions and targets line up.
        per_token = log_probs_from_logits(logits[:, :-1, :], token_ids[:, 1:])
        total = torch.sum(per_token)

    return total.cpu().numpy() / n_tokens


def rate_sentence(sentence_txt):
    """Compare how well a positive and a negative ending fit a sentence.

    Two variants of the text are scored with ``sentence_prob``: one with a
    positive continuation appended and one with a negative continuation.

    Args:
        sentence_txt: The text to extend and score.

    Returns:
        Tuple ``(positive_score, negative_score, difference)`` where
        ``difference`` is ``positive_score - negative_score``.
    """
    endings = (" - często będę tu wracać!", " - nigdy tu nie przyjade!")
    # Scored in order: positive variant first, then the negative one.
    positive_score, negative_score = [
        sentence_prob(sentence_txt + ending) for ending in endings
    ]
    return positive_score, negative_score, positive_score - negative_score



In [3]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Checkpoint name handed to `from_pretrained` for the encoder model.
model_name_bert = 'allegro/herbert-base-cased'

tokenizerBert = AutoTokenizer.from_pretrained(model_name_bert)

# Load the model first, then move it onto `device` (set in an earlier cell).
modelBert = AutoModel.from_pretrained(model_name_bert)
modelBert = modelBert.to(device)

def representation(sentence_txt):
    """Embed a sentence as the encoder's hidden state at the first position.

    Args:
        sentence_txt: Text to tokenize with ``tokenizerBert`` and encode
            with ``modelBert``.

    Returns:
        1-D NumPy array: ``last_hidden_state`` for batch item 0, token
        position 0.
    """
    input_ids = tokenizerBert(sentence_txt, return_tensors='pt')['input_ids'].to(device)
    # Inference only: disable autograd so no graph is built (and kept alive)
    # for every sentence; the returned values are unchanged.
    with torch.no_grad():
        output = modelBert(input_ids=input_ids)
    return output.last_hidden_state.cpu().numpy()[0, 0, :]

# Smoke-check the encoder on a short sample sentence; as the last expression
# in the cell, the returned vector is displayed.
representation("Ala ma kota")

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([ 7.75121748e-02, -1.26742825e-01,  1.28273308e-01,  6.88978210e-02,
        1.63997412e-02,  5.44668175e-02, -3.27508003e-01, -6.34695232e-01,
        9.20320675e-02,  8.05968881e-01,  1.59044549e-01,  2.29028761e-01,
        2.93846512e+00,  3.55168045e-01,  1.10074170e-01, -1.78765580e-01,
        1.20000951e-01,  7.92342052e-03, -1.32740200e-01,  2.68556714e-01,
        2.42710233e-01,  4.63390164e-02,  4.08298010e-03, -2.29847297e-01,
        1.29358888e-01,  2.12778538e-01,  2.44441733e-01,  8.24066103e-02,
        3.90837789e-01,  3.85150641e-01, -3.88434790e-02, -3.29763256e-02,
        6.87328205e-02, -4.60860617e-02,  1.64548978e-01, -5.30544259e-02,
       -2.92029202e-01, -7.47868046e-02, -3.22980314e-01,  2.10616469e-01,
       -4.49957661e-02, -2.33092487e-01, -1.98450059e-01,  4.04942214e-01,
        2.84340023e-03,  1.94603443e-01, -1.16448402e-01,  9.03225690e-02,
       -9.03956518e-02, -6.43589860e-03,  5.03770933e-02, -2.66932338e-01,
        1.24548942e-01, -

In [6]:
from sklearn.model_selection import train_test_split

# Parse the file as two '#'-separated columns (sentiment, review) with no
# header row.
dataset = pd.read_csv(
    "reviews_c.txt",
    sep="#",
    header=None,
    names=["sentiment", "review"],
    encoding="utf-8",
)

# Shuffle the rows with a fixed seed. An unseeded shuffle changes the row
# order on every run, so the seeded train/test split that follows would
# still pick different reviews each time.
dataset = dataset.sample(frac=1, random_state=40)

In [7]:

# Inputs are the review texts; targets are the sentiment labels.
dataX, dataY = dataset["review"], dataset["sentiment"]

# Hold out 20% of the rows for testing, using a fixed seed for the split.
trainInputX, testInputX, trainInputY, testInputY = train_test_split(
    dataX,
    dataY,
    test_size=0.20,
    random_state=40,
)

# Preview the first rows of the full frame.
dataset.head()

Unnamed: 0,sentiment,review
466,BAD,Przepisanie dawki leku i koniec wizyty.
95,GOOD,Pracownicy kawiarni wyjątkowo przyjacielsko n...
217,GOOD,"Okolice hotelu, promenada Andrzeja Quina, Fal..."
131,GOOD,Jedzenie doskonale.
126,GOOD,Nawet jeżdżące pod oknem tramwaje nie zepsuły...


In [8]:
def _review_features(text):
    """Build one feature row: encoder vector followed by the three LM scores."""
    return representation(text).tolist() + list(rate_sentence(text))


# One feature row per review, in the same order as the input series.
trainX = [_review_features(text) for text in trainInputX]
testX = [_review_features(text) for text in testInputX]

# Show the first training row as a sanity check.
trainX[0]

[0.150121808052063,
 -0.0033762743696570396,
 -0.06703820824623108,
 0.11551938951015472,
 -0.46302342414855957,
 0.765114963054657,
 -0.21170595288276672,
 0.683717668056488,
 -0.1461258828639984,
 0.06799130886793137,
 -0.0750637948513031,
 -0.030163733288645744,
 1.4196925163269043,
 -1.090575933456421,
 0.21212710440158844,
 0.27811145782470703,
 0.3989894390106201,
 -0.2820851504802704,
 -0.29335686564445496,
 0.1921294629573822,
 0.9791862964630127,
 0.017250217497348785,
 -0.19769616425037384,
 -0.22996574640274048,
 -0.45579254627227783,
 -0.13344621658325195,
 0.1360889971256256,
 0.07428149878978729,
 -0.28596702218055725,
 0.12008938193321228,
 -0.05277477204799652,
 0.21105174720287323,
 -0.053467996418476105,
 -0.12495828419923782,
 0.3313329517841339,
 -0.07333025336265564,
 -0.19959551095962524,
 0.11774194240570068,
 0.6622300148010254,
 -0.15868228673934937,
 -0.003598609007894993,
 -0.020797381177544594,
 -0.2083897888660431,
 0.1473928987979889,
 0.2170235812664032,


In [9]:
# Encode labels as integers: "GOOD" -> 1, every other value -> 0.
trainY = [int(label == "GOOD") for label in trainInputY]
testY = [int(label == "GOOD") for label in testInputY]

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier on the combined feature rows.
model = LogisticRegression(solver='liblinear', max_iter=1000)
model.fit(trainX, trainY)

# Evaluate on the held-out rows: accuracy first, then the per-class report.
test_accuracy = model.score(testX, testY)
test_predictions = model.predict(testX)
print(test_accuracy)
print(classification_report(testY, test_predictions))


0.825
              precision    recall  f1-score   support

           0       0.85      0.80      0.82        82
           1       0.80      0.85      0.82        78

    accuracy                           0.82       160
   macro avg       0.83      0.83      0.82       160
weighted avg       0.83      0.82      0.82       160

