In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F
import random
import itertools
import pandas as pd


model_name_pap = 'flax-community/papuGaPT2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizerPap = AutoTokenizer.from_pretrained(model_name_pap)
modelPap = AutoModelForCausalLM.from_pretrained(model_name_pap).to(device)



In [23]:

def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label


def sentence_prob(sentence_txt):
    input_ids = tokenizerPap(sentence_txt, return_tensors='pt')[
        'input_ids'].to(device)
    length = input_ids.shape[1]
    with torch.no_grad():
        output = modelPap(input_ids=input_ids)
        log_probs = log_probs_from_logits(
            output.logits[:, :-1, :], input_ids[:, 1:])
        seq_log_probs = torch.sum(log_probs)
    return seq_log_probs.cpu().numpy()/length


def rate_sentence(sentence_txt):
    posSentence = sentence_txt + " - często będę tu wracać!"
    negSentence = sentence_txt + " - nigdy tu nie przyjade!"
    probP = sentence_prob(posSentence)
    probN = sentence_prob(negSentence)
    return probP, probN, probP-probN



In [24]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

model_name_bert = 'allegro/herbert-base-cased'

tokenizerBert = AutoTokenizer.from_pretrained(model_name_bert)
modelBert = AutoModel.from_pretrained(model_name_bert).to(device)

def representation(sentence_txt):
    input_ids = tokenizerBert(sentence_txt, return_tensors='pt')['input_ids'].to(device)
    output = modelBert(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0,0,:]

representation("Ala ma kota")

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([ 7.75121748e-02, -1.26742825e-01,  1.28273308e-01,  6.88978210e-02,
        1.63997412e-02,  5.44668175e-02, -3.27508003e-01, -6.34695232e-01,
        9.20320675e-02,  8.05968881e-01,  1.59044549e-01,  2.29028761e-01,
        2.93846512e+00,  3.55168045e-01,  1.10074170e-01, -1.78765580e-01,
        1.20000951e-01,  7.92342052e-03, -1.32740200e-01,  2.68556714e-01,
        2.42710233e-01,  4.63390164e-02,  4.08298010e-03, -2.29847297e-01,
        1.29358888e-01,  2.12778538e-01,  2.44441733e-01,  8.24066103e-02,
        3.90837789e-01,  3.85150641e-01, -3.88434790e-02, -3.29763256e-02,
        6.87328205e-02, -4.60860617e-02,  1.64548978e-01, -5.30544259e-02,
       -2.92029202e-01, -7.47868046e-02, -3.22980314e-01,  2.10616469e-01,
       -4.49957661e-02, -2.33092487e-01, -1.98450059e-01,  4.04942214e-01,
        2.84340023e-03,  1.94603443e-01, -1.16448402e-01,  9.03225690e-02,
       -9.03956518e-02, -6.43589860e-03,  5.03770933e-02, -2.66932338e-01,
        1.24548942e-01, -

In [78]:
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("reviews.txt", sep="#", header=None, names=[
                      "sentiment", "review"], encoding="utf-8")

dataset = dataset.sample(frac=1)

In [79]:

dataX = dataset["review"]
dataY = dataset["sentiment"]

trainInputX, testInputX, trainInputY, testInputY = train_test_split(dataX, dataY, test_size=0.20, random_state=40)
dataset.head()

Unnamed: 0,sentiment,review
374,BAD,Niestety dzisiaj przyjechałam z mężem na wycz...
216,BAD,Już na pierwszej wizycie dostałam leki antyde...
157,GOOD,Zobaczenie znajomej twarzy w sali pełnej obcy...
295,BAD,Moim zdaniem jak jechac na wczasy to lepiej c...
94,GOOD,Zadzwonilam na recepcje mila pani recepcjonis...


In [80]:
trainX = []
for i in trainInputX:
    trainX.append(representation(i).tolist() + list(rate_sentence(i)))
    # break

testX = []
for i in testInputX:
    testX.append(representation(i).tolist() + list(rate_sentence(i)))

trainX[0]

[-0.040315575897693634,
 0.08591874688863754,
 0.049482204020023346,
 0.13792642951011658,
 -0.11229722946882248,
 -0.007811898831278086,
 -0.059455715119838715,
 -0.02069631963968277,
 -0.2355015128850937,
 0.005893684457987547,
 -0.033661775290966034,
 0.07145070284605026,
 1.557607889175415,
 -0.168218195438385,
 0.09768645465373993,
 0.16536448895931244,
 0.21626634895801544,
 0.0031463855411857367,
 0.21270790696144104,
 0.11281789094209671,
 0.8255682587623596,
 -0.030825333669781685,
 0.5378895998001099,
 -0.1821688562631607,
 0.09555608779191971,
 -0.18087340891361237,
 0.3002123236656189,
 0.18937431275844574,
 0.23174655437469482,
 0.07227630168199539,
 0.03752201423048973,
 0.2928580641746521,
 0.048870936036109924,
 -0.28847846388816833,
 0.01978931948542595,
 0.04557527229189873,
 -0.20799113810062408,
 0.13587234914302826,
 0.05529016628861427,
 -0.3843388557434082,
 0.08268508315086365,
 0.1570327877998352,
 -0.11532467603683472,
 0.34143826365470886,
 -0.022933393716812

In [81]:
trainY = [1 if i == "GOOD" else 0 for i in trainInputY]
testY = [1 if i == "GOOD" else 0 for i in testInputY]

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# print(len(trainX), len(trainY))

model = LogisticRegression(max_iter=1000, solver='liblinear')
model.fit(trainX, trainY)

# model.score(testX, testY)
print(model.score(testX, testY))
print(classification_report(testY, model.predict(testX)))


0.8
              precision    recall  f1-score   support

           0       0.74      0.89      0.81        38
           1       0.88      0.71      0.79        42

    accuracy                           0.80        80
   macro avg       0.81      0.80      0.80        80
weighted avg       0.81      0.80      0.80        80

