In [29]:
import argostranslate.package
import argostranslate.translate
import pandas as pd
from sklearn import metrics

from transformers import pipeline

In [13]:
# Build the sentiment pipeline. top_k=None makes it return the score of every
# label (negative / neutral / positive) instead of only the best one.
classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    top_k=None,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Refresh the Argos Translate package index.
# NOTE(review): the following cell calls update_package_index() again before
# listing packages, so this cell looks redundant.
argostranslate.package.update_package_index()

In [3]:
# Source and target language codes used for every translation in this notebook.
from_code = "es"
to_code = "en"

# Download and install the Argos Translate package for this language pair.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
package_to_install = next(
    (
        pkg
        for pkg in available_packages
        if pkg.from_code == from_code and pkg.to_code == to_code
    ),
    None,  # default, so a missing package does not surface as a bare StopIteration
)
if package_to_install is None:
    raise LookupError(
        f"No Argos Translate package available for {from_code!r} -> {to_code!r}"
    )
argostranslate.package.install_from_path(package_to_install.download())

In [6]:
# Read the annotated paragraphs from the CSV file.
with open("50000paragraphs_maincategories_filledout.csv", "r", encoding="UTF-8") as file:
    df = pd.read_csv(file)

# Use the column names the rest of the notebook expects, then keep only rows
# that carry a sentiment label and renumber the index from 0.
df = (
    df.rename(columns={"Tone": "sentiment", "Paragraphs": "text"})
    .dropna(subset=["sentiment"])
    .reset_index(drop=True)
)

In [8]:
def translateText(text):
    """Translate ``text`` using the notebook-level language pair.

    Forwards ``text`` together with the globals ``from_code`` and ``to_code``
    to ``argostranslate.translate.translate`` and returns its result unchanged.
    """
    translated = argostranslate.translate.translate(text, from_code, to_code)
    return translated

In [9]:
# Translate every paragraph (one translateText call per row) and store the
# result in a new "textenglish" column.
df["textenglish"] = df["text"].apply(translateText)

In [10]:
# Save the frame, including the translated column, as JSON Lines
# (orient="records" with lines=True writes one record per line).
df.to_json(
    "50000paragraphs_maincategories_english_filledout.json",
    orient="records",
    lines=True,
)

In [17]:
# Smoke test: inspect the raw pipeline output for one short sentence.
classifier("I am sad")

[[{'label': 'negative', 'score': 0.7221518158912659},
  {'label': 'neutral', 'score': 0.23753465712070465},
  {'label': 'positive', 'score': 0.0403134748339653}]]

In [19]:
def score(row, clf=None):
    """Attach sentiment scores for ``row["textenglish"]`` to ``row``.

    Parameters
    ----------
    row : mapping-like, e.g. a ``pandas.Series`` passed by ``df.apply(score, axis=1)``
        Must provide ``row["textenglish"]`` and support item assignment.
    clf : callable, optional
        Classifier to use; defaults to the notebook-level ``classifier``.
        It is called as ``clf(text, padding=True, max_length=512, truncation=True)``
        and must return a list whose first element is a list of
        ``{"label": ..., "score": ...}`` dicts.

    Returns
    -------
    The same ``row`` object, with two entries added:

    * ``"negpos"``: sum of ``score * weight`` over all labels, with weights
      negative = -1, neutral = 0, positive = +1.
    * ``"wordnegpos"``: the label with the highest score.

    Raises
    ------
    ValueError
        If the classifier returns a label other than negative/neutral/positive.
    """
    # Neutral must contribute 0 and positive +1 for the sum to be a polarity.
    label_weights = {"negative": -1, "neutral": 0, "positive": 1}

    if clf is None:
        clf = classifier

    label_scores = clf(row["textenglish"], padding=True, max_length=512, truncation=True)[0]

    polarity = 0
    for entry in label_scores:
        label = entry["label"]
        if label not in label_weights:
            raise ValueError(f"unexpected sentiment label: {label!r}")
        polarity += entry["score"] * label_weights[label]

    # Pick the best label explicitly rather than assuming the list is sorted.
    top_label = max(label_scores, key=lambda entry: entry["score"])["label"]

    row["negpos"] = polarity
    row["wordnegpos"] = top_label

    return row

In [20]:
# Score every row: score() runs the classifier once per row and adds the
# "negpos" and "wordnegpos" entries to it.
df = df.apply(score, axis=1)

In [25]:
# Integer class codes for both label spellings handled in this notebook: the
# lower-case names emitted by the classifier and the upper-case abbreviations
# (presumably the dataset's own annotations; verify against the CSV).
conversiondict = dict(
    NEG=0,
    negative=0,
    NEU=1,
    neutral=1,
    POS=2,
    positive=2,
)

def applyDict(x):
    """Return the integer class code for label ``x``; unknown labels raise ``KeyError``."""
    code = conversiondict[x]
    return code

In [28]:
# Replace the string labels in both columns with their integer class codes.
# NOTE: this cell is not re-runnable as-is. Once converted, the integer codes
# are not keys of conversiondict, so a second run raises KeyError.
df["wordnegpos"] = df["wordnegpos"].apply(applyDict)
df["sentiment"] = df["sentiment"].apply(applyDict)

In [30]:
# Overall accuracy of the predicted class codes ("wordnegpos") against the
# reference codes ("sentiment").
metrics.accuracy_score(df["sentiment"], df["wordnegpos"])

0.6462882096069869

In [33]:
# Per-class recall: average=None returns one value per class rather than a
# single averaged number.
metrics.recall_score(df["sentiment"], df["wordnegpos"], average=None)

array([0.39954853, 0.79932546, 0.29064039])