In [None]:
from transformers import pipeline

In [None]:
# Checkpoint used for the single-model sentiment pipeline built below.
model_name = 'uer/roberta-base-finetuned-jd-binary-chinese'
nlp = pipeline(task='sentiment-analysis', model=model_name)

In [None]:
def get_negative_sentiment_score(text, nlp):
    """Return the negative-class score that ``nlp`` assigns to ``text``.

    Args:
        text: Input handed unchanged to ``nlp``.
        nlp: Callable returning a sequence of dicts that carry ``'label'``
            and ``'score'`` keys. Only the first entry is inspected.

    Returns:
        The first prediction's score when its label is
        ``'negative (stars 1, 2 and 3)'``, otherwise ``1 - score``.
        The string ``'NA'`` when calling ``nlp`` or reading its result raises
        an ordinary exception.
    """
    try:
        top_prediction = nlp(text)[0]
        if top_prediction['label'] == 'negative (stars 1, 2 and 3)':
            negative_sentiment = top_prediction['score']
        else:
            # Any other label: take the complement of the reported score.
            negative_sentiment = 1 - top_prediction['score']
    except Exception:
        # Best-effort fallback for failed inference or malformed output.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long-running job can still be interrupted.
        negative_sentiment = 'NA'
    return negative_sentiment

In [None]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "python3",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type": "native",
        "spark.pyspark.virtualenv.bin.path": "/usr/bin/virtualenv"
    }
}

In [None]:
# PyPI requirements installed through the Spark context, in the order listed.
_pypi_requirements = (
    "boto3==1.19.2",
    "pandas==1.0.5",
    "scipy==1.4.1",
    "matplotlib==3.2.1",
    "seaborn==0.10.1",
    'spark-nlp',
    "torch",
    "transformers==4.2",
)
for _requirement in _pypi_requirements:
    sc.install_pypi_package(_requirement)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import numpy as np
import matplotlib.pyplot as plt
from transformers import pipeline
import torch
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import *

In [None]:
# Read the JSON training file from S3 into `data`.
train_json_path = 's3://amazon-reviews-ml/json/train/dataset_zh_train.json'
data = spark.read.json(train_json_path)

In [None]:
from transformers import pipeline
# Hugging Face checkpoints to evaluate; one sentiment-analysis pipeline is
# built per entry.
models = [
    'uer/roberta-base-finetuned-jd-binary-chinese',
    'uer/roberta-base-finetuned-dianping-chinese',
    'philschmid/distilbert-base-multilingual-cased-sentiment',
    'philschmid/distilbert-base-multilingual-cased-sentiment-2',
]
# nlps[i] is the pipeline loaded from models[i].
nlps = [
    pipeline(task='sentiment-analysis', model=checkpoint)
    for checkpoint in models
]

In [None]:
def get_positive_sentiment_score(text, nlps):
    """Collect the scores of positive-leaning predictions for ``text``.

    Every pipeline in ``nlps`` is called with ``text``. From each result, the
    ``'score'`` of every prediction whose ``'label'`` is one of
    ``'positive'``, ``'neutral'`` or ``'positive (stars 4 and 5)'`` is kept.

    Args:
        text: Input handed unchanged to each pipeline.
        nlps: Iterable of callables, each returning an iterable of dicts with
            ``'label'`` and ``'score'`` keys.

    Returns:
        A flat list of the matching scores, grouped by pipeline in the order
        of ``nlps``. Pipelines whose prediction has a different label, or
        whose result is malformed, contribute nothing.
    """
    # NOTE(review): 'neutral' is counted together with the positive labels,
    # mirroring the labels this function has always looked for; confirm that
    # this is intended.
    positive_labels = ('positive', 'neutral', 'positive (stars 4 and 5)')

    scores = []
    for nlp in nlps:
        result = nlp(text)
        try:
            # Materialise the values here: appending a generator expression
            # would store a generator object instead of the numeric scores.
            matched = [
                prediction['score']
                for label in positive_labels
                for prediction in result
                if prediction['label'] == label
            ]
        except (KeyError, TypeError):
            # Result does not have the expected list-of-dicts shape; skip
            # this pipeline rather than abort the whole ensemble.
            continue
        scores.extend(matched)
    return scores

In [None]:
# Imported in this cell so it runs top-to-bottom on a fresh kernel; these
# names were previously only imported by a later cell.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single source of truth so tokenizer and model always come from the same checkpoint.
sentiment_checkpoint = "uer/roberta-base-finetuned-jd-binary-chinese"
tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

# NOTE(review): `text` is not defined in any cell visible above; it must be
# assigned before this cell is run.
# Truncate to the model's position limit so over-long inputs do not crash
# the forward pass.
encoded_input = tokenizer(
    text,
    return_tensors='pt',
    truncation=True,
    max_length=model.config.max_position_embeddings,
)
with torch.no_grad():
    logits = model(**encoded_input).logits
    # Softmax over the class dimension, then drop the batch dimension.
    probabilities = torch.softmax(logits, dim=1).squeeze()
    sentiment_scores = probabilities.tolist()
print(sentiment_scores)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification