In [186]:
# Import Libraries
import ast

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from transformers import pipeline

In [137]:
# Read the preprocessed train and test splits (train first, test second)
trainDF, testDF = (
    pd.read_csv(csvPath)
    for csvPath in ('..//data//preprocessingTrainDF', '..//data//preprocessingTestDF')
)

In [138]:
# Convert bowTokens to list
def stringToList(dfCol):
    """Parse one stringified Python list back into a real list.

    Parameters
    ----------
    dfCol : str or list
        A single cell value (not a whole column, despite the name). Normally
        the string form of a list, e.g. ``"['a', 'b']"``.

    Returns
    -------
    list
        The parsed list. A value that is already a list is returned as-is.

    Raises
    ------
    ValueError, SyntaxError
        If a string value is not a valid Python literal.
    """
    # The calling cell overwrites the column in place, so re-running it hands
    # us lists that were already parsed. ast.literal_eval raises ValueError on
    # a non-string, so pass those through to keep the cell idempotent.
    if isinstance(dfCol, list):
        return dfCol
    return ast.literal_eval(dfCol)

# Turn the stringified token lists into real lists, for both splits
trainDF['bowTokens'] = trainDF['bowTokens'].apply(stringToList)
testDF['bowTokens'] = testDF['bowTokens'].apply(stringToList)

In [139]:
# Preview the first rows of the training frame
trainDF.head()

Unnamed: 0,cleanText,bowTokens,label
0,rented curiousyellow video store controversy s...,"[rented, curiousyellow, video, store, controve...",0
1,curious yellow risible pretentious steaming pi...,"[curious, yellow, risible, pretentious, steami...",0
2,avoid making type film future film interesting...,"[avoid, making, type, film, future, film, inte...",0
3,film probably inspired godards masculin femini...,"[film, probably, inspired, godards, masculin, ...",0
4,oh brotherafter hearing ridiculous film umptee...,"[oh, brotherafter, hearing, ridiculous, film, ...",0


# Loading pretrained siebert/sentiment-roberta-large-english

In [140]:

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA instead of
# failing while building the pipeline.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    device=DEVICE,
)

# Smoke test: one obviously positive sentence through the loaded pipeline
print(sentiment_analysis("I love this!"))



[{'label': 'POSITIVE', 'score': 0.9988656044006348}]


In [141]:
# Maximum input length reported by the pipeline's tokenizer
sentiment_analysis.tokenizer.model_max_length

512

Truncating the text so that it fits within the model's maximum input length (512)

In [171]:
#
def truncateJoinList(input_string, maxWords=200):
    """Join a list of tokens into one space-separated string, capped in length.

    Parameters
    ----------
    input_string : list of str
        The tokens to join (a list, despite the parameter name).
    maxWords : int, default 200
        Maximum number of tokens to keep, counted from the start. This is a
        word budget, not the model's max length: the model does not tokenize
        by BoW words, so the two counts are not the same thing. Expected to
        be non-negative.

    Returns
    -------
    str
        The first ``maxWords`` tokens joined by single spaces, or all of them
        when there are fewer. An empty list gives ``''``.
    """
    # Slicing past the end of a list is safe, so no explicit length check is
    # needed to handle inputs shorter than the cap.
    return ' '.join(input_string[:maxWords])

In [172]:
# Build the truncated, space-joined text column from the token lists
for frame in (trainDF, testDF):
    tokenColumn = frame['bowTokens']
    frame['truncatedText'] = tokenColumn.apply(truncateJoinList)

In [173]:
# Sizes for sample row 17: truncatedText and cleanText are strings (length in
# characters), bowTokens is a list (length in tokens)
for column in ('truncatedText', 'bowTokens', 'cleanText'):
    print(len(trainDF[column][17]))

1442
457
3258


# Running pretrained model

In [174]:
def predSentiment(df):
    """Run the sentiment pipeline on each text and collect the raw outputs.

    Parameters
    ----------
    df : iterable of str
        The texts to classify. Despite the name, this is called with a single
        column (a pandas Series) rather than a whole DataFrame; a plain list
        of strings works as well.

    Returns
    -------
    list
        One entry per input text, in input order. Each entry is whatever
        ``sentiment_analysis`` returned for that text.
    """
    pred = []
    # Iterate over the values rather than indexing with ``df[i]``: integer
    # ``[]`` on a Series is a label lookup, so positional access fails for
    # any Series whose index is not exactly 0..n-1 (e.g. after filtering).
    for i, text in enumerate(df):
        pred.append(sentiment_analysis(text))
        # '\r' keeps the counter on one line instead of printing a line per row
        print(i, end='\r')
    return pred

In [183]:
# Score every training text with the pretrained model (one pipeline call per row)
trainPred = predSentiment(trainDF['truncatedText'])

24999

In [184]:
# Score every test text with the pretrained model (one pipeline call per row)
testPred = predSentiment(testDF['truncatedText'])

24999

### Putting results in dataframe

In [208]:
# 

def extractData(data):
    """Flatten raw pipeline outputs into a two-column DataFrame.

    Parameters
    ----------
    data : list
        One entry per text; the first element of each entry must be a dict
        with ``'label'`` and ``'score'`` keys.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` and ``score``, one row per entry, in input order.
    """
    # Only the first result of each entry is used
    firstResults = [entry[0] for entry in data]

    columns = {
        'label': [result['label'] for result in firstResults],
        'score': [result['score'] for result in firstResults],
    }
    return pd.DataFrame(columns)

# Tabulate the raw predictions for both splits (train first, test second)
trainPredDF, testPredDF = map(extractData, (trainPred, testPred))

In [210]:
# Saving results
# trainPredDF.to_csv('..//data//trainPred.csv', index=False)
# testPredDF.to_csv('..//data//testPred.csv', index=False)

In [216]:
# Ground-truth labels from the test split
y_test = testDF['label']
# Encode the predicted labels as integers: NEGATIVE -> 0, POSITIVE -> 1
y_pred_pretrained = (
    testPredDF['label']
    .map({'NEGATIVE': 0, 'POSITIVE': 1})
    .to_numpy()
)

In [217]:
# Per-class precision, recall and F1 of the pretrained model's test predictions
print(classification_report(y_test, y_pred_pretrained))

              precision    recall  f1-score   support

           0       0.88      0.93      0.91     12500
           1       0.93      0.88      0.90     12500

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



The pretrained siebert/sentiment-roberta-large-english model performed better, with an average F1-score of 0.90 compared with 0.87 for the logistic BoW model.