In [1]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline

In [2]:
# NOTE: this helper also lives in utils; it is copied here for ease of access.
def preprocess_conversation_columns(df):
	"""
	Clean the column names of a chat DataFrame and add a conversation id.

	Every character other than ASCII letters, digits and underscores is removed
	from the column names. If both ``batch_num`` and ``round_num`` columns are
	present, a ``conversation_num`` column (the group number of each distinct
	batch/round pair, as produced by ``groupby(...).ngroup()``) is added and
	placed first.

	Note: the column renaming and the new column are applied to ``df`` itself,
	so the caller's DataFrame is modified; only the column reordering produces
	a new DataFrame.

	Args:
		df: DataFrame to process.

	Returns:
		The processed DataFrame.
	"""
	# remove all special characters from the column names
	df.columns = df.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)

	# If data is grouped by batch/round, add a conversation num
	if {'batch_num', 'round_num'}.issubset(df.columns):
		df['conversation_num'] = df.groupby(['batch_num', 'round_num']).ngroup()
		# Move the column to the front by name rather than by position: if
		# 'conversation_num' already existed, the assignment above overwrites it
		# in place and it is not the last column.
		other_columns = [col for col in df.columns if col != 'conversation_num']
		df = df[['conversation_num'] + other_columns]

	return df

def get_sentiment(text, max_length=512):
    """
    Score one message with the module-level sentiment ``tokenizer`` and ``model``.

    Args:
        text: The message to score. Values that ``pd.isnull`` reports as
            missing are not sent to the model; any other non-string value is
            converted with ``str`` first.
        max_length: Maximum number of tokens passed to the model; longer
            messages are truncated. Defaults to 512, presumed to be the input
            limit of the roberta-base checkpoint named in MODEL -- verify if a
            different checkpoint is loaded.

    Returns:
        dict: ``{'positive': ..., 'negative': ..., 'neutral': ...}`` holding the
        softmax-normalised class scores, or ``np.nan`` for every key when
        ``text`` is null.
    """
    if pd.isnull(text):
        return {'positive': np.nan, 'negative': np.nan, 'neutral': np.nan}

    # Truncate explicitly: a message that tokenizes to more positions than the
    # model supports would otherwise make the forward pass fail.
    encoded = tokenizer(
        str(text),
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded)

    # First element of the model output, first (only) row of the batch;
    # presumably the raw class logits, which softmax turns into probabilities.
    scores = softmax(output[0][0].detach().numpy())

    # Index-to-label mapping used here: 0 = negative, 1 = neutral, 2 = positive.
    # NOTE(review): presumed to match the checkpoint's id2label -- confirm if MODEL changes.
    return {'positive': scores[2], 'negative': scores[0], 'neutral': scores[1]}

In [3]:
# Checkpoint name shared by the tokenizer and the classifier.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Both objects are module-level globals that get_sentiment reads.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Base names (without the .csv extension) of the raw chat files to score.
dataset_names = [
    "juries_tiny_for_testing",
    "jury_conversations_with_outcome_var",
    "csop_conversations_withblanks",
    "csopII_conversations_withblanks",
    "DAT_conversations_withblanks",
    "pgg_conversations_withblanks",
    "gurcay2015_group_estimation",
    "becker_group_estimation",
]

directory = "../data/raw_data/"
output_csv_folder = '../sentiment_bert/'

# Create the output folder up front so a missing directory does not abort the
# run only after a whole dataset has already been scored.
os.makedirs(output_csv_folder, exist_ok=True)

for dataset in dataset_names:
    csv_path = directory + dataset + '.csv'
    raw_messages = pd.read_csv(csv_path, encoding='mac_roman')['message']

    # Stringify only the non-null messages. A blanket .astype(str) turns NaN
    # into the literal text 'nan', which slips past the null check in
    # get_sentiment and is scored as if it were a real message.
    messages = raw_messages.map(lambda msg: msg if pd.isnull(msg) else str(msg))

    sentiments = messages.apply(get_sentiment)

    # Pick the scores by key so the column mapping does not depend on the
    # order in which get_sentiment happens to build its dict.
    sent_arr = [
        [scores['positive'], scores['negative'], scores['neutral']]
        for scores in sentiments
    ]

    sent_df = pd.DataFrame(sent_arr, columns=['positive_bert', 'negative_bert', 'neutral_bert'])

    # The row index is written as well (it appears as 'Unnamed: 0' when the
    # file is read back).
    sent_df.to_csv(output_csv_folder + dataset + '.csv')

In [7]:
# chat = pd.read_csv('../data/raw_data/juries_tiny_for_testing.csv')
# pd.concat([chat,pd.read_csv('../sentiment_bert/juries_tiny_for_testing.csv').drop('Unnamed: 0', axis=1)
# ], axis = 1)

Unnamed: 0,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes,positive_bert,negative_bert,neutral_bert
0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,Hello!,1.0,1,0.333333,3,0.837152,0.004957,0.157891
1,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,Hi!,1.0,1,0.333333,3,0.717188,0.017631,0.265181
2,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,Hello,1.0,1,0.333333,3,0.528299,0.056653,0.415048
3,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20T18:27:30.410Z,Hi,1.0,1,0.333333,3,0.442517,0.093418,0.464066
4,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20T18:27:35.506Z,hi,1.0,1,0.333333,3,0.417184,0.110424,0.472392
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:02:55.111Z,I say asshole under stress,0.6,0,0.000000,5,0.058323,0.644060,0.297616
93,0,2,5d6feec65f80ae21f5c5f054,conventionalMonkey,2020-04-20T19:03:21.819Z,"Yes, she is the asshole... unfortunately. Husb...",0.6,0,0.000000,5,0.006095,0.890461,0.103443
94,0,2,5d482ea421c9be351f762255,newPanda,2020-04-20T19:03:36.308Z,I think she is being presumptuous and acting l...,0.6,0,0.000000,5,0.007374,0.783853,0.208773
95,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:03:53.219Z,"Tha's true, she ins't considering her husband ...",0.6,0,0.000000,5,0.042617,0.340633,0.616749


In [None]:
# pd.DataFrame(sent_arr, columns =['positive_bert', 'negative_bert', 'neutral_bert']) 

Unnamed: 0,positive_bert,negative_bert,neutral_bert
0,0.837152,0.004957,0.157891
1,0.717188,0.017631,0.265181
2,0.528298,0.056653,0.415048
3,0.442517,0.093418,0.464066
4,0.417184,0.110424,0.472392
...,...,...,...
92,0.058323,0.644060,0.297617
93,0.006095,0.890461,0.103443
94,0.007374,0.783853,0.208773
95,0.042617,0.340634,0.616749


In [None]:
#handling due to lack of infrastructure in ipynb
# pos_ratings = sent_ratings.to_frame().rename(columns={'message':'sentiment'})
# chatted = pd.concat([chat_data, pos_ratings], axis =1)
# chatted

#parse
# column for positive_bert, neutral_bert, negative_bert, concat to the chat level output 
# generate it in another folder (bert ratings)
# concat while calculating chat level features 
