In [2]:
#!pip install empath

In [2]:
from empath import Empath
lexicon = Empath()

In [3]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import argparse
import json
from util import contentToString, getSegments
import csv
from util import isGood
from pathlib import Path


dataset="PANC"
datapackID=None
if datapackID is None: datapackID = dataset

for datasetType in ["train", "test"]:
	datapackPath = "%s/datapacks/datapack-%s-%s.json" % (
		dataset, datapackID, datasetType)

	outPath = os.path.join("%s/csv/" % dataset)
	Path(outPath).mkdir(parents=True, exist_ok=True)
	csvPath = os.path.join(outPath, "%s-%s.csv" %
		(datapackID, datasetType))
	with open(datapackPath, "r") as file:
		datapack = json.load(file)

		# write TSV
		#with open(csvPath, 'w', newline='') as file:
			#writeCSV(file, datapack, datasetType)

print("\nSuccessfully wrote all csv files")



Successfully wrote all csv files


In [4]:
def number_of_questions(body):
    return body.count('?')
    
def size_of_words(body):
    return np.mean([len(w) for w in body.split(" ")])

In [57]:
def dictionary_based(body):
    return lexicon.analyze(body, categories=["sexual"])['sexual']

def dictionary_based_multi(body, category):
    return lexicon.analyze(body, categories=[category])[category]

res = lexicon.analyze('Bro what')
categories = list(res.keys())

In [6]:
def relative_difference(x,y):
    """
    Assume
    x,y>=0
    """
    absolute_difference = abs(x-y)
    denom = max(x,y)
    if denom > 0:
        return absolute_difference/denom
    else:
        return 0

In [7]:
import numpy as np

def calculate_metric(metric,absolute=False):
    """
    Calulates the given metric for every message. Sums it up per author. Then the calculate the relative difference of this sum between the two authors.
    Returns the average of these differences for non-predatory and predatory messages
    
    In other words, how many more "metric" does one author use compared to the other.
    
    The reason why we care about relative differences instead of absolute values is the assumption that predators talk diffrent that victims. We aim
    to quantify this difference in writing style.
    
    For example calculate_metric(len) will return the average of the relative difference between the length of the messages of authorA and authorB
    """
    rel_diff_pred = []
    rel_diff_non = []
    for chatName, chat in datapack["chats"].items():
        for i, segment in enumerate(getSegments(chat)):
            #For every segment

            len_dict = {} # Keys: authors, Values: sum of length of messages
            for message in segment:
                if message == None:
                    continue

                author = message["author"]
                body = message["body"]

                if body == None:
                    continue

                if author in len_dict.keys():
                    len_dict[author] += metric(body)
                else:
                    len_dict[author] = metric(body)

            # Some segments contain only one author. Maybe because of omegle
            if len(len_dict.keys()) ==2 and segment[0]:
                a, b = len_dict.values()
                if not absolute:
                    if (segment[0]["isFromPredator"]):
                        rel_diff_pred.append(relative_difference(a,b))
                    else:
                        rel_diff_non.append(relative_difference(a,b))
                else:
                    if (segment[0]["isFromPredator"]):
                        # in that case the variable souldn't be called rel_diff but anyway
                        rel_diff_pred.append(a+b)
                    else:
                        rel_diff_non.append(a+b)
    return np.mean(rel_diff_non), np.mean(rel_diff_pred)



In [28]:
def calculate_metric_multi(metric,category,absolute=False):
    """
    Calulates the given metric for every message. Sums it up per author. Then the calculate the relative difference of this sum between the two authors.
    Returns the average of these differences for non-predatory and predatory messages
    
    In other words, how many more "metric" does one author use compared to the other.
    
    The reason why we care about relative differences instead of absolute values is the assumption that predators talk diffrent that victims. We aim
    to quantify this difference in writing style.
    
    For example calculate_metric(len) will return the average of the relative difference between the length of the messages of authorA and authorB
    """
    rel_diff_pred = []
    rel_diff_non = []
    for chatName, chat in datapack["chats"].items():
        for i, segment in enumerate(getSegments(chat)):
            #For every segment

            len_dict = {} # Keys: authors, Values: sum of length of messages
            for message in segment:
                if message == None:
                    continue

                author = message["author"]
                body = message["body"]

                if body == None:
                    continue

                if author in len_dict.keys():
                    len_dict[author] += metric(body,category)
                else:
                    len_dict[author] = metric(body,category)

            # Some segments contain only one author. Maybe because of omegle
            if len(len_dict.keys()) ==2 and segment[0]:
                a, b = len_dict.values()
                if not absolute:
                    if (segment[0]["isFromPredator"]):
                        rel_diff_pred.append(relative_difference(a,b))
                    else:
                        rel_diff_non.append(relative_difference(a,b))
                else:
                    if (segment[0]["isFromPredator"]):
                        # in that case the variable souldn't be called rel_diff but anyway
                        rel_diff_pred.append(a+b)
                    else:
                        rel_diff_non.append(a+b)
    return np.mean(rel_diff_non), np.mean(rel_diff_pred)



In [17]:
print("Metric: (Non-predators, Predators)")
print()
print("Difference in Message Length: ", calculate_metric(len))
print("Absolute Message Length: ", calculate_metric(len,absolute=True))
print("No. of questions: ", calculate_metric(number_of_questions))


Metric: (Non-predators, Predators)

Difference in Message Length:  (0.4030566335339038, 0.38650301992468344)
Absolute Message Length:  (1157.4328123772873, 3520.7461322081576)
No. of questions:  (0.5738894152006618, 0.6764642266093621)


In [12]:
print("Size of words:", calculate_metric(size_of_words))

Size of words: (0.35141627101212997, 0.2416746686888594)


In [9]:
print("Dictionary_based: ", calculate_metric(dictionary_based))

Dictionary_based:  (0.22968258152114968, 0.5129049708592553)


In [55]:
import pickle

"""
with open('category_scores.pickle', 'wb') as handle:
    pickle.dump(category_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""

with open('category_scores.pickle', 'rb') as handle:
    category_score = pickle.load(handle)
for key, value in category_score.items():
    print(key, ' : ', value)

sleep  :  (0.13113417298524546, 0.4662130804442793)
night  :  (0.11098099373458445, 0.4430081165995785)
domestic_work  :  (0.11818739380970841, 0.44826138539994337)
vacation  :  (0.17390972275210767, 0.4844983752458018)
love  :  (0.19643064444230127, 0.5027391756238117)
family  :  (0.20334340086697084, 0.5059048374817265)
shopping  :  (0.12074464646325166, 0.4142074912117106)
affection  :  (0.16251072556731427, 0.45149121500299955)
furniture  :  (0.08436926651801388, 0.3733213640417662)
body  :  (0.1665154620205998, 0.4541098162787639)
home  :  (0.17952260777178475, 0.4666258821142494)
celebration  :  (0.17253155232421707, 0.4567825306003938)
sexual  :  (0.22968258152114968, 0.5129049708592553)
work  :  (0.20446769712830343, 0.48592297526227196)
morning  :  (0.08546836179954848, 0.36537593130420126)
traveling  :  (0.19182767207742837, 0.46988756440197854)
driving  :  (0.08261189232680631, 0.35599057822159086)
shame  :  (0.15872840618241318, 0.43151336083825526)
wedding  :  (0.130264119

In [37]:
'''
category_scores = {}
for category in categories:
    res = calculate_metric_multi(dictionary_based_multi,category)
    print(category,": ", res)
    category_scores[category] = res
'''

'\ncategory_scores = {}\nfor category in categories:\n    res = calculate_metric_multi(dictionary_based_multi,category)\n    print(category,": ", res)\n    category_scores[category] = res\n'

### Let's try emotion bert

In [13]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/bert-base-go-emotion")

model = AutoModelForSequenceClassification.from_pretrained("bhadresh-savani/bert-base-go-emotion")

In [17]:
#model.predict(tokenized)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./predictions'),
    #eval_dataset=test_data,
    tokenizer=tokenizer
)

tokenized = tokenizer("i like it and I love it")
trainer.predict(tokenized)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 2
  Batch size = 8


IndexError: list index out of range