In [1]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import json
import html
import multiprocessing
from collections import Counter
from bs4 import BeautifulSoup
from pathlib import Path
from googleapiclient import discovery
import os, glob
import time
from googleapiclient.errors import HttpError
from tqdm import tqdm

In [2]:
# Read the Perspective API key from a local file (kept out of the notebook itself).
with open('./apikey.txt') as api_file:
    # Strip surrounding whitespace: editors usually end the file with a newline,
    # which would otherwise be sent as part of the developer key.
    API_KEY = api_file.read().strip()

# Build the "commentanalyzer" v1alpha1 client from its remote discovery document.
# Both names are bound to the same client object, as in the original cell.
API_CLIENT = api_client = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False)

In [3]:
def query_perspective(text, languages=None, max_retries=None, retry_delay=61):
    """Score a piece of text with the comment-analyzer client and return its attribute scores.

    Requests the TOXICITY, SEVERE_TOXICITY, IDENTITY_ATTACK, INSULT, PROFANITY
    and THREAT attributes. Any failing request is retried after sleeping
    ``retry_delay`` seconds.

    Args:
        text: The text to analyze.
        languages: Optional list of language codes sent as the request's
            ``languages`` field; omitted from the request when empty or None.
        max_retries: Maximum number of retries after a failed request.
            ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        The ``attributeScores`` mapping of the API response.

    Raises:
        Exception: Re-raises the last request error once ``max_retries``
            retries have been used up (never happens with the default).
    """
    analyze_request = {
        'comment': {'text': text},
        'requestedAttributes': {
            'TOXICITY': {},
            'SEVERE_TOXICITY': {},
            'IDENTITY_ATTACK': {},
            'INSULT': {},
            'PROFANITY': {},
            'THREAT': {},
        },
    }
    if languages:
        analyze_request['languages'] = languages

    retries_done = 0
    # Iterate instead of recursing: a long outage would otherwise stack one
    # frame per retry until the interpreter raises RecursionError.
    while True:
        try:
            response = API_CLIENT.comments().analyze(body=analyze_request).execute()
        except Exception as err:
            if max_retries is not None and retries_done >= max_retries:
                raise
            retries_done += 1

            # 429 is the quota/rate-limit answer: wait silently and retry.
            rate_limited = (
                isinstance(err, HttpError)
                and getattr(getattr(err, 'resp', None), 'status', None) == 429
            )
            if rate_limited:
                time.sleep(retry_delay)
                continue

            # NOTE(review): non-retriable client errors (for example a rejected
            # request body) are retried as well; pass max_retries to bound that.
            print("Catched an exception not dealt with.. we will print and then sleep and resume after 61 seconds. ")
            print("Exception : ")
            print(err)
            print("Sleeping for 61 seconds...")
            time.sleep(retry_delay)
            print("...Restarted...")
            continue
        return response['attributeScores']


In [4]:
# Demonstration: average a list of dicts key by key using two Counters.
# (Counter is already imported in the imports cell at the top of the notebook.)
A = [{'a':1,'b':2,'c':3,'d':4,'e':5},{'b':1,'c':2,'d':3,'e':4,'f':5},{'c':1,'d':2,'e':3,'f':4,'g':5}]

sums = Counter()      # per-key running total of the values
counters = Counter()  # per-key number of dicts that contain the key
for itemset in A:
    sums.update(itemset)             # a mapping argument adds its values to the totals
    counters.update(itemset.keys())  # an iterable argument counts one occurrence per key

# True division already yields floats in Python 3, so no float() cast is needed.
ret = {x: sums[x] / counters[x] for x in sums}

print(ret)

{'a': 1.0, 'b': 1.5, 'c': 2.0, 'd': 3.0, 'e': 4.0, 'f': 4.5, 'g': 5.0}


In [5]:
# Create scores.csv with its header row only when it does not exist yet (or is empty).
# Unconditionally opening it in 'w+' mode wiped the rows of users already listed in
# users_processed.txt, which the driver cell then skips, so their scores were lost.
# To start from scratch, delete both scores.csv and users_processed.txt.
if not os.path.exists('./scores.csv') or os.path.getsize('./scores.csv') == 0:
    with open('./scores.csv', 'w') as scores_file:
        scores_file.write('twitter_username,mastodon_username,Twitter_TOXICITY,Twitter_SEVERE_TOXICITY,Twitter_IDENTITY_ATTACK,Twitter_INSULT,Twitter_PROFANITY,Twitter_THREAT,Mastodon_TOXICITY,Mastodon_SEVERE_TOXICITY,Mastodon_IDENTITY_ATTACK,Mastodon_INSULT,Mastodon_PROFANITY,Mastodon_THREAT')

In [9]:
# Attributes requested from the API, in the column order used by scores.csv.
_PERSPECTIVE_ATTRIBUTES = (
    'TOXICITY',
    'SEVERE_TOXICITY',
    'IDENTITY_ATTACK',
    'INSULT',
    'PROFANITY',
    'THREAT',
)


def _summary_values(scores):
    """Reduce an ``attributeScores`` mapping to ``{attribute: summary score value}``."""
    return {name: scores[name]['summaryScore']['value'] for name in _PERSPECTIVE_ATTRIBUTES}


def _mean_scores(score_dicts):
    """Average a list of score dicts key by key; returns ``{}`` for an empty list."""
    sums = Counter()
    counters = Counter()
    for itemset in score_dicts:
        sums.update(itemset)
        counters.update(itemset.keys())
    return {x: sums[x] / counters[x] for x in sums}


def process_users_scores(twitter_username, mastodon_username):
    """Score one user's tweets and Mastodon posts and append the averages to scores.csv.

    Reads every ``*.json`` file under ``./tweets/<twitter_username>/`` and
    ``./mastodon_posts/<server>_<user>/``, scores each text with
    ``query_perspective`` and appends one row holding the per-attribute mean
    for both platforms to ``./scores.csv``.

    Args:
        twitter_username: Name of the user's folder under ``./tweets``.
        mastodon_username: Profile URL such as
            ``https://techhub.social/@nateplusplus``; the server and user
            parts are joined with ``_`` to form the posts folder name.

    Returns:
        None. When either platform has no JSON files for this user, a message
        is printed and nothing is written.
    """
    print(f"Processing {twitter_username}")
    # Example mastodon user : https://techhub.social/@nateplusplus
    # extract mastodon server name
    mastodon_server = mastodon_username.split('https://')[1].split('/@')[0]
    # extract mastodon user name
    mastodon_user = mastodon_username.split('/@')[-1].split('/')[0]
    # build folder name as we constructed it before
    mastodon_folder = mastodon_server + '_' + mastodon_user

    tweets = glob.glob(f"./tweets/{twitter_username}/*.json")
    mastodon_posts = glob.glob(f"./mastodon_posts/{mastodon_folder}/*.json")
    # glob.glob returns an empty list (it does not raise) when the folder is
    # missing, so a missing user has to be detected by checking for emptiness.
    if not tweets or not mastodon_posts:
        print(f"This user is not present in one of the two socials, going next.")
        return

    twitter_scores = []
    mastodon_scores = []

    # Language codes seen so far in the Mastodon posts; the accumulated set is
    # sent as the language hint for every following request, tweets included.
    languages = set()
    print("scoring Mastodon")
    for post in tqdm(mastodon_posts):
        with open(post, 'r', encoding='utf-8') as post_file:
            post_json = json.load(post_file)
        language = post_json.get('language')
        if language:
            # Skip missing/null languages: they are not valid hints for the API.
            languages.add(language)
        scores = query_perspective(text=post_json['content'], languages=sorted(languages))
        mastodon_scores.append(_summary_values(scores))

    print("scoring Twitter")
    for tweet in tqdm(tweets):
        with open(tweet, 'r', encoding='utf-8') as tweet_file:
            tweet_json = json.load(tweet_file)
        scores = query_perspective(text=tweet_json['text'], languages=sorted(languages))
        twitter_scores.append(_summary_values(scores))

    mastodon_score = _mean_scores(mastodon_scores)
    twitter_score = _mean_scores(twitter_scores)

    # Same column order as the header: identifiers, Twitter means, Mastodon means.
    row = [twitter_username, mastodon_folder]
    row += [twitter_score[name] for name in _PERSPECTIVE_ATTRIBUTES]
    row += [mastodon_score[name] for name in _PERSPECTIVE_ATTRIBUTES]
    with open('./scores.csv', 'a+') as scores_file:
        scores_file.write("\n" + ",".join(str(value) for value in row))
        

            

In [10]:
# Driver: score every user from users.csv that is not yet recorded in
# users_processed.txt, recording each one as soon as it has been handled so an
# interrupted run can be resumed.
users = pd.read_csv('./users.csv', delimiter=',', index_col=None)

try:
    with open('./users_processed.txt', 'r') as users_processed_file:
        users_processed = users_processed_file.read().split(',')
except FileNotFoundError:
    # First run: nothing has been processed yet; the file is created on first append.
    users_processed = []
print(users_processed)

for idx, row in users.iterrows():
    # Compare as text: pandas may parse an all-digit username as a number,
    # which would never match the strings read back from the file.
    twitter_username = str(row['twitter_username'])
    if twitter_username in users_processed:
        continue
    process_users_scores(twitter_username, row['mastodon_username'])
    with open('./users_processed.txt', 'a+') as users_processed_file:
        users_processed_file.write(f"{twitter_username},")

['']
Processing piesdeperro
scoring Mastodon


100%|██████████| 2/2 [01:01<00:00, 30.69s/it]


scoring Twitter


 45%|████▌     | 214/473 [06:37<00:37,  6.96it/s] 