In [3]:
import json
from collections import defaultdict
from pprint import pprint

import dateutil
import dateutil.parser  # `import dateutil` alone does not guarantee the `parser` submodule is loaded
from bson.objectid import ObjectId
from pymongo import MongoClient

In [4]:
# Read the MongoDB connection string from the local secrets file so the
# credential never appears in the notebook itself.
with open("secrets.json", "r") as secrets:
    secrets_payload = json.load(secrets)
mongo_uri = secrets_payload["mongo_uri"]


In [3]:
# Create the MongoDB client from the loaded URI and select the "aggie" database.
client = MongoClient(host=mongo_uri)
db = client["aggie"]

In [52]:
# Connection smoke test: count the reports whose `_sourceNicknames` is ["ct"].
ct_only_filter = {'$and': [{'_sourceNicknames': ["ct"]}]}
facebook = db.reports.count_documents(ct_only_filter)
print(facebook)

225289


In [5]:
# Find the start date/time of hate-speech (HS) scoring: the earliest
# `fetchedAt` among reports that carry a `metadata.hateSpeechScore` field.
# The bulky `metadata.rawAPIResponse` sub-document is excluded from the result,
# and `limit(1)` asks the server for the earliest report only instead of the
# whole sorted match set.
hs_reports_sorted = (
    db.reports
    .find({"metadata.hateSpeechScore": {'$exists': True}}, {'metadata.rawAPIResponse': 0})
    .sort([("fetchedAt", 1)])
    .limit(1)
)

# `next(..., None)` avoids an unexplained StopIteration when no report matches.
first_hs_report = next(hs_reports_sorted, None)
if first_hs_report is None:
    print("No report with a hate speech score was found")
else:
    print("First Hate Speech report fetched at : " + str(first_hs_report['fetchedAt']))


First Hate Speech report fetched at : 2020-08-05 06:01:12.969000


In [19]:
# Size of the whole reports collection (an empty filter matches every document).
match_everything = {}
total_reports = db.reports.count_documents(match_everything)
print("Total Reports : {}".format(total_reports))

Total Reports : 551676


In [18]:
# Sanity check: among reports whose `_sourceNicknames` is ["fb"] or ["ct"] and
# that were fetched at or before 2020-11-22 18:40:32.568, the count must be the
# same with and without the requirement that `metadata.hateSpeechScore` exists.
# NOTE(review): both filters only bound `fetchedAt` from above, so reports
# fetched before HS activation are included too -- confirm this is intended.
fetched_up_to_cutoff = {'fetchedAt': {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}}

all_fb_ct_filter = {'$and': [
    {'$or': [{"_sourceNicknames": ["fb"]}, {"_sourceNicknames": ["ct"]}]},
    fetched_up_to_cutoff,
]}
scored_fb_ct_filter = {'$and': [
    {"metadata.hateSpeechScore": {'$exists': True}},
    {'$or': [{"_sourceNicknames": ["ct"]}, {"_sourceNicknames": ["fb"]}]},
    fetched_up_to_cutoff,
]}

num_reports_after_hs_activation = db.reports.count_documents(all_fb_ct_filter)
total_hs_reports = db.reports.count_documents(scored_fb_ct_filter)
print(num_reports_after_hs_activation)
assert num_reports_after_hs_activation == total_hs_reports

502454


In [20]:
# Per the original note, a null HS score means the report is not in Burmese or
# has no content; here only reports with a non-null score are counted
# (`_sourceNicknames` is ["ct"] or ["fb"], fetched at or before the cutoff).
scored_report_conditions = [
    {"metadata.hateSpeechScore": {'$exists': True}},
    {"metadata.hateSpeechScore": {'$ne': None}},
    {'$or': [{"_sourceNicknames": ["ct"]}, {"_sourceNicknames": ["fb"]}]},
    {'fetchedAt': {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
non_null_hs_reports_count = db.reports.count_documents({'$and': scored_report_conditions})
print("Total non null HS FB reports : " + str(non_null_hs_reports_count))
# hs_reports_sorted = db.reports.find_one({'$and': [{'fetchedAt':{'$gte': dateutil.parser.parse('2020-10-16T00:00:00.000+00:00')}},{"metadata.hateSpeechScore": { '$exists': True}}, {"metadata.hateSpeechScore":{'$ne' : None}}]}, {'metadata.rawAPIResponse': 0})
# pprint(hs_reports_sorted)

Total non null HS FB reports : 207297


In [21]:
# Read/unread: how many of the non-null-score reports are flagged `read`, and
# what fraction of all non-null-score reports that represents.
read_scored_conditions = [
    {"metadata.hateSpeechScore": {'$exists': True}},
    {"metadata.hateSpeechScore": {'$ne': None}},
    {'$or': [{"_sourceNicknames": ["ct"]}, {"_sourceNicknames": ["fb"]}]},
    {"read": True},
    {"fetchedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
non_null_hs_reports_read_count = db.reports.count_documents({'$and': read_scored_conditions})
read_fraction = non_null_hs_reports_read_count / non_null_hs_reports_count
print("Total non null HS FB reports read: {} ({})".format(non_null_hs_reports_read_count, read_fraction))

Total non null HS FB reports read: 93058 (0.44891146519245334)


In [22]:
# Read reports (`_sourceNicknames` ["ct"] or ["fb"], fetched at or before the
# cutoff) with an HS score of at least 0.45, plus their share of all read
# reports that have a non-null score.
conditions_45 = [
    {'$or': [{"_sourceNicknames": ["ct"]}, {"_sourceNicknames": ["fb"]}]},
    {"metadata.hateSpeechScore": {'$gte': 0.45}},
    {"read": True},
    {"fetchedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
hs_reports_45 = db.reports.count_documents({'$and': conditions_45})
share_45 = hs_reports_45 / non_null_hs_reports_read_count
print("Total non null HS FB reports 0.45: {} ({})".format(hs_reports_45, share_45))


Total non null HS FB reports 0.45: 45209 (0.48581529798620215)


In [24]:
# Read reports (`_sourceNicknames` ["ct"] or ["fb"], fetched at or before the
# cutoff) with an HS score of at least 0.55, plus their share of all read
# reports that have a non-null score.
conditions_55 = [
    {'$or': [{"_sourceNicknames": ["ct"]}, {"_sourceNicknames": ["fb"]}]},
    {"metadata.hateSpeechScore": {'$gte': 0.55}},
    {"read": True},
    {"fetchedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
hs_reports_55 = db.reports.count_documents({'$and': conditions_55})
share_55 = hs_reports_55 / non_null_hs_reports_read_count
print("Total non null HS FB reports 0.55: {} ({})".format(hs_reports_55, share_55))

Total non null HS FB reports 0.55: 29681 (0.31895162156934387)


In [25]:
# Read reports (`_sourceNicknames` ["ct"] or ["fb"], fetched at or before the
# cutoff) with an HS score of at least 0.65, plus their share of all read
# reports that have a non-null score.
conditions_65 = [
    {'$or': [{"_sourceNicknames": ["ct"]}, {"_sourceNicknames": ["fb"]}]},
    {"metadata.hateSpeechScore": {'$gte': 0.65}},
    {"read": True},
    {"fetchedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
hs_reports_65 = db.reports.count_documents({'$and': conditions_65})
share_65 = hs_reports_65 / non_null_hs_reports_read_count
print("Total non null HS FB reports 0.65: {} ({})".format(hs_reports_65, share_65))

Total non null HS FB reports 0.65: 1010 (0.01085344623783017)


In [26]:
# Incidents stored at or before the cutoff whose title contains "hate" or "Hate".
hate_title_conditions = [
    {"title": {"$regex": ".*[hH]ate.*"}},
    {"storedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
hs_incidents = db.incidents.count_documents({'$and': hate_title_conditions})
print("HS incidents : {}".format(hs_incidents))


HS incidents : 566


In [27]:
# "hate"/"Hate"-titled incidents stored at or before the cutoff that have no
# reports attached (`totalReports` == 0).
zero_report_conditions = [
    {'totalReports': 0},
    {"title": {'$regex': ".*[hH]ate.*"}},
    {"storedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
hs_incidents_0_reports = db.incidents.count_documents({'$and': zero_report_conditions})
print("HS incidents with 0 reports: {}".format(hs_incidents_0_reports))

HS incidents with 0 reports: 8


In [28]:
# "hate"/"Hate"-titled incidents stored at or before the cutoff that have
# exactly one report attached (`totalReports` == 1).
single_report_conditions = [
    {'totalReports': 1},
    {"title": {'$regex': ".*[hH]ate.*"}},
    {"storedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
hs_incidents_1_report = db.incidents.count_documents({'$and': single_report_conditions})
print("HS incidents with 1 report : {}".format(hs_incidents_1_report))

HS incidents with 1 report : 433


In [29]:
# "hate"/"Hate"-titled incidents stored at or before the cutoff that have more
# than one report attached (`totalReports` > 1).
multi_report_conditions = [
    {'totalReports': {'$gt': 1}},
    {"title": {'$regex': ".*[hH]ate.*"}},
    {"storedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}},
]
hs_incidents_gt_1_reports = db.incidents.count_documents({'$and': multi_report_conditions})
print("HS incidents with > 1 report and HS in title: {}".format(hs_incidents_gt_1_reports))

HS incidents with > 1 report and HS in title: 125


In [22]:
# no longer necessary because if HS is in title..it applies to all reports
# print("Incidents with multiple tags including hate speech (can't determine if reports are HS) : {}".format(hs_incidents - (hs_incidents_1_report + hs_incidents_gt_1_reports)))

In [9]:
# Materialise the HS incidents ("hate"/"Hate" in the title, stored at or before
# the cutoff) that have at least one report: first those with exactly one
# report, then those with more than one.
hate_in_title = {"title": {'$regex': ".*[hH]ate.*"}}
stored_up_to_cutoff = {"storedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}}

single_report_filter = {'$and': [{'totalReports': 1}, hate_in_title, stored_up_to_cutoff]}
multi_report_filter = {'$and': [{'totalReports': {'$gt': 1}}, hate_in_title, stored_up_to_cutoff]}

hs_incidents_1_report_list = list(db.incidents.find(single_report_filter))
hs_incidents_gt_1_reports_list = list(db.incidents.find(multi_report_filter))
hs_incidents_list = [*hs_incidents_1_report_list, *hs_incidents_gt_1_reports_list]
print(len(hs_incidents_list))

558


In [10]:
# String form of every HS incident `_id`, used later for cross-referencing
# against the `_incident` values stored on reports.
hs_incidents_ids = list(map(str, (incident["_id"] for incident in hs_incidents_list)))

In [11]:
# 1) Count the classifiable reports: non-null `metadata.hateSpeechScore`,
#    `_sourceNicknames` of ["ct"] or ["fb"], fetched at or before the cutoff.
# 2) Count, then load, the reports whose `_incident` field exists and is
#    neither null nor an empty string (same cutoff), leaving out the bulky
#    `metadata.rawAPIResponse` when loading.
# NOTE(review): `fetchedAt` is only bounded from above here, although the
# labels say "after HS activation" -- confirm this is intended.
fetched_up_to_cutoff = {"fetchedAt": {'$lte': dateutil.parser.parse('2020-11-22 18:40:32.568000')}}

classifiable_filter = {'$and': [
    {"metadata.hateSpeechScore": {'$exists': True}},
    {"metadata.hateSpeechScore": {'$ne': None}},
    {'$or': [{"_sourceNicknames": ["ct"]}, {"_sourceNicknames": ["fb"]}]},
    fetched_up_to_cutoff,
]}
classifiable_hs_reports = db.reports.count_documents(classifiable_filter)
print("Classifiable HS reports (has content in Burmese language and fetched after HS activation) : {}".format(classifiable_hs_reports))

linked_to_incident_filter = {'$and': [
    {"_incident": {'$exists': True}},
    {"_incident": {'$nin': [None, ""]}},
    fetched_up_to_cutoff,
]}
non_null_incidents_reports_after_hs_activation = db.reports.count_documents(linked_to_incident_filter)
print("Non null incident field in reports after HS activation : {}".format(non_null_incidents_reports_after_hs_activation))

linked_reports_cursor = db.reports.find(linked_to_incident_filter, {"metadata.rawAPIResponse": 0})
non_null_incidents_reports_after_hs_activation_list = list(linked_reports_cursor)

Classifiable HS reports (has content in Burmese language and fetched after HS activation) : 207297
Non null incident field in reports after HS activation : 4560


In [12]:
# Group the loaded reports by the incident they reference:
# `_incident` value -> list of report documents.
incident_report_dict = defaultdict(list)
for report in non_null_incidents_reports_after_hs_activation_list:
    incident_key = report["_incident"]
    incident_report_dict[incident_key].append(report)

unique_incident_total = len(incident_report_dict)
print("Unique incidents created between the timeframe : {}".format(unique_incident_total))

Unique incidents created between the timeframe : 2006


In [13]:
# Intersect the incident ids referenced by reports with the ids of the HS
# incidents, giving the HS incidents that actually have reports attached.
report_incidents_ids = list(incident_report_dict)
ids_seen_in_reports = set(report_incidents_ids)
known_hs_incident_ids = set(hs_incidents_ids)
hs_reports_incidents_ids = list(ids_seen_in_reports & known_hs_incident_ids)
print("# of HS incidents created in the timeframe that are referenced in the reports collection : {}".format(len(hs_reports_incidents_ids)))

# of HS incidents created in the timeframe that are referenced in the reports collection : 556


In [14]:
# Collect the report lists that belong to HS incidents, one list per incident.
# Membership is checked against a set so each lookup is constant time instead
# of a scan over the id list; iterating the dict keeps the original grouping
# order of the result.
hs_incident_id_lookup = set(hs_reports_incidents_ids)
reports_in_hs_incidents = [
    incident_reports
    for incident_id, incident_reports in incident_report_dict.items()
    if incident_id in hs_incident_id_lookup
]
print(len(reports_in_hs_incidents))

556


In [16]:
# Flatten the per-incident report lists into one list of reports, keeping the
# incident order and the order of the reports inside each incident.
reports_in_hs_incidents_dcntr = [
    report
    for incident_reports in reports_in_hs_incidents
    for report in incident_reports
]

print("Reports added to HS incidents after HS activation : {}".format(len(reports_in_hs_incidents_dcntr)))

Reports added to HS incidents after HS activation : 1092


In [17]:
# Keep only the reports in HS incidents that carry a non-null
# `metadata.hateSpeechScore`. `.get` is used so that a report without a
# `metadata` sub-document (or without the score key) is skipped instead of
# raising KeyError.
reports_in_hs_incidents_with_burmese = [
    r for r in reports_in_hs_incidents_dcntr
    if (r.get("metadata") or {}).get("hateSpeechScore") is not None
]
print("Classifiable reports with Burmese content : {}".format(len(reports_in_hs_incidents_with_burmese)))

# Dump the scores one per line, each followed by a comma.
for r in reports_in_hs_incidents_with_burmese:
    print(str(r["metadata"]["hateSpeechScore"]) + ",")

Classifiable reports with Burmese content : 308
0.4187661932748155,
0.6612644845706683,
0.6589945679848431,
0.6250412186244548,
0.039547387896563396,
0.21029909854363676,
0.18789589975200913,
0.10346899923253268,
0.3965140180163364,
0.6250412186244548,
0.3085547522786059,
0.2906474554435688,
0.5998689327914108,
0.6137972165136802,
0.34499680180780956,
0.34499680180780956,
0.3102394593702947,
0.4253384644028445,
0.5668188107708787,
0.599035498007595,
0.32233448649842517,
0.5441910924969257,
0.06494117647058824,
0.4070938142082908,
0.11106370916897232,
0.42820926502569745,
0.6115014686179403,
0.4414823742240098,
0.1390024953466172,
0.5877479148899869,
0.6523378723593264,
0.6523378723593264,
0.6157828508315746,
0.5764496677966591,
0.6330227148522344,
0.599035498007595,
0.617747342449965,
0.6523378723593264,
0.2250053930408801,
0.19520745153048943,
0.6250412186244548,
0.4357738615219883,
0.24075299330217736,
0.24075299330217736,
0.40381097099001745,
0.5266472254533126,
0.6250412186244548,


In [47]:
# Classifiable reports in HS incidents whose HS score is at least 0.45, and
# their share of all classifiable reports in HS incidents.
classifiable_reports_in_hs_incidents_45 = [
    r for r in reports_in_hs_incidents_with_burmese
    if r["metadata"]["hateSpeechScore"] >= 0.45
]

count_45 = len(classifiable_reports_in_hs_incidents_45)
classifiable_total = len(reports_in_hs_incidents_with_burmese)
# With no classifiable reports the share is undefined; report NaN rather than
# failing with ZeroDivisionError.
share_45 = count_45 / classifiable_total if classifiable_total else float("nan")

# The label says ">=" because the threshold is inclusive, like the `$gte`
# queries used for the collection-wide counts.
print("Classifiable reports with Burmese content in HS incidents with HS score >= 0.45: {} ({})".format(count_45, share_45))

Classifiable reports with Burmese content in HS incidents with HS score > 0.45: 171 (0.5551948051948052)
