In [1]:
from tqdm.auto import tqdm

In [2]:
import django

In [3]:
from api.models import Incident, Report

# Load AIID data

In [4]:
import pandas as pd

def load_datasets():
    incidents = pd.read_csv('../data_backup/mongodump/incidents.csv')
    incidents.columns = incidents.columns.str.lower()
    cols = [
        'alleged deployer of ai system',
        'alleged developer of ai system',
        'alleged harmed or nearly harmed parties',
        'reports'
    ]
    for c in cols:
        incidents[c] = incidents[c].apply(eval)

    reports = pd.read_csv('../data_backup/mongodump/reports.csv')
    reports.columns = reports.columns.str.lower()

    cols = ['submitters']
    for c in cols:
        reports[c] = reports[c].apply(eval)

    date_cols = reports.columns[reports.columns.str.startswith('date')]
    for c in date_cols:
        reports[c] = pd.to_datetime(reports[f'epoch_{c}'], unit='s')

    return incidents, reports

incidents, reports = load_datasets()

In [4]:
incidents.head()

Unnamed: 0,_id,incident_id,date,reports,alleged deployer of ai system,alleged developer of ai system,alleged harmed or nearly harmed parties,description,title
0,ObjectId(625763de343edc875fe63a15),23,2017-11-08,"[242, 243, 244, 245, 246, 247, 248, 249, 250, ...","[navya, keolis-north-america]","[navya, keolis-north-america]","[navya, keolis-north-america, bus-passengers]",A self-driving public shuttle by Keolis North ...,Las Vegas Self-Driving Bus Involved in Accident
1,ObjectId(625763dc343edc875fe63a02),4,2018-03-18,"[629, 630, 631, 632, 633, 634, 635, 636, 637, ...",[uber],[uber],"[elaine-herzberg, pedestrians]",An Uber autonomous vehicle (AV) in autonomous ...,Uber AV Killed Pedestrian in Arizona
2,ObjectId(625763db343edc875fe639ff),1,2015-05-19,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15]",[youtube],[youtube],[children],YouTube’s content filtering and recommendation...,Google’s YouTube Kids App Presents Inappropria...
3,ObjectId(625763de343edc875fe63a10),18,2015-04-04,"[130, 131, 132, 133, 134, 135, 136, 137, 138, ...",[google],[google],[women],Google Image returns results that under-repres...,Gender Biases of Google Image Search
4,ObjectId(625763dd343edc875fe63a0a),12,2016-07-21,[42],"[microsoft-research, boston-university]","[microsoft-research, google, boston-university]","[women, minority-groups]",Researchers from Boston University and Microso...,Common Biases of Vector Embeddings


In [5]:
reports.head()

Unnamed: 0,_id,incident_id,authors,date_downloaded,date_modified,date_published,date_submitted,description,epoch_date_downloaded,epoch_date_modified,...,image_url,language,ref_number,report_number,source_domain,submitters,text,title,url,tags
0,ObjectId(5d34b8c29ced494f010ed45a),,"[""Alistair Barr""]",2019-04-13,2020-06-14,2015-05-19,2019-06-01,Child and consumer advocacy groups complained ...,1555113600,1592092800,...,http://si.wsj.net/public/resources/images/BN-I...,en,,1,blogs.wsj.com,[Roman Yampolskiy],Child and consumer advocacy groups complained ...,Google’s YouTube Kids App Criticized for ‘Inap...,https://blogs.wsj.com/digits/2015/05/19/google...,[]
1,ObjectId(5d34b8c29ced494f010ed461),,"[""Sapna Maheshwari""]",2019-04-13,2020-06-14,2018-04-26,2019-06-01,Parents will be able to handpick the channels ...,1555113600,1592092800,...,https://static01.nyt.com/images/2017/11/07/bus...,en,,8,nytimes.com,[Roman Yampolskiy],"YouTube Kids, which has been criticized for in...","YouTube Kids, Criticized for Content, Introduc...",https://www.nytimes.com/2018/04/25/business/me...,[]
2,ObjectId(5d34b8c29ced494f010ed464),,"[""K.G Orphanides""]",2019-04-13,2020-06-14,2018-03-23,2019-06-01,Children's search terms on YouTube are still a...,1555113600,1592092800,...,https://wi-images.condecdn.net/image/ye8GWkPPM...,en,,11,wired.co.uk,[Roman Yampolskiy],Video still of a reproduced version of Minnie ...,Children's YouTube is still churning out blood...,https://www.wired.co.uk/article/youtube-for-ki...,[]
3,ObjectId(5d34b8c29ced494f010ed45b),,"[""Phoebe Weston""]",2019-04-13,2020-06-14,2018-02-07,2019-06-01,Investigators found several unsuitable videos ...,1555113600,1592092800,...,https://i.dailymail.co.uk/i/pix/2018/02/06/15/...,en,,2,dailymail.co.uk,[Roman Yampolskiy],Google-owned YouTube has apologised again afte...,YouTube Kids app is STILL showing disturbing v...,https://www.dailymail.co.uk/sciencetech/articl...,[]
4,ObjectId(5d34b8c29ced494f010ed462),,"[""James Cook""]",2019-04-13,2020-06-14,2018-03-17,2019-06-01,YouTube removed videos from conspiracy theoris...,1555113600,1592092800,...,https://amp.businessinsider.com/images/5aaa960...,en,,9,businessinsider.com,[Roman Yampolskiy],Children were able to watch David Icke's consp...,YouTube suggested conspiracy videos to childre...,https://www.businessinsider.com/youtube-sugges...,[]


In [5]:
qs = Report.objects.filter(report_number__in=list(incidents['reports'].loc[5]))

In [6]:
qs

<QuerySet [<Report: Report object (135)>, <Report: Report object (136)>, <Report: Report object (144)>, <Report: Report object (145)>, <Report: Report object (146)>, <Report: Report object (152)>, <Report: Report object (153)>, <Report: Report object (155)>, <Report: Report object (157)>, <Report: Report object (163)>, <Report: Report object (164)>, <Report: Report object (175)>, <Report: Report object (179)>, <Report: Report object (185)>, <Report: Report object (186)>, <Report: Report object (191)>, <Report: Report object (192)>, <Report: Report object (195)>, <Report: Report object (215)>, <Report: Report object (258)>, '...(remaining elements truncated)...']>

In [7]:
reports[reports['report_number'].isin(incidents['reports'].loc[5])]['report_number']

134    58
135    73
143    60
144    63
145    65
151    69
152    72
154    61
156    57
162    64
163    68
174    67
178    62
184    66
185    70
190    59
191    75
194    76
214    79
257    81
271    74
279    77
298    78
299    80
Name: report_number, dtype: int64

In [8]:
reports.report_number

0          1
1          8
2         11
3          2
4          9
        ... 
3012    3200
3013    3201
3014    3202
3015    3203
3016    3204
Name: report_number, Length: 3017, dtype: int64

# Save to Django database

In [7]:
# saving the reports

for ind, row in tqdm(reports.iterrows(), total=len(reports)):
    r = Report(authors=row['authors'],
               date_downloaded=row['date_downloaded'],
               date_modified=row['date_modified'],
               date_published=row['date_published'],
               date_submitted=row['date_submitted'],
               description=row['description'],
               image_url=row['image_url'],
               language=row['language'],
               report_number = row['report_number'],
               source_domain = row['source_domain'],
               submitters = ','.join(row['submitters']),
               text = row['text'],
               title = row['title'],
               url = row['url'])
    r.save()

  0%|          | 0/3017 [00:00<?, ?it/s]



## Check saved reports

In [9]:
### Get Report ids by incident id, so that it can be used when dumping the incident data
qs = Report.objects.filter(report_number__in=list(incidents['reports'].loc[5]))

## Load incidents

In [17]:
for ind, row in tqdm(incidents.iterrows(), total=len(incidents)):
    # get related reports
    reports = Report.objects.filter(report_number__in=list(incidents['reports'].loc[5])).all()
    
    i = Incident(incident_id = row['incident_id'],
                 date = row['date'],
                 algd_deployer = row['alleged deployer of ai system'],
                 algd_developer = row['alleged developer of ai system'],
                 algd_harm = row['alleged harmed or nearly harmed parties'],
                 description = row['description'],
                 title = row['title'])
    i.save()
    
    i.reports.set(reports)
    i.save()
    
    # reports

  0%|          | 0/551 [00:00<?, ?it/s]



### Check incidents related to reports

In [24]:
i = Incident.objects.get(incident_id=23)
i

<Incident: Incident object (552)>

In [25]:
i.reports.all()

<QuerySet [<Report: Report object (135)>, <Report: Report object (136)>, <Report: Report object (144)>, <Report: Report object (145)>, <Report: Report object (146)>, <Report: Report object (152)>, <Report: Report object (153)>, <Report: Report object (155)>, <Report: Report object (157)>, <Report: Report object (163)>, <Report: Report object (164)>, <Report: Report object (175)>, <Report: Report object (179)>, <Report: Report object (185)>, <Report: Report object (186)>, <Report: Report object (191)>, <Report: Report object (192)>, <Report: Report object (195)>, <Report: Report object (215)>, <Report: Report object (258)>, '...(remaining elements truncated)...']>

In [30]:
r = i.reports.all()[0]
r

<Report: Report object (135)>

In [32]:
r.report_number

58

In [33]:
r2 = Report.objects.get(report_number=58)

In [36]:
r2.incident_set.all()

<QuerySet [<Incident: Incident object (552)>, <Incident: Incident object (553)>, <Incident: Incident object (554)>, <Incident: Incident object (555)>, <Incident: Incident object (556)>, <Incident: Incident object (557)>, <Incident: Incident object (558)>, <Incident: Incident object (559)>, <Incident: Incident object (560)>, <Incident: Incident object (561)>, <Incident: Incident object (562)>, <Incident: Incident object (563)>, <Incident: Incident object (564)>, <Incident: Incident object (565)>, <Incident: Incident object (566)>, <Incident: Incident object (567)>, <Incident: Incident object (568)>, <Incident: Incident object (569)>, <Incident: Incident object (570)>, <Incident: Incident object (571)>, '...(remaining elements truncated)...']>

In [39]:
cd aiidprod

/Users/scottcambo/Documents/GitHub/aiid_demos/notebooks/aiidprod


In [40]:
ls -lh

total 549368
-rw-r--r--@ 1 scottcambo  staff   201M Oct 28 01:11 candidates.bson
-rw-r--r--@ 1 scottcambo  staff   343B Oct 28 01:11 candidates.metadata.json
-rw-r--r--@ 1 scottcambo  staff   2.0M Oct 28 01:11 classifications.bson
-rw-r--r--@ 1 scottcambo  staff   140B Oct 28 01:11 classifications.metadata.json
-rw-r--r--@ 1 scottcambo  staff   869B Oct 28 01:11 duplicates.bson
-rw-r--r--@ 1 scottcambo  staff   140B Oct 28 01:11 duplicates.metadata.json
-rw-r--r--@ 1 scottcambo  staff   111K Oct 28 01:11 entities.bson
-rw-r--r--@ 1 scottcambo  staff   242B Oct 28 01:11 entities.metadata.json
-rw-r--r--@ 1 scottcambo  staff   5.6M Oct 28 01:11 incidents.bson
-rw-r--r--@ 1 scottcambo  staff   279B Oct 28 01:11 incidents.metadata.json
-rw-r--r--@ 1 scottcambo  staff   5.6K Oct 28 01:11 migrations.bson
-rw-r--r--@ 1 scottcambo  staff   140B Oct 28 01:11 migrations.metadata.json
-rw-r--r--@ 1 scottcambo  staff   789B Oct 28 01:11 quickadd.bson
-rw-r--r--@ 1 scottcambo  staff   140B Oct 28 0

# Load with Django-Pandas

In [13]:
from django_pandas.io import read_frame
qs = Incident.objects.all()