-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathautoritiesCheck.py
95 lines (72 loc) · 2.76 KB
/
autoritiesCheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import pprint
import MySQLdb
import re
import json
import csv
import gzip # Using gzip file
# Pretty-printer kept at module level so debugging output can use it.
pp = pprint.PrettyPrinter(indent=4)

# Command-line interface: every option is optional and takes a file path.
parser = argparse.ArgumentParser(
    description="Script for parsing authorities from dumps and DB"
)
for _option, _help_text in (
    ("-dump", "Path to Dump file"),
    ("-authorities", "Path to Authorities file"),
    ("-config", "Path to a JSON file with configuration options!"),
):
    parser.add_argument(_option, help=_help_text)
args = parser.parse_args()
# Module-level state shared by the rest of the script.
data = {}          # parsed JSON configuration (stays empty when -config is not given)
authorities = {}   # filled later from the -authorities file
mysqlmode = False  # set to True once a "mysql" section is found in the configuration
conn = None        # MySQL connection, opened below when the configuration provides one

# argparse always defines `args.config` (None when the option is omitted),
# so testing for None is sufficient.
if args.config is not None:
    with open(args.config) as json_data_file:
        data = json.load(json_data_file)

if "mysql" in data:
    mysqlmode = True
    mysql_cfg = data["mysql"]
    conn = MySQLdb.connect(
        host=mysql_cfg["host"],
        user=mysql_cfg["user"],
        passwd=mysql_cfg["password"],
        db=mysql_cfg["database"],
        use_unicode=True,
        charset='utf8',
        init_command='SET NAMES UTF8',
    )

if conn is None:
    # Nothing below can work without a database connection. Raising
    # SystemExit with a message reports it on stderr and ends the process
    # with a non-zero status; unlike the `exit()` helper it does not depend
    # on the `site` module being loaded.
    raise SystemExit("NO CONNECTION")
def addToDb( id, props, conn, iter ):
    """Insert one (id, authority) row per property and commit periodically.

    Parameters:
        id: value written to the `id` column of every inserted row.
        props: iterable of values written to the `authority` column.
        conn: database connection; only ``cursor()`` and ``commit()`` are used.
        iter: counter of calls since the last commit, as returned by the
            previous call (start with 0).

    Returns:
        The updated counter: 0 when this call committed, ``iter + 1`` otherwise.

    The parameter names shadow the ``id`` and ``iter`` builtins; they are kept
    unchanged so existing callers keep working.
    """
    records = [(id, prop) for prop in props]
    c = conn.cursor()
    try:
        c.executemany( "INSERT INTO `authorities` (`id`, `authority`) VALUES ( %s, %s )", records )
    finally:
        # A cursor is created on every call; close it so cursors do not
        # accumulate while a large input is being processed.
        c.close()
    # Commit in batches rather than per call: once the counter has passed
    # 10000 the pending inserts are committed and the counter restarts.
    if iter > 10000:
        conn.commit()
        return 0
    return iter + 1
# Load the set of authority identifiers from a tab-separated file. The third
# column of every row becomes a key of `authorities` (the value 1 is only a
# membership marker). Presumably that column holds property ids such as
# "P214", since the keys are later compared against P-ids found in the dump
# -- verify against the actual file.
if args.authorities is not None:
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(args.authorities, newline="") as authorities_file:
        for row in csv.reader(authorities_file, delimiter='\t'):
            # Blank or truncated lines have no third column; skip them
            # instead of failing with IndexError.
            if len(row) > 2:
                authorities[row[2]] = 1
# Import the gzip-compressed dump: for every line that carries an entity id
# ("id":"Q<digits>"), store which of the known authority properties appear on
# that line.
if args.dump is not None:
    # Rebuild the target table from scratch on every run.
    cur = conn.cursor()
    try:
        cur.execute("DROP TABLE IF EXISTS `authorities`;")
        cur.execute("CREATE TABLE IF NOT EXISTS `authorities` ( `id` VARCHAR(25), `authority` VARCHAR(25), PRIMARY KEY (`id`, `authority`) ) ;")
        cur.execute("CREATE INDEX idx_id ON authorities (id);")
        cur.execute("CREATE INDEX idx_authorities ON authorities (authority);")
    finally:
        cur.close()

    # Compile the patterns once instead of looking them up for every line.
    entity_id_re = re.compile(r'\"id\":\"(Q\d+)\"')
    property_re = re.compile(r'\"(P\d+)\"')

    # Counter handed to and returned by addToDb to drive its periodic commit.
    # Named so that it does not shadow the `iter` builtin at module level.
    pending = 0
    with gzip.open(args.dump, 'rt') as f:
        for line in f:
            # Only the first id on the line is used as the row id.
            id_match = entity_id_re.search(line)
            if id_match is None:
                # No entity id on this line: nothing to attribute properties to.
                continue
            # A set removes duplicates, which would otherwise violate the
            # (id, authority) primary key.
            matched = {prop for prop in property_re.findall(line) if prop in authorities}
            if matched:
                pending = addToDb(id_match.group(1), list(matched), conn, pending)
    # Flush whatever was inserted since the last periodic commit.
    conn.commit()