# topurbi_createmwpages_transcriptionalcedo.py (101 lines, 67 loc, 2.73 KB)
# -*- coding: utf-8 -*-
"""TopUrbi_createMWpages_transcriptionAlcedo.ipynb
Author: Carmen Brando (EHESS) - ANR TopUrbi
Automatically generated by Colaboratory.
"""
import pandas as pd

# Folder containing the two CSV exports; must end with a path separator,
# because the filenames below are appended by plain string concatenation.
csv_folder = "TO_FILL"

# Omeka item export: one row per item with its Omeka item id and media id.
ids_MW_pages = pd.read_csv(csv_folder + "omeka_items_export.csv")
# The ids are concatenated into MediaWiki page titles later, so force str.
ids_MW_pages['o:id'] = ids_MW_pages['o:id'].apply(str)
ids_MW_pages['o:media'] = ids_MW_pages['o:media'].apply(str)
# NOTE(review): the original notebook evaluated
# ids_MW_pages.set_index('o:title') WITHOUT assigning the result, so the
# frame keeps its default RangeIndex and the merge below aligns by ROW
# ORDER, not by title -- confirm the two CSV exports are row-aligned.

# Transcription export (semicolon-separated): one row per page, with a
# 'titre' (title) column and a 'transcr' (transcription text) column.
transcriptions_pages = pd.read_csv(csv_folder + "items-transcription.csv", sep=';')
# NOTE(review): transcriptions_pages.set_index('titre') was likewise
# evaluated but never assigned in the original notebook (no-op).

# Positional (index-on-index) merge of transcriptions with the Omeka ids.
transcriptions_wikipages = pd.merge(transcriptions_pages, ids_MW_pages,
                                    left_index=True, right_index=True)

# Normalise the transcription text for MediaWiki (regex=False: these are
# literal replacements; also pins the behavior across pandas versions,
# where the default for `regex` changed):
#  - Windows line endings become blank lines, because wikitext renders a
#    single newline as part of the same paragraph;
#  - a space right after a newline would trigger MediaWiki's
#    preformatted-text mode, so it is stripped.
transcriptions_wikipages['transcr'] = transcriptions_wikipages['transcr'].str.replace('\r\n', '\n\n', regex=False)
transcriptions_wikipages['transcr'] = transcriptions_wikipages['transcr'].str.replace('\n ', '\n', regex=False)

# Display options for inspection.  Full 'display.' keys: the short
# aliases 'max_colwidth' / 'max_rows' were deprecated and later removed.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 99)
import requests
# One shared HTTP session so that MediaWiki login cookies persist across
# the successive API calls made in create_edit_pagesMW_api below.
S = requests.Session()
# Base URL of the target MediaWiki api.php endpoint (host to be filled in).
URL = "http://TO_FILL/mediawiki/api.php"
def create_edit_pagesMW_api(row):
    """Create (or overwrite) the MediaWiki page for one transcription row.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'o:id', 'o:media' and 'transcr'.  The target page
        title is built as "2:<itemid>:<mediaid>".

    Returns
    -------
    dict
        Decoded JSON response of the MediaWiki 'edit' API call.

    Raises
    ------
    RuntimeError
        If the MediaWiki login does not succeed.

    NOTE(review): this logs in again for every row, which is wasteful but
    keeps each call self-contained; the shared session S retains cookies.
    """
    itemid = row['o:id']
    mediaid = row['o:media']
    text = row['transcr']

    # Step 1: GET request to fetch a login token.
    PARAMS_0 = {"action": "query", "meta": "tokens", "type": "login", "format": "json"}
    R = S.get(url=URL, params=PARAMS_0)
    DATA = R.json()
    LOGIN_TOKEN = DATA['query']['tokens']['logintoken']

    # Step 2: POST request to log in.  (Use of the main account for login
    # is discouraged by MediaWiki; a BotPassword is the recommended setup.)
    PARAMS_1 = {"action": "login", "lgname": "TO_FILL", "lgpassword": "TO_FILL",
                "lgtoken": LOGIN_TOKEN, "format": "json"}
    R = S.post(URL, data=PARAMS_1)
    login_data = R.json()
    if login_data.get('login', {}).get('result') != 'Success':
        # Fail loudly: with an anonymous session the CSRF token fetched
        # below would be invalid and every edit would silently fail.
        raise RuntimeError("MediaWiki login failed: %r" % (login_data,))

    # Step 3: GET request to fetch a CSRF (edit) token.
    PARAMS_2 = {"action": "query", "meta": "tokens", "format": "json"}
    R = S.get(url=URL, params=PARAMS_2)
    DATA = R.json()
    CSRF_TOKEN = DATA['query']['tokens']['csrftoken']

    # Step 4: POST request to create/replace the page content.
    PARAMS_3 = {"action": "edit",
                "title": "2:" + str(itemid) + ":" + str(mediaid),
                "token": CSRF_TOKEN,
                "format": "json",
                "text": text  # use "appendtext" instead to append rather than replace
                }
    print(PARAMS_3)
    R = S.post(URL, data=PARAMS_3)
    DATA = R.json()
    print(DATA)
    return DATA
# Push every transcription to the wiki; axis=1 hands each row (a Series)
# to create_edit_pagesMW_api.  The collected API responses are saved so
# the run can be audited afterwards.
response = transcriptions_wikipages.apply(create_edit_pagesMW_api, axis=1) # axis=1 is important to use the row itself
response.to_csv("output_createpages.csv")