/
smell_datamine_multiprocessing.py
264 lines (214 loc) · 10.4 KB
/
smell_datamine_multiprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
'''This script datamines the reports using simple Python operations to find smell-related words in the reports and
categorises them by smell type. Only the nltk sentence tokenizer is used.
Results are stored in SQLite. Example OCR text from report b19952788: "Offensive Effluvia.—every purveyor of milk, or person sell milk by"
'''
import progressbar
from map2 import mapping
import concurrent.futures
from timeit import default_timer as timer
# walk through the os and get all files
# read each file in turn and go through it line by line
# problematic reports: Acton.1915.b19783905.txt LondonCountyCouncil.1929.b1825276x.txt
# PortandCityofLondon.1975.b19884084.txt PortandCityofLondon.1976.b19884096.txt
# PortandCityofLondon.1978.b19884114.txt
from os import listdir
import nltk.data
import dataset
# Word stems searched for (substring match) to flag a sentence as smell-related.
SMELL_WORDS = ['smell', 'stench', 'stink', 'odour', 'sniff', 'effluvium', 'aroma', 'pungent', 'pungency']
# Directory holding the full-text report files, relative to this script.
REPORTS_DIR = '../Full text'
class SmellType(object):
    """A named smell category together with the synonym stems that identify it."""

    def __init__(self, name, synonyms):
        # name: category label written to the database; synonyms: stems
        # substring-matched against lemmatized sentences.
        self.name, self.synonyms = name, synonyms
class Smell(object):
    """One smell-related sentence found in a report, plus its provenance."""

    def __init__(self, borough, category, sentence, year, bID, url, mohRegion):
        self.borough = borough
        self.category = category
        self.sentence = sentence
        self.year = year
        self.bID = bID              # Wellcome Library item identifier
        self.url = url              # item URL derived from bID
        self.mohRegion = mohRegion  # MOH region name from the map2 mapping

    def __repr__(self):
        # Deliberately omits bID/url/mohRegion: identity for comparison
        # purposes is just these four fields (as in the original code).
        return "Smell(%s, %s, %s, %s)" % (repr(self.borough), repr(self.category), repr(self.sentence), repr(self.year))

    def __eq__(self, other):
        # FIX: the original compared repr() strings, which is fragile and
        # (by defining __eq__ without __hash__) made instances unhashable.
        # Compare the same four fields structurally instead.
        if not isinstance(other, Smell):
            return NotImplemented
        return (self.borough, self.category, self.sentence, self.year) == \
               (other.borough, other.category, other.sentence, other.year)

    def __hash__(self):
        # Keep hash consistent with __eq__ so instances work in sets/dicts.
        return hash((self.borough, self.category, self.sentence, self.year))
# TEMPLATE: category = SmellType('category_name', ['synonym1', 'synonym2'])
# Smell categories here
sewer = SmellType('sewer', ['sewer', 'drain', 'sewage', 'manhole', 'gully', 'gulley', 'cesspool', 'ventilator', 'ventilation'])
thames = SmellType('thames', ['quay', 'sediment', 'thames', 'river'])
water = SmellType('water', ['water'])
waste_rubbish = SmellType('waste-rubbish', ['refuse', 'dust', 'waste', 'dump', 'rubbish', 'offensive matter'])
waste_excrement = SmellType('waste-excrement', ['excrement', 'excreta', 'privy', 'manure', 'dung'])
food = SmellType('food', ['food', 'stock', 'yeast', 'pie', 'sauce', 'lemonade', 'bread', 'onion', 'vinegar', 'cherry', 'flavour', 'coffee', 'chocolate', 'cream', 'fruit', 'vegetable', 'salad', 'cheese',
                          'pickle', 'tea', 'gherkin', 'fish', 'kipper', 'fillet', 'steak', 'mutton', 'tripe', 'cake', 'milk',
                          'yoghurt', 'butter', 'ice', 'caramel', 'can', 'egg', 'preserve', 'cook', 'veal',
                          'lamb', 'soup', 'peel', 'ham', 'sausage', 'cow', 'meat', 'sour', 'beef', 'rice', 'trough'])
trade = SmellType('trade', ['trade', 'business', 'laboratory', 'copper', 'dry cleaning', 'launderette',
                            'laundrette', 'chemist', 'hide', 'bladder', 'glue', 'tannery', 'tanneries', 'rubber', 'gum', 'fat', 'oil', 'fellmonger', 'slaughter',
                            'costermonger', 'manufacture', 'ferment', 'butcher', 'burning'])
animal = SmellType('animal', ['animal', 'pig', 'stable', 'piggeries', 'piggery', 'manure', 'excrement', 'cowhouse'])
disinfectant = SmellType('disinfectant', ['disinfect', 'antiseptic'])
factory_fuel = SmellType('factory-fuel', ['factory', 'factories', 'industrial', 'rubber', 'naphtha', 'fuel', 'works'])
school = SmellType('school', ['school', 'lavatories', 'lavatory', 'discharge', 'playground'])
air = SmellType('air', ['gas', 'air', 'atmosphere', 'coal', 'carbonic acid', 'hydrogen', 'vapour', 'smoke', 'sulphide'])
decomposition = SmellType('decomposition', ['mortuary', 'coffin', 'decomposition', 'burial', 'dead', 'body', 'church', 'chapel'])
# BUG FIX: the original list read «'premise' 'flat'» (missing comma); Python's
# implicit string concatenation turned that into the single synonym
# 'premiseflat', so neither 'premise' nor 'flat' could ever match on its own.
habitation = SmellType('habitation', ['house', 'premise', 'flat', 'dwell', 'cottage', 'room', 'home', 'ward', 'clothing', 'bed', 'barge', 'cupola'])
absence_of_smell = SmellType('absence of smell', ['no offensive smell', 'no effluvium', 'smell-none', 'no smell', 'no nuisance from smell', 'absence of smell', 'no offensive odour', 'no bad odour', 'odourless', 'no disagreeable smell', 'devoid of aroma', 'no aroma', 'deficient in aroma', 'deficient of aroma'])
def get_file_names():
    """Return the names of all *.txt report files in REPORTS_DIR."""
    # To restrict a run to a single report while debugging, filter the list
    # here (e.g. keep only names containing a specific bID).
    return [name for name in listdir(REPORTS_DIR) if name.endswith('txt')]
def get_new_pos(old_pos):
    """Map a Penn Treebank POS tag to the one-letter WordNet POS code.

    Adjectives (J*) -> 'a', verbs (V*) -> 'v', nouns (N*) -> 'n',
    adverbs (R*) -> 'r'; anything else maps to the empty string.
    """
    for prefix, wordnet_pos in (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r')):
        if old_pos.startswith(prefix):
            return wordnet_pos
    return ""
# DEBUG
# lemmatization_variants = {}
# example_sentences = {}
def lemmatize_sentence(sentence):
    """Tokenize and POS-tag *sentence*, lemmatize each word, and re-join.

    Each token is lowercased and lemmatized with the WordNet POS derived
    from its Penn Treebank tag (falling back to the plain lowercase word
    when no WordNet POS applies), then the original word's casing
    (title-case / all-caps) is re-applied to the lemma.
    """
    # FIX: the original constructed a new WordNetLemmatizer on every call;
    # cache a single instance on the function instead.
    wnl = getattr(lemmatize_sentence, "_wnl", None)
    if wnl is None:
        wnl = lemmatize_sentence._wnl = nltk.WordNetLemmatizer()
    lemmatized = []
    for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wordnet_pos = get_new_pos(pos)
        if wordnet_pos != "":
            lemma = wnl.lemmatize(word.lower(), wordnet_pos)
        else:
            lemma = word.lower()
        # NOTE(review): the source's indentation was lost; this recasing is
        # assumed to apply to every token (restoring the original word's
        # capitalisation), not only the non-WordNet branch — confirm upstream.
        if word.istitle():
            lemma = lemma.capitalize()
        elif word.upper() == word:
            lemma = lemma.upper()
        lemmatized.append(lemma)
    return " ".join(lemmatized)
def tokenize_to_sentence(text):
    """Split *text* into sentences and return each sentence lemmatized."""
    # FIX: the original reloaded the punkt pickle on every call — i.e. once
    # per input line in process_file — which dominated runtime.  Load it
    # once and cache it on the function.
    parser = getattr(tokenize_to_sentence, "_parser", None)
    if parser is None:
        parser = tokenize_to_sentence._parser = nltk.data.load('tokenizers/punkt/english.pickle')
    # split into sentences, then lemmatize each one
    return [lemmatize_sentence(sentence) for sentence in parser.tokenize(text.strip())]
def worker(file_name):
    """Process one report file in a worker process.

    Returns the combined list of categorised and uncategorised Smell
    objects mined from *file_name* (used by concurrent.futures map).
    """
    miner = SmellDataMine()
    miner.process_file(file_name)
    return miner.results + miner.uncategorised
class SmellDataMine(object):
    """Scans MOH report files for smell-related sentences and categorises them."""

    def __init__(self):
        # All known categories; a sentence may match several of them.
        self.smellTypes = [sewer, thames, water, waste_rubbish, waste_excrement, trade, school, air, factory_fuel,
                           decomposition, animal, food, habitation, absence_of_smell, disinfectant]
        self.results = []        # Smell objects that matched at least one category
        self.uncategorised = []  # Smell objects that matched no category

    def save_to_database(self, results):
        """Insert every Smell in *results* into the 'smells' SQLite table."""
        db = dataset.connect('sqlite:///../database/smells.sqlite')
        table = db['smells']
        for result in results:
            table.insert({'Category': result.category,
                          'Borough': result.borough,
                          'Year': result.year,
                          'Sentence': result.sentence,
                          'bID': result.bID,
                          'URL': result.url,
                          'MOH': result.mohRegion})

    def getUrl(self, bID):
        """Return the Wellcome Library URL for the report with identifier *bID*."""
        website = 'http://wellcomelibrary.org/item/'
        return website + bID

    def getMeta(self, fileName):
        """Return (year, region, bID, url, mohRegion) parsed from *fileName*.

        File names follow the pattern Borough.Year.bID.txt.  Returns a
        tuple of five Nones (after printing the file name) when the bID
        cannot be resolved through the map2 mapping.
        """
        splitReport = fileName.split('.')
        bID = splitReport[2]
        year = splitReport[1]
        url = self.getUrl(bID)
        try:
            region = mapping[bID][1]
            mohRegion = mapping[bID][0]
        except (KeyError, IndexError):
            # FIX: was a bare `except:`; only a missing/short mapping entry
            # is expected here (e.g. Acton.1915.b19783905.txt has no mapping).
            print(fileName)
            return (None, None, None, None, None)
        return year, region, bID, url, mohRegion

    def process_file(self, fileName):
        """Scan one report file line by line, collecting smell sentences.

        Sentences containing any SMELL_WORDS stem are appended to
        self.results (once per matched category) or self.uncategorised.
        """
        path = REPORTS_DIR + '/' + fileName
        year, region, bID, url, mohRegion = self.getMeta(fileName)
        if not all([year, region]):
            return  # metadata could not be resolved; skip this report
        # Bind bound methods to locals for speed in the hot loop.
        appendResults = self.results.append
        appendUncategorised = self.uncategorised.append
        with open(path) as f:
            for line in f:
                # break each line into lemmatized sentences
                for sentence in tokenize_to_sentence(line):
                    lowered = sentence.lower()  # lowercase once, not per word
                    for word in SMELL_WORDS:
                        if word in lowered:
                            categories = self.categorise_sentence(sentence)
                            if categories:
                                for category in categories:
                                    appendResults(Smell(region, category, sentence, year, bID, url, mohRegion))
                            else:
                                appendUncategorised(Smell(region, 'Uncategorised', sentence, year, bID, url, mohRegion))
                            break  # one hit per sentence is enough

    def categorise_sentence(self, sentence):
        """Return the category names whose synonyms occur in *sentence*.

        A category's name appears once per matching synonym (duplicates
        preserved, matching the original behaviour).
        """
        lowered = sentence.lower()  # lowercase once instead of per synonym
        return [category.name
                for category in self.smellTypes
                for synonym in category.synonyms
                if synonym in lowered]
def main():
    """Datamine every report in parallel and store the results in SQLite."""
    start = timer()
    files = get_file_names()
    smell_results = []
    bar = progressbar.ProgressBar(max_value=len(files))
    processed_files = 0
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # map() yields per-file result lists in order; the original also
        # zipped in the file name but never used it.
        for smells in executor.map(worker, files):
            # FIX: extend() instead of `smell_results = smell_results + smells`,
            # which rebuilt the whole list once per file (quadratic).
            smell_results.extend(smells)
            processed_files += 1
            bar.update(processed_files)
    smell_results = [x for x in smell_results if x]
    end = timer()
    print(end - start)
    dataminer = SmellDataMine()
    dataminer.save_to_database(smell_results)
def delete_database():
    """Remove the smells SQLite database file if it exists."""
    import os.path
    db_path = '../database/smells.sqlite'
    if not os.path.isfile(db_path):
        print('No DB found for removal!')
        return
    os.remove(db_path)
if __name__ == '__main__':
    # Rebuild the smells database from scratch on every run: drop any
    # existing SQLite file, then datamine all reports and repopulate it.
    delete_database()
    main()